refactor(core): Use the utf8 decoding in /deps for JSON

2025-06-03 04:00:21 +00:00 · 2024-12-19 03:45:22 +01:00 · 2024-12-19 03:45:22 +01:00 · c29c0901f7
commit c29c0901f7
parent ba1810606f
2 changed files with 66 additions and 111 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -742,6 +742,7 @@ set(lib_headers ${PROJECT_SOURCE_DIR}/deps/open62541_queue.h
                ${PROJECT_SOURCE_DIR}/deps/base64.h
                ${PROJECT_SOURCE_DIR}/deps/dtoa.h
                ${PROJECT_SOURCE_DIR}/deps/mp_printf.h
                ${PROJECT_SOURCE_DIR}/deps/utf8.h
                ${PROJECT_SOURCE_DIR}/deps/itoa.h
                ${PROJECT_SOURCE_DIR}/deps/ziptree.h
                ${PROJECT_SOURCE_DIR}/src/ua_types_encoding_binary.h
@ -812,6 +813,7 @@ set(lib_sources ${PROJECT_SOURCE_DIR}/src/ua_types.c
                ${PROJECT_SOURCE_DIR}/deps/base64.c
                ${PROJECT_SOURCE_DIR}/deps/dtoa.c
                ${PROJECT_SOURCE_DIR}/deps/mp_printf.c
                ${PROJECT_SOURCE_DIR}/deps/utf8.c
                ${PROJECT_SOURCE_DIR}/deps/itoa.c
                ${PROJECT_SOURCE_DIR}/deps/ziptree.c)
--- a/src/ua_types_encoding_json.c
+++ b/src/ua_types_encoding_json.c
@ -16,6 +16,7 @@
 #include <float.h>
 #include <math.h>
 #include "../deps/utf8.h"
 #include "../deps/itoa.h"
 #include "../deps/dtoa.h"
 #include "../deps/parse_num.h"
@ -446,54 +447,9 @@ encodeJsonArray(CtxJson *ctx, const void *ptr, size_t length,
    return ret | writeJsonArrEnd(ctx, type);
 }
 /* Smallest codepoint permitted for a utf8 sequence of the given byte length
 * (the array index is the sequence length in bytes). A decoded value below
 * this minimum means the sequence is an overlong encoding. */
 static const uint32_t min_codepoints[5] = {0x00, 0x00, 0x80, 0x800, 0x10000};
 /* Lowercase hexadecimal digit for each 4-bit value (index 0..15) */
 static const u8 hexmap[16] =
    {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
 /* Extract the next utf8 codepoint from the buffer.
 *
 * pos:       first byte of the sequence to decode
 * len:       number of bytes available at pos; must be > 0 (asserted)
 * codepoint: receives the decoded value. It is written before the sequence is
 *            validated, so it holds an intermediate value when NULL is returned.
 *
 * Returns the position just past the decoded sequence, or NULL upon an error
 * (invalid lead byte, not enough input, bad continuation byte, overlong
 * encoding or a value above 0x10FFFF). Values in the UTF-16 surrogate range
 * (0xD800-0xDFFF) are not rejected by the checks below. */
 static const unsigned char *
 extract_codepoint(const unsigned char *pos, size_t len, uint32_t *codepoint) {
    UA_assert(len > 0);
    /* Start from the lead byte. For multi-byte sequences it is masked down to
     * its payload bits further below. */
    *codepoint = pos[0];
    if(UA_LIKELY(*codepoint < 0x80))
        return pos + 1; /* Normal ASCII */
    /* 0x80-0xBF are continuation bytes. 0xC0 and 0xC1 could only begin a
     * 2-byte sequence for a value below 0x80, which would be overlong. */
    if(UA_UNLIKELY(*codepoint <= 0xC1))
        return NULL; /* Continuation byte not allowed here */
    /* Derive the sequence length from the lead byte and keep only the payload
     * bits of the lead byte */
    unsigned char count;
    if(*codepoint <= 0xDF) {
        count = 2; /* 2-byte sequence */
        *codepoint &= 0x1F;
    } else if(*codepoint <= 0xEF) {
        count = 3; /* 3-byte sequence */
        *codepoint &= 0xF;
    } else if(*codepoint <= 0xF4) {
        count = 4; /* 4-byte sequence */
        *codepoint &= 0x7;
    } else {
        return NULL; /* invalid utf8 */
    }
    if(UA_UNLIKELY(count > len))
        return NULL; /* Not enough bytes left */
    /* Append the six payload bits of every continuation byte */
    for(unsigned char i = 1; i < count; i++) {
        unsigned char byte = pos[i];
        if(UA_UNLIKELY(byte < 0x80 || byte > 0xBF))
            return NULL; /* Not a continuation byte */
        *codepoint = (*codepoint << 6) + (byte & 0x3F);
    }
    /* Not in Unicode range or too small for the encoding length */
    if(UA_UNLIKELY(*codepoint > 0x10FFFF || *codepoint < min_codepoints[count]))
        return NULL;
    return pos + count; /* Position just past the decoded sequence */
 }
 ENCODE_JSON(String) {
    if(!src->data)
        return writeChars(ctx, "null", 4);
@ -503,71 +459,65 @@ ENCODE_JSON(String) {
    UA_StatusCode ret = writeJsonQuote(ctx);
-    const unsigned char *str = src->data;
+    const unsigned char *pos = src->data;         /* Input position */
-    const unsigned char *pos = str;
+    const unsigned char *end = pos + src->length; /* End of input */
-    const unsigned char *end = str;
+    while(pos < end) {
-    const unsigned char *lim = str + src->length;
+        /* Find the first escaped character */
-    uint32_t codepoint = 0;
+        const unsigned char *start = pos;
-    while(1) {
+        for(; pos < end; pos++) {
-        /* Iterate over codepoints in the utf8 encoding. Until the first
+            if(*pos >= 127 || *pos < ' ' || *pos == '\\' || *pos == '\"')
         * character that needs to be escaped. */
        while(end < lim) {
            end = extract_codepoint(pos, (size_t)(lim - pos), &codepoint);
            if(!end)  {
                /* A malformed utf8 character. Print anyway and let the
                 * receiving side choose how to handle it. */
                pos++;
                end = pos;
                continue;
            }
            /* Escape unprintable ASCII and escape characters */
            if(codepoint < ' '   || codepoint == 127  ||
               codepoint == '\\' || codepoint == '\"')
                break;
            pos = end;
        }
-        /* Write out the characters that don't need escaping */
+        /* Write out the unescaped ascii sequence */
-        if(pos != str) {
+        if(pos > start) {
-            if(ctx->pos + (pos - str) > ctx->end)
+            if(ctx->pos + (pos - start) > ctx->end)
                return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED;
            if(!ctx->calcOnly)
-                memcpy(ctx->pos, str, (size_t)(pos - str));
+                memcpy(ctx->pos, start, (size_t)(pos - start));
-            ctx->pos += pos - str;
+            ctx->pos += pos - start;
        }
-        /* Reached the end of the utf8 encoding */
+        /* The unescaped ascii sequence reached the end */
-        if(end == pos)
+        if(pos == end)
            break;
-        /* Handle an escaped character */
+        /* Parse an escaped character */
-        size_t length = 2;
+        unsigned codepoint = 0;
-        u8 seq[13];
+        unsigned len = utf8_to_codepoint(pos, (size_t)(end - pos), &codepoint);
-        const char *text;
+        if(len == 0)  {
            /* A malformed utf8 character. Print anyway and let the
             * receiving side choose how to handle it. */
            codepoint = *pos;
            len = 1;
        }
        pos += len;
        /* Write an escaped character */
        u8 escape_buf[13];
        const char *escape_text;
        size_t escape_length = 2;
        switch(codepoint) {
-        case '\\': text = "\\\\"; break;
+        case '\\': escape_text = "\\\\"; break;
-        case '\"': text = "\\\""; break;
+        case '\"': escape_text = "\\\""; break;
-        case '\b': text = "\\b"; break;
+        case '\b': escape_text = "\\b";  break;
-        case '\f': text = "\\f"; break;
+        case '\f': escape_text = "\\f";  break;
-        case '\n': text = "\\n"; break;
+        case '\n': escape_text = "\\n";  break;
-        case '\r': text = "\\r"; break;
+        case '\r': escape_text = "\\r";  break;
-        case '\t': text = "\\t"; break;
+        case '\t': escape_text = "\\t";  break;
        default:
-            text = (char*)seq;
+            escape_text = (char*)escape_buf;
            if(codepoint < 0x10000) {
                /* codepoint is in BMP */
-                seq[0] = '\\';
+                escape_buf[0] = '\\';
-                seq[1] = 'u';
+                escape_buf[1] = 'u';
                UA_Byte b1 = (UA_Byte)(codepoint >> 8u);
                UA_Byte b2 = (UA_Byte)(codepoint >> 0u);
-                seq[2] = hexmap[(b1 & 0xF0u) >> 4u];
+                escape_buf[2] = hexmap[(b1 & 0xF0u) >> 4u];
-                seq[3] = hexmap[b1 & 0x0Fu];
+                escape_buf[3] = hexmap[b1 & 0x0Fu];
-                seq[4] = hexmap[(b2 & 0xF0u) >> 4u];
+                escape_buf[4] = hexmap[(b2 & 0xF0u) >> 4u];
-                seq[5] = hexmap[b2 & 0x0Fu];
+                escape_buf[5] = hexmap[b2 & 0x0Fu];
-                length = 6;
+                escape_length = 6;
            } else {
                /* not in BMP -> construct a UTF-16 surrogate pair */
                codepoint -= 0x10000;
@ -577,28 +527,31 @@ ENCODE_JSON(String) {
                UA_Byte fb2 = (UA_Byte)(first >> 0u);
                UA_Byte lb1 = (UA_Byte)(last >> 8u);
                UA_Byte lb2 = (UA_Byte)(last >> 0u);
-                seq[0] = '\\';
+                escape_buf[0] = '\\';
-                seq[1] = 'u';
+                escape_buf[1] = 'u';
-                seq[2] = hexmap[(fb1 & 0xF0u) >> 4u];
+                escape_buf[2] = hexmap[(fb1 & 0xF0u) >> 4u];
-                seq[3] = hexmap[fb1 & 0x0Fu];
+                escape_buf[3] = hexmap[fb1 & 0x0Fu];
-                seq[4] = hexmap[(fb2 & 0xF0u) >> 4u];
+                escape_buf[4] = hexmap[(fb2 & 0xF0u) >> 4u];
-                seq[5] = hexmap[fb2 & 0x0Fu];
+                escape_buf[5] = hexmap[fb2 & 0x0Fu];
-                seq[6] = '\\';
+                escape_buf[6] = '\\';
-                seq[7] = 'u';
+                escape_buf[7] = 'u';
-                seq[8] = hexmap[(lb1 & 0xF0u) >> 4u];
+                escape_buf[8] = hexmap[(lb1 & 0xF0u) >> 4u];
-                seq[9] = hexmap[lb1 & 0x0Fu];
+                escape_buf[9] = hexmap[lb1 & 0x0Fu];
-                seq[10] = hexmap[(lb2 & 0xF0u) >> 4u];
+                escape_buf[10] = hexmap[(lb2 & 0xF0u) >> 4u];
-                seq[11] = hexmap[lb2 & 0x0Fu];
+                escape_buf[11] = hexmap[lb2 & 0x0Fu];
-                length = 12;
+                escape_length = 12;
            }
            break;
        }
-        if(ctx->pos + length > ctx->end)
+
        /* Enough space? */
        if(ctx->pos + escape_length > ctx->end)
            return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED;
        /* Write the escaped character */
        if(!ctx->calcOnly)
-            memcpy(ctx->pos, text, length);
+            memcpy(ctx->pos, escape_text, escape_length);
-        ctx->pos += length;
+        ctx->pos += escape_length;
        str = pos = end;
    }
    return ret | writeJsonQuote(ctx);