refactor(core): Use the utf8 decoding in /deps for JSON

This commit is contained in:
Julius Pfrommer 2024-12-19 03:45:22 +01:00 committed by Julius Pfrommer
parent ba1810606f
commit c29c0901f7
2 changed files with 66 additions and 111 deletions

View File

@ -742,6 +742,7 @@ set(lib_headers ${PROJECT_SOURCE_DIR}/deps/open62541_queue.h
${PROJECT_SOURCE_DIR}/deps/base64.h ${PROJECT_SOURCE_DIR}/deps/base64.h
${PROJECT_SOURCE_DIR}/deps/dtoa.h ${PROJECT_SOURCE_DIR}/deps/dtoa.h
${PROJECT_SOURCE_DIR}/deps/mp_printf.h ${PROJECT_SOURCE_DIR}/deps/mp_printf.h
${PROJECT_SOURCE_DIR}/deps/utf8.h
${PROJECT_SOURCE_DIR}/deps/itoa.h ${PROJECT_SOURCE_DIR}/deps/itoa.h
${PROJECT_SOURCE_DIR}/deps/ziptree.h ${PROJECT_SOURCE_DIR}/deps/ziptree.h
${PROJECT_SOURCE_DIR}/src/ua_types_encoding_binary.h ${PROJECT_SOURCE_DIR}/src/ua_types_encoding_binary.h
@ -812,6 +813,7 @@ set(lib_sources ${PROJECT_SOURCE_DIR}/src/ua_types.c
${PROJECT_SOURCE_DIR}/deps/base64.c ${PROJECT_SOURCE_DIR}/deps/base64.c
${PROJECT_SOURCE_DIR}/deps/dtoa.c ${PROJECT_SOURCE_DIR}/deps/dtoa.c
${PROJECT_SOURCE_DIR}/deps/mp_printf.c ${PROJECT_SOURCE_DIR}/deps/mp_printf.c
${PROJECT_SOURCE_DIR}/deps/utf8.c
${PROJECT_SOURCE_DIR}/deps/itoa.c ${PROJECT_SOURCE_DIR}/deps/itoa.c
${PROJECT_SOURCE_DIR}/deps/ziptree.c) ${PROJECT_SOURCE_DIR}/deps/ziptree.c)

View File

@ -16,6 +16,7 @@
#include <float.h> #include <float.h>
#include <math.h> #include <math.h>
#include "../deps/utf8.h"
#include "../deps/itoa.h" #include "../deps/itoa.h"
#include "../deps/dtoa.h" #include "../deps/dtoa.h"
#include "../deps/parse_num.h" #include "../deps/parse_num.h"
@ -446,54 +447,9 @@ encodeJsonArray(CtxJson *ctx, const void *ptr, size_t length,
return ret | writeJsonArrEnd(ctx, type); return ret | writeJsonArrEnd(ctx, type);
} }
static const uint32_t min_codepoints[5] = {0x00, 0x00, 0x80, 0x800, 0x10000};
static const u8 hexmap[16] = static const u8 hexmap[16] =
{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
/* Extract the next utf8 codepoint from the buffer.
 *
 * pos:       Start of the utf8 sequence to decode.
 * len:       Number of bytes readable from pos. Must be larger than zero.
 * codepoint: Receives the decoded codepoint. The content is unspecified if
 *            NULL is returned.
 *
 * Returns the position right after the decoded sequence or NULL upon an error.
 * Errors are: a stray continuation byte, an invalid lead byte, a truncated
 * sequence, an overlong encoding, a value above U+10FFFF or a UTF-16 surrogate
 * (U+D800..U+DFFF). */
static const unsigned char *
extract_codepoint(const unsigned char *pos, size_t len, uint32_t *codepoint) {
    UA_assert(len > 0);

    *codepoint = pos[0];
    if(UA_LIKELY(*codepoint < 0x80))
        return pos + 1; /* Normal ASCII */

    /* 0x80-0xBF are continuation bytes and not allowed as the first byte.
     * 0xC0 and 0xC1 could only start an overlong 2-byte encoding. */
    if(UA_UNLIKELY(*codepoint <= 0xC1))
        return NULL;

    /* Derive the sequence length from the lead byte and keep only the payload
     * bits of the lead byte */
    unsigned char count;
    if(*codepoint <= 0xDF) {
        count = 2; /* 2-byte sequence */
        *codepoint &= 0x1F;
    } else if(*codepoint <= 0xEF) {
        count = 3; /* 3-byte sequence */
        *codepoint &= 0xF;
    } else if(*codepoint <= 0xF4) {
        count = 4; /* 4-byte sequence */
        *codepoint &= 0x7;
    } else {
        return NULL; /* invalid utf8 */
    }

    if(UA_UNLIKELY(count > len))
        return NULL; /* Not enough bytes left */

    /* Every continuation byte contributes its lower six bits */
    for(unsigned char i = 1; i < count; i++) {
        unsigned char byte = pos[i];
        if(UA_UNLIKELY(byte < 0x80 || byte > 0xBF))
            return NULL; /* Not a continuation byte */
        *codepoint = (*codepoint << 6) + (byte & 0x3F);
    }

    /* Not in Unicode range or too small for the encoding length */
    if(UA_UNLIKELY(*codepoint > 0x10FFFF || *codepoint < min_codepoints[count]))
        return NULL;

    /* UTF-16 surrogate halves must not be encoded in utf8 (RFC 3629) */
    if(UA_UNLIKELY(*codepoint >= 0xD800 && *codepoint <= 0xDFFF))
        return NULL;

    return pos + count; /* Return the new position in the buffer */
}
ENCODE_JSON(String) { ENCODE_JSON(String) {
if(!src->data) if(!src->data)
return writeChars(ctx, "null", 4); return writeChars(ctx, "null", 4);
@ -503,71 +459,65 @@ ENCODE_JSON(String) {
UA_StatusCode ret = writeJsonQuote(ctx); UA_StatusCode ret = writeJsonQuote(ctx);
const unsigned char *str = src->data; const unsigned char *pos = src->data; /* Input position */
const unsigned char *pos = str; const unsigned char *end = pos + src->length; /* End of input */
const unsigned char *end = str; while(pos < end) {
const unsigned char *lim = str + src->length; /* Find the first escaped character */
uint32_t codepoint = 0; const unsigned char *start = pos;
while(1) { for(; pos < end; pos++) {
/* Iterate over codepoints in the utf8 encoding. Until the first if(*pos >= 127 || *pos < ' ' || *pos == '\\' || *pos == '\"')
* character that needs to be escaped. */
while(end < lim) {
end = extract_codepoint(pos, (size_t)(lim - pos), &codepoint);
if(!end) {
/* A malformed utf8 character. Print anyway and let the
* receiving side choose how to handle it. */
pos++;
end = pos;
continue;
}
/* Escape unprintable ASCII and escape characters */
if(codepoint < ' ' || codepoint == 127 ||
codepoint == '\\' || codepoint == '\"')
break; break;
pos = end;
} }
/* Write out the characters that don't need escaping */ /* Write out the unescaped ascii sequence */
if(pos != str) { if(pos > start) {
if(ctx->pos + (pos - str) > ctx->end) if(ctx->pos + (pos - start) > ctx->end)
return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED; return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED;
if(!ctx->calcOnly) if(!ctx->calcOnly)
memcpy(ctx->pos, str, (size_t)(pos - str)); memcpy(ctx->pos, start, (size_t)(pos - start));
ctx->pos += pos - str; ctx->pos += pos - start;
} }
/* Reached the end of the utf8 encoding */ /* The unescaped ascii sequence reached the end */
if(end == pos) if(pos == end)
break; break;
/* Handle an escaped character */ /* Parse an escaped character */
size_t length = 2; unsigned codepoint = 0;
u8 seq[13]; unsigned len = utf8_to_codepoint(pos, (size_t)(end - pos), &codepoint);
const char *text; if(len == 0) {
/* A malformed utf8 character. Print anyway and let the
* receiving side choose how to handle it. */
codepoint = *pos;
len = 1;
}
pos += len;
/* Write an escaped character */
u8 escape_buf[13];
const char *escape_text;
size_t escape_length = 2;
switch(codepoint) { switch(codepoint) {
case '\\': text = "\\\\"; break; case '\\': escape_text = "\\\\"; break;
case '\"': text = "\\\""; break; case '\"': escape_text = "\\\""; break;
case '\b': text = "\\b"; break; case '\b': escape_text = "\\b"; break;
case '\f': text = "\\f"; break; case '\f': escape_text = "\\f"; break;
case '\n': text = "\\n"; break; case '\n': escape_text = "\\n"; break;
case '\r': text = "\\r"; break; case '\r': escape_text = "\\r"; break;
case '\t': text = "\\t"; break; case '\t': escape_text = "\\t"; break;
default: default:
text = (char*)seq; escape_text = (char*)escape_buf;
if(codepoint < 0x10000) { if(codepoint < 0x10000) {
/* codepoint is in BMP */ /* codepoint is in BMP */
seq[0] = '\\'; escape_buf[0] = '\\';
seq[1] = 'u'; escape_buf[1] = 'u';
UA_Byte b1 = (UA_Byte)(codepoint >> 8u); UA_Byte b1 = (UA_Byte)(codepoint >> 8u);
UA_Byte b2 = (UA_Byte)(codepoint >> 0u); UA_Byte b2 = (UA_Byte)(codepoint >> 0u);
seq[2] = hexmap[(b1 & 0xF0u) >> 4u]; escape_buf[2] = hexmap[(b1 & 0xF0u) >> 4u];
seq[3] = hexmap[b1 & 0x0Fu]; escape_buf[3] = hexmap[b1 & 0x0Fu];
seq[4] = hexmap[(b2 & 0xF0u) >> 4u]; escape_buf[4] = hexmap[(b2 & 0xF0u) >> 4u];
seq[5] = hexmap[b2 & 0x0Fu]; escape_buf[5] = hexmap[b2 & 0x0Fu];
length = 6; escape_length = 6;
} else { } else {
/* not in BMP -> construct a UTF-16 surrogate pair */ /* not in BMP -> construct a UTF-16 surrogate pair */
codepoint -= 0x10000; codepoint -= 0x10000;
@ -577,28 +527,31 @@ ENCODE_JSON(String) {
UA_Byte fb2 = (UA_Byte)(first >> 0u); UA_Byte fb2 = (UA_Byte)(first >> 0u);
UA_Byte lb1 = (UA_Byte)(last >> 8u); UA_Byte lb1 = (UA_Byte)(last >> 8u);
UA_Byte lb2 = (UA_Byte)(last >> 0u); UA_Byte lb2 = (UA_Byte)(last >> 0u);
seq[0] = '\\'; escape_buf[0] = '\\';
seq[1] = 'u'; escape_buf[1] = 'u';
seq[2] = hexmap[(fb1 & 0xF0u) >> 4u]; escape_buf[2] = hexmap[(fb1 & 0xF0u) >> 4u];
seq[3] = hexmap[fb1 & 0x0Fu]; escape_buf[3] = hexmap[fb1 & 0x0Fu];
seq[4] = hexmap[(fb2 & 0xF0u) >> 4u]; escape_buf[4] = hexmap[(fb2 & 0xF0u) >> 4u];
seq[5] = hexmap[fb2 & 0x0Fu]; escape_buf[5] = hexmap[fb2 & 0x0Fu];
seq[6] = '\\'; escape_buf[6] = '\\';
seq[7] = 'u'; escape_buf[7] = 'u';
seq[8] = hexmap[(lb1 & 0xF0u) >> 4u]; escape_buf[8] = hexmap[(lb1 & 0xF0u) >> 4u];
seq[9] = hexmap[lb1 & 0x0Fu]; escape_buf[9] = hexmap[lb1 & 0x0Fu];
seq[10] = hexmap[(lb2 & 0xF0u) >> 4u]; escape_buf[10] = hexmap[(lb2 & 0xF0u) >> 4u];
seq[11] = hexmap[lb2 & 0x0Fu]; escape_buf[11] = hexmap[lb2 & 0x0Fu];
length = 12; escape_length = 12;
} }
break; break;
} }
if(ctx->pos + length > ctx->end)
/* Enough space? */
if(ctx->pos + escape_length > ctx->end)
return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED; return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED;
/* Write the escaped character */
if(!ctx->calcOnly) if(!ctx->calcOnly)
memcpy(ctx->pos, text, length); memcpy(ctx->pos, escape_text, escape_length);
ctx->pos += length; ctx->pos += escape_length;
str = pos = end;
} }
return ret | writeJsonQuote(ctx); return ret | writeJsonQuote(ctx);