diff --git a/CMakeLists.txt b/CMakeLists.txt index c19880926..ba75f2e0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -742,6 +742,7 @@ set(lib_headers ${PROJECT_SOURCE_DIR}/deps/open62541_queue.h ${PROJECT_SOURCE_DIR}/deps/base64.h ${PROJECT_SOURCE_DIR}/deps/dtoa.h ${PROJECT_SOURCE_DIR}/deps/mp_printf.h + ${PROJECT_SOURCE_DIR}/deps/utf8.h ${PROJECT_SOURCE_DIR}/deps/itoa.h ${PROJECT_SOURCE_DIR}/deps/ziptree.h ${PROJECT_SOURCE_DIR}/src/ua_types_encoding_binary.h @@ -812,6 +813,7 @@ set(lib_sources ${PROJECT_SOURCE_DIR}/src/ua_types.c ${PROJECT_SOURCE_DIR}/deps/base64.c ${PROJECT_SOURCE_DIR}/deps/dtoa.c ${PROJECT_SOURCE_DIR}/deps/mp_printf.c + ${PROJECT_SOURCE_DIR}/deps/utf8.c ${PROJECT_SOURCE_DIR}/deps/itoa.c ${PROJECT_SOURCE_DIR}/deps/ziptree.c) diff --git a/src/ua_types_encoding_json.c b/src/ua_types_encoding_json.c index 539cefef2..b8200d295 100644 --- a/src/ua_types_encoding_json.c +++ b/src/ua_types_encoding_json.c @@ -16,6 +16,7 @@ #include #include +#include "../deps/utf8.h" #include "../deps/itoa.h" #include "../deps/dtoa.h" #include "../deps/parse_num.h" @@ -446,54 +447,9 @@ encodeJsonArray(CtxJson *ctx, const void *ptr, size_t length, return ret | writeJsonArrEnd(ctx, type); } -static const uint32_t min_codepoints[5] = {0x00, 0x00, 0x80, 0x800, 0x10000}; static const u8 hexmap[16] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; -/* Extract the next utf8 codepoint from the buffer. Return the next position in - * the buffer or NULL upon an error. */ -static const unsigned char * -extract_codepoint(const unsigned char *pos, size_t len, uint32_t *codepoint) { - UA_assert(len > 0); - - *codepoint = pos[0]; - if(UA_LIKELY(*codepoint < 0x80)) - return pos + 1; /* Normal ASCII */ - - if(UA_UNLIKELY(*codepoint <= 0xC1)) - return NULL; /* Continuation byte not allowed here */ - - unsigned char count; - if(*codepoint <= 0xDF) { - count = 2; /* 2-byte sequence */ - *codepoint &= 0x1F; - } else if(*codepoint <= 0xEF) { - count = 3; /* 3-byte sequence */ - *codepoint &= 0xF; - } else if(*codepoint <= 0xF4) { - count = 4; /* 4-byte sequence */ - *codepoint &= 0x7; - } else { - return NULL; /* invalid utf8 */ - } - - if(UA_UNLIKELY(count > len)) - return NULL; /* Not enough bytes left */ - - for(unsigned char i = 1; i < count; i++) { - unsigned char byte = pos[i]; - if(UA_UNLIKELY(byte < 0x80 || byte > 0xBF)) - return NULL; /* Not a continuation byte */ - *codepoint = (*codepoint << 6) + (byte & 0x3F); - } - - /* Not in Unicode range or too small for the encoding length */ - if(UA_UNLIKELY(*codepoint > 0x10FFFF || *codepoint < min_codepoints[count])) - return NULL; - - return pos + count; /* Return the new position in the pos */ -} - ENCODE_JSON(String) { if(!src->data) return writeChars(ctx, "null", 4); @@ -503,71 +459,65 @@ ENCODE_JSON(String) { UA_StatusCode ret = writeJsonQuote(ctx); - const unsigned char *str = src->data; - const unsigned char *pos = str; - const unsigned char *end = str; - const unsigned char *lim = str + src->length; - uint32_t codepoint = 0; - while(1) { - /* Iterate over codepoints in the utf8 encoding. Until the first - * character that needs to be escaped. */ - while(end < lim) { - end = extract_codepoint(pos, (size_t)(lim - pos), &codepoint); - if(!end) { - /* A malformed utf8 character. Print anyway and let the - * receiving side choose how to handle it. */ - pos++; - end = pos; - continue; - } - - /* Escape unprintable ASCII and escape characters */ - if(codepoint < ' ' || codepoint == 127 || - codepoint == '\\' || codepoint == '\"') + const unsigned char *pos = src->data; /* Input position */ + const unsigned char *end = pos + src->length; /* End of input */ + while(pos < end) { + /* Find the first escaped character */ + const unsigned char *start = pos; + for(; pos < end; pos++) { + if(*pos >= 127 || *pos < ' ' || *pos == '\\' || *pos == '\"') break; - - pos = end; } - /* Write out the characters that don't need escaping */ - if(pos != str) { - if(ctx->pos + (pos - str) > ctx->end) + /* Write out the unescaped ascii sequence */ + if(pos > start) { + if(ctx->pos + (pos - start) > ctx->end) return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED; if(!ctx->calcOnly) - memcpy(ctx->pos, str, (size_t)(pos - str)); - ctx->pos += pos - str; + memcpy(ctx->pos, start, (size_t)(pos - start)); + ctx->pos += pos - start; } - /* Reached the end of the utf8 encoding */ - if(end == pos) + /* The unescaped ascii sequence reached the end */ + if(pos == end) break; - /* Handle an escaped character */ - size_t length = 2; - u8 seq[13]; - const char *text; + /* Parse an escaped character */ + unsigned codepoint = 0; + unsigned len = utf8_to_codepoint(pos, (size_t)(end - pos), &codepoint); + if(len == 0) { + /* A malformed utf8 character. Print anyway and let the + * receiving side choose how to handle it. */ + codepoint = *pos; + len = 1; + } + pos += len; + /* Write an escaped character */ + u8 escape_buf[13]; + const char *escape_text; + size_t escape_length = 2; switch(codepoint) { - case '\\': text = "\\\\"; break; - case '\"': text = "\\\""; break; - case '\b': text = "\\b"; break; - case '\f': text = "\\f"; break; - case '\n': text = "\\n"; break; - case '\r': text = "\\r"; break; - case '\t': text = "\\t"; break; + case '\\': escape_text = "\\\\"; break; + case '\"': escape_text = "\\\""; break; + case '\b': escape_text = "\\b"; break; + case '\f': escape_text = "\\f"; break; + case '\n': escape_text = "\\n"; break; + case '\r': escape_text = "\\r"; break; + case '\t': escape_text = "\\t"; break; default: - text = (char*)seq; + escape_text = (char*)escape_buf; if(codepoint < 0x10000) { /* codepoint is in BMP */ - seq[0] = '\\'; - seq[1] = 'u'; + escape_buf[0] = '\\'; + escape_buf[1] = 'u'; UA_Byte b1 = (UA_Byte)(codepoint >> 8u); UA_Byte b2 = (UA_Byte)(codepoint >> 0u); - seq[2] = hexmap[(b1 & 0xF0u) >> 4u]; - seq[3] = hexmap[b1 & 0x0Fu]; - seq[4] = hexmap[(b2 & 0xF0u) >> 4u]; - seq[5] = hexmap[b2 & 0x0Fu]; - length = 6; + escape_buf[2] = hexmap[(b1 & 0xF0u) >> 4u]; + escape_buf[3] = hexmap[b1 & 0x0Fu]; + escape_buf[4] = hexmap[(b2 & 0xF0u) >> 4u]; + escape_buf[5] = hexmap[b2 & 0x0Fu]; + escape_length = 6; } else { /* not in BMP -> construct a UTF-16 surrogate pair */ codepoint -= 0x10000; @@ -577,28 +527,31 @@ ENCODE_JSON(String) { UA_Byte fb2 = (UA_Byte)(first >> 0u); UA_Byte lb1 = (UA_Byte)(last >> 8u); UA_Byte lb2 = (UA_Byte)(last >> 0u); - seq[0] = '\\'; - seq[1] = 'u'; - seq[2] = hexmap[(fb1 & 0xF0u) >> 4u]; - seq[3] = hexmap[fb1 & 0x0Fu]; - seq[4] = hexmap[(fb2 & 0xF0u) >> 4u]; - seq[5] = hexmap[fb2 & 0x0Fu]; - seq[6] = '\\'; - seq[7] = 'u'; - seq[8] = hexmap[(lb1 & 0xF0u) >> 4u]; - seq[9] = hexmap[lb1 & 0x0Fu]; - seq[10] = hexmap[(lb2 & 0xF0u) >> 4u]; - seq[11] = hexmap[lb2 & 0x0Fu]; - length = 12; + escape_buf[0] = '\\'; + escape_buf[1] = 'u'; + escape_buf[2] = hexmap[(fb1 & 0xF0u) >> 4u]; + escape_buf[3] = hexmap[fb1 & 0x0Fu]; + escape_buf[4] = hexmap[(fb2 & 0xF0u) >> 4u]; + escape_buf[5] = hexmap[fb2 & 0x0Fu]; + escape_buf[6] = '\\'; + escape_buf[7] = 'u'; + escape_buf[8] = hexmap[(lb1 & 0xF0u) >> 4u]; + escape_buf[9] = hexmap[lb1 & 0x0Fu]; + escape_buf[10] = hexmap[(lb2 & 0xF0u) >> 4u]; + escape_buf[11] = hexmap[lb2 & 0x0Fu]; + escape_length = 12; } break; } - if(ctx->pos + length > ctx->end) + + /* Enough space? */ + if(ctx->pos + escape_length > ctx->end) return UA_STATUSCODE_BADENCODINGLIMITSEXCEEDED; + + /* Write the escaped character */ if(!ctx->calcOnly) - memcpy(ctx->pos, text, length); - ctx->pos += length; - str = pos = end; + memcpy(ctx->pos, escape_text, escape_length); + ctx->pos += escape_length; } return ret | writeJsonQuote(ctx);