diff -r 8ff82697f2c3 -r 148b7c7ccaf9 src/json.c
--- a/src/json.c	Sat Jan 25 15:22:01 2025 +0100
+++ b/src/json.c	Tue Jan 28 18:31:17 2025 +0100
@@ -353,6 +353,82 @@
     return CX_JSON_INCOMPLETE_DATA;
 }
 
+// Encodes a Unicode codepoint as UTF-8 (no terminator is written).
+// output_buf must provide room for up to 4 bytes.
+// Returns the number of bytes written, or 0 if codepoint exceeds 0x10FFFF.
+static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
+    if (codepoint <= 0x7F) {
+        *output_buf = (char)codepoint;
+        return 1;
+    } else if (codepoint <= 0x7FF) {
+        output_buf[0] = (char)(0xC0 | ((codepoint >> 6) & 0x1F));
+        output_buf[1] = (char)(0x80 | (codepoint & 0x3F));
+        return 2;
+    } else if (codepoint <= 0xFFFF) {
+        output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F));
+        output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
+        output_buf[2] = (char)(0x80 | (codepoint & 0x3F));
+        return 3;
+    } else if (codepoint <= 0x10FFFF) {
+        output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07));
+        output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
+        output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
+        output_buf[3] = (char)(0x80 | (codepoint & 0x3F));
+        return 4;
+    }
+
+    return 0; // LCOV_EXCL_LINE
+}
+
+// Combines a UTF-16 surrogate pair into the codepoint it encodes.
+// c0 must be a high surrogate (0xD800-0xDBFF) and c1 a low surrogate
+// (0xDC00-0xDFFF); the ranges are not checked here.
+static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
+    return (((uint32_t)c0 - 0xD800) << 10) + ((uint32_t)c1 - 0xDC00) + 0x10000;
+}
+
+// Decodes the JSON unicode escape at the start of str to UTF-8.
+// Up to 4 bytes are written to utf8buf (no terminator is written).
+// Returns the number of bytes written, or 0 if decoding failed.
+static unsigned unescape_unicode_string(cxstring str, char *utf8buf) {
+    // str is supposed to start with "\uXXXX" or "\uXXXX\uXXXX"
+    // remaining bytes in the string are ignored (str may be larger!)
+
+    if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') {
+        return 0;
+    }
+
+    unsigned utf8len = 0;
+    cxstring ustr1 = { str.ptr + 2, 4};
+    uint16_t utf16a, utf16b;
+    if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
+        uint32_t codepoint;
+        if (utf16a < 0xD800 || utf16a > 0xDFFF) {
+            // character is in the Basic Multilingual Plane
+            // and encoded as a single utf16 char
+            // (i.e. it lies outside the surrogate range 0xD800-0xDFFF)
+            codepoint = utf16a;
+            utf8len = codepoint_to_utf8(codepoint, utf8buf);
+        } else if (utf16a >= 0xD800 && utf16a <= 0xDBFF) {
+            // character is encoded as a surrogate pair
+            // get next 6 bytes
+            if (str.length >= 12) {
+                if (str.ptr[6] == '\\' && str.ptr[7] == 'u') {
+                    cxstring ustr2 = { str.ptr+8, 4 };
+                    if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
+                        && utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
+                        codepoint = utf16pair_to_codepoint(utf16a, utf16b);
+                        utf8len = codepoint_to_utf8(codepoint, utf8buf);
+                    }
+                }
+            }
+        }
+        // a lone low surrogate (0xDC00-0xDFFF) matches neither branch
+        // and is reported as a decoding failure (utf8len stays 0)
+    }
+    return utf8len;
+}
+
 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
     // note: this function expects that str contains the enclosing quotes!
 
@@ -368,6 +444,8 @@
             u = false;
             if (c == 'n') {
                 c = '\n';
+            } else if (c == '"') {
+                c = '"';
             } else if (c == 't') {
                 c = '\t';
             } else if (c == 'r') {
@@ -380,10 +458,33 @@
                 c = '\f';
             } else if (c == 'b') {
                 c = '\b';
+            } else if (c == 'u') {
+                char utf8buf[4];
+                unsigned utf8len = unescape_unicode_string(
+                    cx_strn(str.ptr + i - 1, str.length + 1 - i),
+                    utf8buf
+                );
+                if(utf8len > 0) {
+                    // skip the rest of the escape: 4 chars for a single escape,
+                    // 10 for a surrogate pair (only pairs yield 4 utf8 bytes)
+                    i += utf8len < 4 ? 4 : 10;
+                    // add all bytes from utf8buf except the last char
+                    // to the result (last char will be added below)
+                    utf8len--;
+                    c = utf8buf[utf8len];
+                    for (unsigned x = 0; x < utf8len; x++) {
+                        result.ptr[result.length++] = utf8buf[x];
+                    }
+                } else {
+                    // decoding failed, keep the escape sequence as it is
+                    result.ptr[result.length++] = '\\';
+                }
+            } else {
+                // TODO: discuss the behavior for unrecognized escape sequences
+                // most parsers throw an error here - we just ignore it
+                result.ptr[result.length++] = '\\';
             }
-            // TODO: support \uXXXX escape sequences
-            // TODO: discuss the behavior for unrecognized escape sequences
-            // most parsers throw an error here
+
             result.ptr[result.length++] = c;
         } else {
             if (c == '\\') {