--- a/src/json.c Sun Jan 26 12:24:49 2025 +0100 +++ b/src/json.c Sun Jan 26 13:20:05 2025 +0100 @@ -353,8 +353,8 @@ return CX_JSON_INCOMPLETE_DATA; } -// converts a unicode (up to U+FFFF) codepoint to utf8 -static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { +// converts a Unicode codepoint to utf8 +static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) { if (codepoint <= 0x7F) { *output_buf = (char)codepoint; return 1; @@ -375,7 +375,7 @@ return 4; } - return 0; + return 0; // LCOV_EXCL_LINE } // converts a utf16 surrogate pair to utf8 @@ -398,6 +398,8 @@ u = false; if (c == 'n') { c = '\n'; + } else if (c == '"') { + c = '"'; } else if (c == 't') { c = '\t'; } else if (c == 'r') { @@ -411,48 +413,54 @@ } else if (c == 'b') { c = '\b'; } else if (c == 'u') { - if (i+4 < str.length - 1) { - cxstring codepoint_str = { str.ptr + i + 1, 4}; - uint32_t codepoint; - if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { - char utf8buf[4]; - int utf8len = 0; - if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { + if (i + 4 < str.length - 1) { + cxstring ustr1 = { str.ptr + i + 1, 4}; + uint16_t utf16a, utf16b; + char utf8buf[4]; + unsigned utf8len = 0; + if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) { + uint32_t codepoint; + if (utf16a >= 0xD800 && utf16a <= 0xDFFF) { // character is encoded as a surrogate pair // get next 6 bytes if (i + 10 < str.length - 1) { - char *surrogate2 = str.ptr+i+5; - if (surrogate2[0] == '\\' && surrogate2[1] == 'u') { - cxstring c2_str = { surrogate2 + 2, 4 }; - uint32_t c2; - if (!cx_strtou32_lc(c2_str, &c2, 16, "")) { - codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2); + if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') { + cxstring ustr2 = { str.ptr+i+7, 4 }; + if (!cx_strtou16_lc(ustr2, &utf16b, 16, "") + && utf16b >= 0xDC00 && utf16b <= 0xDFFF) { + codepoint = utf16pair_to_codepoint(utf16a, utf16b); utf8len = codepoint_to_utf8(codepoint, utf8buf); - i += 6; + i += 10; } } } } else { // character is in the Basic Multilingual Plane // and encoded as a single utf16 char + codepoint = utf16a; utf8len = codepoint_to_utf8(codepoint, utf8buf); + i += 4; } - if(utf8len > 0) { - // add all bytes from utf8buf expect the last char - // to the result - utf8len--; - c = utf8buf[utf8len]; - for(int x=0;x<utf8len;x++) { - result.ptr[result.length++] = utf8buf[x]; - } + } + if(utf8len > 0) { + // add all bytes from utf8buf except the last char + // to the result (last char will be added below) + utf8len--; + c = utf8buf[utf8len]; + for (unsigned x = 0; x < utf8len; x++) { + result.ptr[result.length++] = utf8buf[x]; } - i += 4; + } else { + // decoding failed, ignore the entire sequence + result.ptr[result.length++] = '\\'; } } + } else { + // TODO: discuss the behavior for unrecognized escape sequences + // most parsers throw an error here - we just ignore it + result.ptr[result.length++] = '\\'; } - - // TODO: discuss the behavior for unrecognized escape sequences - // most parsers throw an error here + result.ptr[result.length++] = c; } else { if (c == '\\') {