Sun, 26 Jan 2025 14:13:48 +0100
extract unescape_unicode_string() and fix ranges
src/json.c | file | annotate | diff | comparison | revisions |
--- a/src/json.c Sun Jan 26 13:22:58 2025 +0100 +++ b/src/json.c Sun Jan 26 14:13:48 2025 +0100 @@ -383,6 +383,42 @@ return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000; } +static unsigned unescape_unicode_string(cxstring str, char *utf8buf) { + // str is supposed to start with "\uXXXX" or "\uXXXX\uXXXX" + // remaining bytes in the string are ignored (str may be larger!) + + if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') { + return 0; + } + + unsigned utf8len = 0; + cxstring ustr1 = { str.ptr + 2, 4}; + uint16_t utf16a, utf16b; + if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) { + uint32_t codepoint; + if (utf16a < 0xD800 || utf16a > 0xE000) { + // character is in the Basic Multilingual Plane + // and encoded as a single utf16 char + codepoint = utf16a; + utf8len = codepoint_to_utf8(codepoint, utf8buf); + } else if (utf16a >= 0xD800 && utf16a <= 0xDBFF) { + // character is encoded as a surrogate pair + // get next 6 bytes + if (str.length > 12) { + if (*(str.ptr+6) == '\\' && *(str.ptr+7) == 'u') { + cxstring ustr2 = { str.ptr+8, 4 }; + if (!cx_strtou16_lc(ustr2, &utf16b, 16, "") + && utf16b >= 0xDC00 && utf16b <= 0xDFFF) { + codepoint = utf16pair_to_codepoint(utf16a, utf16b); + utf8len = codepoint_to_utf8(codepoint, utf8buf); + } + } + } + } + } + return utf8len; +} + static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { // note: this function expects that str contains the enclosing quotes! 
@@ -413,47 +449,23 @@ } else if (c == 'b') { c = '\b'; } else if (c == 'u') { - if (i + 4 < str.length - 1) { - cxstring ustr1 = { str.ptr + i + 1, 4}; - uint16_t utf16a, utf16b; - char utf8buf[4]; - unsigned utf8len = 0; - if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) { - uint32_t codepoint; - if (utf16a >= 0xD800 && utf16a <= 0xDFFF) { - // character is encoded as a surrogate pair - // get next 6 bytes - if (i + 10 < str.length - 1) { - if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') { - cxstring ustr2 = { str.ptr+i+7, 4 }; - if (!cx_strtou16_lc(ustr2, &utf16b, 16, "") - && utf16b >= 0xDC00 && utf16b <= 0xDFFF) { - codepoint = utf16pair_to_codepoint(utf16a, utf16b); - utf8len = codepoint_to_utf8(codepoint, utf8buf); - i += 10; - } - } - } - } else { - // character is in the Basic Multilingual Plane - // and encoded as a single utf16 char - codepoint = utf16a; - utf8len = codepoint_to_utf8(codepoint, utf8buf); - i += 4; - } + char utf8buf[4]; + unsigned utf8len = unescape_unicode_string( + cx_strn(str.ptr + i - 1, str.length + 1 - i), + utf8buf + ); + if(utf8len > 0) { + i += utf8len < 4 ? 4 : 10; + // add all bytes from utf8buf except the last char + // to the result (last char will be added below) + utf8len--; + c = utf8buf[utf8len]; + for (unsigned x = 0; x < utf8len; x++) { + result.ptr[result.length++] = utf8buf[x]; } - if(utf8len > 0) { - // add all bytes from utf8buf except the last char - // to the result (last char will be added below) - utf8len--; - c = utf8buf[utf8len]; - for (unsigned x = 0; x < utf8len; x++) { - result.ptr[result.length++] = utf8buf[x]; - } - } else { - // decoding failed, ignore the entire sequence - result.ptr[result.length++] = '\\'; - } + } else { + // decoding failed, ignore the entire sequence + result.ptr[result.length++] = '\\'; } } else { // TODO: discuss the behavior for unrecognized escape sequences