ucx: comparison src/json.c

-:3565ae61a5a2
+:fa2811e9ab19
 // converts a utf16 surrogate pair to the unicode code point it encodes
 // c0 must be the high surrogate (0xD800-0xDBFF) and
 // c1 the low surrogate (0xDC00-0xDFFF) - the caller has to check this
 // all arithmetic is done in uint32_t, so that neither a signed shift
 // nor a 16 bit intermediate value can occur
 static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
     const uint32_t high = (uint32_t) c0 - 0xD800u;
     const uint32_t low = (uint32_t) c1 - 0xDC00u;
     return (high << 10) + low + 0x10000u;
 }
+/*
+ * Decodes one JSON unicode escape at the start of str into UTF-8.
+ *
+ * str must start with "\uXXXX" (single UTF-16 unit) or with
+ * "\uXXXX\uXXXX" (surrogate pair); bytes after the escape are ignored,
+ * so str may be larger than the escape itself.
+ *
+ * The encoded bytes are written to utf8buf by codepoint_to_utf8().
+ * The caller in this file passes a 4 byte buffer.
+ *
+ * Returns the value reported by codepoint_to_utf8() (used by the caller
+ * as the number of bytes in utf8buf), or 0 when the input is too short,
+ * does not start with "\u", contains invalid hex digits, or contains an
+ * unpaired surrogate.
+ */
+static unsigned unescape_unicode_string(cxstring str, char *utf8buf) {
+    // str is supposed to start with "\uXXXX" or "\uXXXX\uXXXX"
+    // remaining bytes in the string are ignored (str may be larger!)
+    if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') {
+        return 0;
+    }
+
+    // parse the first four hex digits
+    // (a non-zero result of cx_strtou16_lc is treated as failure)
+    cxstring ustr1 = { str.ptr + 2, 4 };
+    uint16_t utf16a;
+    if (cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
+        return 0;
+    }
+
+    uint32_t codepoint;
+    if (utf16a < 0xD800 || utf16a > 0xDFFF) {
+        // character is in the Basic Multilingual Plane
+        // and encoded as a single utf16 char
+        // (the surrogate range ends at 0xDFFF, so U+E000 belongs here)
+        codepoint = utf16a;
+        return codepoint_to_utf8(codepoint, utf8buf);
+    }
+
+    if (utf16a > 0xDBFF) {
+        // low surrogate without a preceding high surrogate
+        return 0;
+    }
+
+    // character is encoded as a surrogate pair
+    // the second escape occupies the bytes 6..11, therefore
+    // a total length of exactly 12 bytes is sufficient
+    if (str.length < 12 || str.ptr[6] != '\\' || str.ptr[7] != 'u') {
+        return 0;
+    }
+
+    cxstring ustr2 = { str.ptr + 8, 4 };
+    uint16_t utf16b;
+    if (cx_strtou16_lc(ustr2, &utf16b, 16, "")) {
+        return 0;
+    }
+    if (utf16b < 0xDC00 || utf16b > 0xDFFF) {
+        // high surrogate is not followed by a low surrogate
+        return 0;
+    }
+
+    codepoint = utf16pair_to_codepoint(utf16a, utf16b);
+    return codepoint_to_utf8(codepoint, utf8buf);
+}
 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
 // note: this function expects that str contains the enclosing quotes!
 cxmutstr result;
 result.length = 0;
 } else if (c == 'f') {
 c = '\f';
 } else if (c == 'b') {
 c = '\b';
 } else if (c == 'u') {
-if (i + 4 < str.length - 1) {
+char utf8buf[4];
-cxstring ustr1 = { str.ptr + i + 1, 4};
+unsigned utf8len = unescape_unicode_string(
-uint16_t utf16a, utf16b;
+cx_strn(str.ptr + i - 1, str.length + 1 - i),
-char utf8buf[4];
+utf8buf
-unsigned utf8len = 0;
+);
-if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
+if(utf8len > 0) {
-uint32_t codepoint;
+i += utf8len < 4 ? 4 : 10;
-if (utf16a >= 0xD800 && utf16a <= 0xDFFF) {
+// add all bytes from utf8buf except the last char
-// character is encoded as a surrogate pair
+// to the result (last char will be added below)
-// get next 6 bytes
+utf8len--;
-if (i + 10 < str.length - 1) {
+c = utf8buf[utf8len];
-if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') {
+for (unsigned x = 0; x < utf8len; x++) {
-cxstring ustr2 = { str.ptr+i+7, 4 };
+result.ptr[result.length++] = utf8buf[x];
-if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
-&& utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
-codepoint = utf16pair_to_codepoint(utf16a, utf16b);
-utf8len = codepoint_to_utf8(codepoint, utf8buf);
-i += 10;
-}
-}
-}
-} else {
-// character is in the Basic Multilingual Plane
-// and encoded as a single utf16 char
-codepoint = utf16a;
-utf8len = codepoint_to_utf8(codepoint, utf8buf);
-i += 4;
-}
 }
-if(utf8len > 0) {
+} else {
-// add all bytes from utf8buf except the last char
+// decoding failed, ignore the entire sequence
-// to the result (last char will be added below)
+result.ptr[result.length++] = '\\';
-utf8len--;
-c = utf8buf[utf8len];
-for (unsigned x = 0; x < utf8len; x++) {
-result.ptr[result.length++] = utf8buf[x];
-}
-} else {
-// decoding failed, ignore the entire sequence
-result.ptr[result.length++] = '\\';
-}
 }
 } else {
 // TODO: discuss the behavior for unrecognized escape sequences
 //       most parsers throw an error here - we just ignore it
 result.ptr[result.length++] = '\\';

Mercurial > hg > ucx / file comparison

comparison: src/json.c

src/json.c