ucx: comparison src/json.c

-:b77d56a27e9c
+:96f16b5a0029
 }
 return CX_JSON_INCOMPLETE_DATA;
 }
-// converts a unicode (up to U+FFFF) codepoint to utf8
+// converts a Unicode codepoint to utf8
-static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
+static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
 if (codepoint <= 0x7F) {
 *output_buf = (char)codepoint;
 return 1;
 } else if (codepoint <= 0x7FF) {
 output_buf[0] = (char)(0xC0 | ((codepoint >> 6) & 0x1F));
 output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
 output_buf[3] = (char)(0x80 | (codepoint & 0x3F));
 return 4;
 }
-return 0;
+return 0; // LCOV_EXCL_LINE
 }
 // converts a utf16 surrogate pair to a Unicode codepoint
 static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
 return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000;
 char c = str.ptr[i];
 if (u) {
 u = false;
 if (c == 'n') {
 c = '\n';
+} else if (c == '"') {
+c = '"';
 } else if (c == 't') {
 c = '\t';
 } else if (c == 'r') {
 c = '\r';
 } else if (c == '\\') {
 } else if (c == 'f') {
 c = '\f';
 } else if (c == 'b') {
 c = '\b';
 } else if (c == 'u') {
-if (i+4 < str.length - 1) {
+if (i + 4 < str.length - 1) {
-cxstring codepoint_str = { str.ptr + i + 1, 4};
+cxstring ustr1 = { str.ptr + i + 1, 4};
-uint32_t codepoint;
+uint16_t utf16a, utf16b;
-if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) {
+char utf8buf[4];
-char utf8buf[4];
+unsigned utf8len = 0;
-int utf8len = 0;
+if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
-if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+uint32_t codepoint;
+if (utf16a >= 0xD800 && utf16a <= 0xDFFF) {
 // character is encoded as a surrogate pair
 // get next 6 bytes
 if (i + 10 < str.length - 1) {
-char *surrogate2 = str.ptr+i+5;
+if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') {
-if (surrogate2[0] == '\\' && surrogate2[1] == 'u') {
+cxstring ustr2 = { str.ptr+i+7, 4 };
-cxstring c2_str = { surrogate2 + 2, 4 };
+if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
-uint32_t c2;
+&& utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
-if (!cx_strtou32_lc(c2_str, &c2, 16, "")) {
+codepoint = utf16pair_to_codepoint(utf16a, utf16b);
-codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2);
 utf8len = codepoint_to_utf8(codepoint, utf8buf);
-i += 6;
+i += 10;
 }
 }
 }
 } else {
 // character is in the Basic Multilingual Plane
 // and encoded as a single utf16 char
+codepoint = utf16a;
 utf8len = codepoint_to_utf8(codepoint, utf8buf);
+i += 4;
 }
-if(utf8len > 0) {
+}
-// add all bytes from utf8buf expect the last char
+if(utf8len > 0) {
-// to the result
+// add all bytes from utf8buf except the last char
-utf8len--;
+// to the result (last char will be added below)
-c = utf8buf[utf8len];
+utf8len--;
-for(int x=0;x<utf8len;x++) {
+c = utf8buf[utf8len];
-result.ptr[result.length++] = utf8buf[x];
+for (unsigned x = 0; x < utf8len; x++) {
-}
+result.ptr[result.length++] = utf8buf[x];
 }
-i += 4;
+} else {
+// decoding failed, ignore the entire sequence
+result.ptr[result.length++] = '\\';
 }
 }
-}
+} else {
+// TODO: discuss the behavior for unrecognized escape sequences
-// TODO: discuss the behavior for unrecognized escape sequences
+//       most parsers throw an error here - we just ignore it
-//       most parsers throw an error here
+result.ptr[result.length++] = '\\';
+}
 result.ptr[result.length++] = c;
 } else {
 if (c == '\\') {
 u = true;
 } else {

Mercurial > hg > ucx / file comparison

comparison: src/json.c

src/json.c