ucx: comparison src/json.c

-:8ff82697f2c3
+:148b7c7ccaf9
 }
 return CX_JSON_INCOMPLETE_DATA;
 }
// Encodes a single Unicode codepoint as UTF-8.
//
// codepoint   the codepoint to encode
// output_buf  destination buffer; up to 4 bytes are written,
//             no terminator is appended
//
// Returns the number of bytes written (1 to 4), or 0 when the value
// cannot be represented in UTF-8 (a surrogate in U+D800..U+DFFF or a
// value above U+10FFFF). Nothing is written in that case.
static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
    if (codepoint <= 0x7F) {
        // 0xxxxxxx
        *output_buf = (char)codepoint;
        return 1;
    } else if (codepoint <= 0x7FF) {
        // 110xxxxx 10xxxxxx
        output_buf[0] = (char)(0xC0 | ((codepoint >> 6) & 0x1F));
        output_buf[1] = (char)(0x80 | (codepoint & 0x3F));
        return 2;
    } else if (codepoint <= 0xFFFF) {
        // surrogates are UTF-16 code units, not characters;
        // encoding them would produce ill-formed UTF-8
        if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
            return 0;
        }
        // 1110xxxx 10xxxxxx 10xxxxxx
        output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F));
        output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        output_buf[2] = (char)(0x80 | (codepoint & 0x3F));
        return 3;
    } else if (codepoint <= 0x10FFFF) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07));
        output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
        output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        output_buf[3] = (char)(0x80 | (codepoint & 0x3F));
        return 4;
    }

    return 0; // LCOV_EXCL_LINE
}
// Combines a UTF-16 surrogate pair into the Unicode codepoint it encodes.
//
// c0  the high (leading) surrogate, expected in 0xD800..0xDBFF
// c1  the low (trailing) surrogate, expected in 0xDC00..0xDFFF
//
// The ranges are not checked here; the caller must validate them.
// For a valid pair the result lies in U+10000..U+10FFFF.
static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
    // widen before subtracting so the arithmetic is unsigned instead of
    // happening in (signed) int after integer promotion
    uint32_t high = (uint32_t)c0 - 0xD800u;
    uint32_t low = (uint32_t)c1 - 0xDC00u;
    return (high << 10) + low + 0x10000u;
}
+static unsigned unescape_unicode_string(cxstring str, char *utf8buf) {
+// str is supposed to start with "\uXXXX" or "\uXXXX\uXXXX"
+// remaining bytes in the string are ignored (str may be larger!)
+if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') {
+return 0;
+}
+unsigned utf8len = 0;
+cxstring ustr1 = { str.ptr + 2, 4};
+uint16_t utf16a, utf16b;
+if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
+uint32_t codepoint;
+if (utf16a < 0xD800 || utf16a > 0xE000) {
+// character is in the Basic Multilingual Plane
+// and encoded as a single utf16 char
+codepoint = utf16a;
+utf8len = codepoint_to_utf8(codepoint, utf8buf);
+} else if (utf16a >= 0xD800 && utf16a <= 0xDBFF) {
+// character is encoded as a surrogate pair
+// get next 6 bytes
+if (str.length >= 12) {
+if (str.ptr[6] == '\\' && str.ptr[7] == 'u') {
+cxstring ustr2 = { str.ptr+8, 4 };
+if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
+&& utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
+codepoint = utf16pair_to_codepoint(utf16a, utf16b);
+utf8len = codepoint_to_utf8(codepoint, utf8buf);
+}
+}
+}
+}
+}
+return utf8len;
+}
 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
 // note: this function expects that str contains the enclosing quotes!
 cxmutstr result;
 result.length = 0;
 char c = str.ptr[i];
 if (u) {
 u = false;
 if (c == 'n') {
 c = '\n';
+} else if (c == '"') {
+c = '"';
 } else if (c == 't') {
 c = '\t';
 } else if (c == 'r') {
 c = '\r';
 } else if (c == '\\') {
 c = '/'; // always unescape, we don't need settings here
 } else if (c == 'f') {
 c = '\f';
 } else if (c == 'b') {
 c = '\b';
-}
+} else if (c == 'u') {
-// TODO: support \uXXXX escape sequences
+char utf8buf[4];
-// TODO: discuss the behavior for unrecognized escape sequences
+unsigned utf8len = unescape_unicode_string(
-//       most parsers throw an error here
+cx_strn(str.ptr + i - 1, str.length + 1 - i),
+utf8buf
+);
+if(utf8len > 0) {
+i += utf8len < 4 ? 4 : 10;
+// add all bytes from utf8buf except the last char
+// to the result (last char will be added below)
+utf8len--;
+c = utf8buf[utf8len];
+for (unsigned x = 0; x < utf8len; x++) {
+result.ptr[result.length++] = utf8buf[x];
+}
+} else {
+// decoding failed, ignore the entire sequence
+result.ptr[result.length++] = '\\';
+}
+} else {
+// TODO: discuss the behavior for unrecognized escape sequences
+//       most parsers throw an error here - we just ignore it
+result.ptr[result.length++] = '\\';
+}
 result.ptr[result.length++] = c;
 } else {
 if (c == '\\') {
 u = true;
 } else {

Mercurial > hg > ucx / file comparison

comparison: src/json.c

src/json.c