src/json.c

branch
docs/3.1
changeset 1164
148b7c7ccaf9
parent 1160
4f02c1101f2e
--- a/src/json.c	Sat Jan 25 15:22:01 2025 +0100
+++ b/src/json.c	Tue Jan 28 18:31:17 2025 +0100
@@ -353,6 +353,72 @@
     return CX_JSON_INCOMPLETE_DATA;
 }
 
+// Writes the UTF-8 encoding of codepoint to output_buf (up to 4 bytes, not
+// terminated) and returns the byte count; 0 if codepoint is above 0x10FFFF.
+static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
+    static const unsigned char lead_bits[] = {0, 0, 0xC0, 0xE0, 0xF0};
+    unsigned nbytes;
+    if (codepoint < 0x80) {
+        nbytes = 1;
+    } else if (codepoint < 0x800) {
+        nbytes = 2;
+    } else if (codepoint < 0x10000) {
+        nbytes = 3;
+    } else if (codepoint < 0x110000) {
+        nbytes = 4;
+    } else {
+        return 0; // LCOV_EXCL_LINE
+    }
+    // continuation bytes go in back to front; lead_bits[n] marks byte 0
+    for (unsigned k = nbytes - 1; k > 0; k--) {
+        output_buf[k] = (char)(0x80 | (codepoint & 0x3F));
+        codepoint >>= 6;
+    }
+    output_buf[0] = (char)(lead_bits[nbytes] | codepoint);
+    return nbytes;
+}
+
+// combines a utf16 surrogate pair (c0: high, c1: low) into its codepoint
+static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
+    return (((uint32_t)c0 - 0xD800) << 10) + (c1 - 0xDC00u) + 0x10000;
+}
+
+// Decodes the escape sequence at the start of str - either "\uXXXX" or a
+// surrogate pair "\uXXXX\uXXXX" - and writes the UTF-8 bytes to utf8buf,
+// which must have room for 4 bytes. Bytes after the sequence are ignored.
+// Returns the number of bytes written (4 if and only if a surrogate pair
+// was consumed), or 0 if the sequence is malformed.
+static unsigned unescape_unicode_string(cxstring str, char *utf8buf) {
+    if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') {
+        return 0;
+    }
+
+    cxstring ustr1 = { str.ptr + 2, 4};
+    uint16_t utf16a, utf16b;
+    if (cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
+        return 0;
+    }
+    if (utf16a < 0xD800 || utf16a > 0xDFFF) {
+        // not a surrogate: the BMP character is a single utf16 unit
+        return codepoint_to_utf8(utf16a, utf8buf);
+    }
+    if (utf16a > 0xDBFF) {
+        // a low surrogate must not come first
+        return 0;
+    }
+
+    // high surrogate: the next 6 bytes must be the escaped low surrogate
+    if (str.length < 12 || str.ptr[6] != '\\' || str.ptr[7] != 'u') {
+        return 0;
+    }
+    cxstring ustr2 = { str.ptr + 8, 4 };
+    if (cx_strtou16_lc(ustr2, &utf16b, 16, "")
+            || utf16b < 0xDC00 || utf16b > 0xDFFF) {
+        return 0;
+    }
+    return codepoint_to_utf8(utf16pair_to_codepoint(utf16a, utf16b), utf8buf);
+}
+
 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
     // note: this function expects that str contains the enclosing quotes!
 
@@ -368,6 +434,8 @@
             u = false;
             if (c == 'n') {
                 c = '\n';
+            } else if (c == '"') {
+                c = '"';
             } else if (c == 't') {
                 c = '\t';
             } else if (c == 'r') {
@@ -380,10 +448,31 @@
                 c = '\f';
             } else if (c == 'b') {
                 c = '\b';
+            } else if (c == 'u') {
+                char utf8buf[4];
+                unsigned utf8len = unescape_unicode_string(
+                    cx_strn(str.ptr + i - 1, str.length + 1 - i),
+                    utf8buf
+                );
+                if(utf8len > 0) {
+                    i += utf8len < 4 ? 4 : 10;
+                    // add all bytes from utf8buf except the last char
+                    // to the result (last char will be added below)
+                    utf8len--;
+                    c = utf8buf[utf8len];
+                    for (unsigned x = 0; x < utf8len; x++) {
+                        result.ptr[result.length++] = utf8buf[x];
+                    }
+                } else {
+                    // decoding failed, emit the backslash and the 'u' verbatim
+                    result.ptr[result.length++] = '\\';
+                }
+            } else {
+                // TODO: discuss the behavior for unrecognized escape sequences
+                //       most parsers throw an error here - we just ignore it
+                result.ptr[result.length++] = '\\';
             }
-            // TODO: support \uXXXX escape sequences
-            // TODO: discuss the behavior for unrecognized escape sequences
-            //       most parsers throw an error here
+
             result.ptr[result.length++] = c;
         } else {
             if (c == '\\') {

mercurial