src/json.c

changeset 1152
e4af44b488bc
parent 1151
60113356a7de
child 1156
96f16b5a0029
--- a/src/json.c	Sat Jan 25 16:27:48 2025 +0100
+++ b/src/json.c	Sun Jan 26 10:23:32 2025 +0100
@@ -353,6 +353,7 @@
     return CX_JSON_INCOMPLETE_DATA;
 }
 
+// converts a unicode codepoint (up to U+10FFFF) to utf8
 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
     if (codepoint <= 0x7F) {
         *output_buf = (char)codepoint;
@@ -366,11 +367,22 @@
         output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
         output_buf[2] = (char)(0x80 | (codepoint & 0x3F));
         return 3;
+    } else if (codepoint <= 0x10FFFF) {
+        output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07));
+        output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
+        output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
+        output_buf[3] = (char)(0x80 | (codepoint & 0x3F));
+        return 4;
     }
     
     return 0;
 }
 
+// converts a utf16 surrogate pair (c0: high, c1: low; not validated here) to a unicode codepoint
+static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
+    return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000;
+}
+
 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
     // note: this function expects that str contains the enclosing quotes!
 
@@ -402,16 +414,36 @@
                 if (i+4 < str.length - 1) {
                     cxstring codepoint_str = { str.ptr + i + 1, 4};
                     uint32_t codepoint;
-                    if(!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) {
+                    if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) {
                         char utf8buf[4];
-                        int utf8len = codepoint_to_utf8(codepoint, utf8buf);
+                        int utf8len = 0;
+                        if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+                            // character is encoded as a surrogate pair
+                            // get next 6 bytes
+                            if (i + 10 < str.length - 1) {
+                                char *surrogate2 = str.ptr+i+5;
+                                if (surrogate2[0] == '\\' && surrogate2[1] == 'u') {
+                                    cxstring c2_str = { surrogate2 + 2, 4 };
+                                    uint32_t c2;
+                                    if (!cx_strtou32_lc(c2_str, &c2, 16, "")) {
+                                        codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2);
+                                        utf8len = codepoint_to_utf8(codepoint, utf8buf);
+                                        i += 6;
+                                    }
+                                }
+                            }
+                        } else {
+                            // character is in the Basic Multilingual Plane
+                            // and encoded as a single utf16 char
+                            utf8len = codepoint_to_utf8(codepoint, utf8buf);
+                        }
                         if(utf8len > 0) {
                             // add all bytes from utf8buf expect the last char
                             // to the result
                             utf8len--;
                             c = utf8buf[utf8len];
-                            for(int i=0;i<utf8len;i++) {
-                                result.ptr[result.length++] = utf8buf[i];
+                            for(int x=0;x<utf8len;x++) {
+                                result.ptr[result.length++] = utf8buf[x];
                             }
                         }
                         i += 4;

mercurial