implement decoder for utf16 surrogate pairs in unescape_string

Sun, 26 Jan 2025 10:23:32 +0100

author
Olaf Wintermann <olaf.wintermann@gmail.com>
date
Sun, 26 Jan 2025 10:23:32 +0100
changeset 1152
e4af44b488bc
parent 1151
60113356a7de
child 1153
8e4f46349e23

implement decoder for utf16 surrogate pairs in unescape_string

src/json.c file | annotate | diff | comparison | revisions
tests/test_json.c file | annotate | diff | comparison | revisions
--- a/src/json.c	Sat Jan 25 16:27:48 2025 +0100
+++ b/src/json.c	Sun Jan 26 10:23:32 2025 +0100
@@ -353,6 +353,7 @@
     return CX_JSON_INCOMPLETE_DATA;
 }
 
+// converts a unicode codepoint (up to U+10FFFF) to utf8
 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
     if (codepoint <= 0x7F) {
         *output_buf = (char)codepoint;
@@ -366,11 +367,22 @@
         output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
         output_buf[2] = (char)(0x80 | (codepoint & 0x3F));
         return 3;
+    } else if (codepoint <= 0x10FFFF) {
+        output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07));
+        output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
+        output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
+        output_buf[3] = (char)(0x80 | (codepoint & 0x3F));
+        return 4;
     }
     
     return 0;
 }
 
+// converts a utf16 surrogate pair (c0: high, c1: low; not validated) to a unicode codepoint
+static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
+    return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000;
+}
+
 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
     // note: this function expects that str contains the enclosing quotes!
 
@@ -402,16 +414,36 @@
                 if (i+4 < str.length - 1) {
                     cxstring codepoint_str = { str.ptr + i + 1, 4};
                     uint32_t codepoint;
-                    if(!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) {
+                    if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) {
                         char utf8buf[4];
-                        int utf8len = codepoint_to_utf8(codepoint, utf8buf);
+                        int utf8len = 0;
+                        if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+                            // character is encoded as a surrogate pair
+                            // get next 6 bytes
+                            if (i + 10 < str.length - 1) {
+                                char *surrogate2 = str.ptr+i+5;
+                                if (surrogate2[0] == '\\' && surrogate2[1] == 'u') {
+                                    cxstring c2_str = { surrogate2 + 2, 4 };
+                                    uint32_t c2;
+                                    if (!cx_strtou32_lc(c2_str, &c2, 16, "")) {
+                                        codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2);
+                                        utf8len = codepoint_to_utf8(codepoint, utf8buf);
+                                        i += 6;
+                                    }
+                                }
+                            }
+                        } else {
+                            // character is in the Basic Multilingual Plane
+                            // and encoded as a single utf16 char
+                            utf8len = codepoint_to_utf8(codepoint, utf8buf);
+                        }
                         if(utf8len > 0) {
                             // add all bytes from utf8buf expect the last char
                             // to the result
                             utf8len--;
                             c = utf8buf[utf8len];
-                            for(int i=0;i<utf8len;i++) {
-                                result.ptr[result.length++] = utf8buf[i];
+                            for(int x=0;x<utf8len;x++) {
+                                result.ptr[result.length++] = utf8buf[x];
                             }
                         }
                         i += 4;
--- a/tests/test_json.c	Sat Jan 25 16:27:48 2025 +0100
+++ b/tests/test_json.c	Sun Jan 26 10:23:32 2025 +0100
@@ -149,7 +149,10 @@
             "\"ascii\":\"\\u0041\\u0053\\u0043\\u0049\\u0049\",\n"
             "\"unicode\":\"\\u00df\\u00DF\",\n"
             "\"mixed\":\"mixed ä ö \\u00e4 \\u00f6\",\n"
-            "\"wide\":\"\\u03a3\\u29b0\""
+            "\"wide\":\"\\u03a3\\u29b0\",\n"
+            "\"surrogatepair1\":\"\\ud83e\\udff5\",\n"
+            "\"surrogatepair2\":\"test\\ud83e\\udff1AA\"\n,"
+            "\"mixed2\":\"123\\u03a3\\ud83e\\udfc5\\u00df\""
             "}"
     );
 
@@ -190,6 +193,27 @@
             CX_STR("\u03a3\u29b0"))
         );
         
+        CxJsonValue *surrogatepair1 = cxJsonObjGet(obj, "surrogatepair1");
+        CX_TEST_ASSERT(cxJsonIsString(surrogatepair1));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(surrogatepair1),
+            CX_STR("\xf0\x9f\xaf\xb5"))
+        );
+        
+        CxJsonValue *surrogatepair2 = cxJsonObjGet(obj, "surrogatepair2");
+        CX_TEST_ASSERT(cxJsonIsString(surrogatepair2));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(surrogatepair2),
+            CX_STR("test\xf0\x9f\xaf\xb1" "AA"))
+        );
+        
+        CxJsonValue *mixed2 = cxJsonObjGet(obj, "mixed2");
+        CX_TEST_ASSERT(cxJsonIsString(mixed2));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(mixed2),
+            CX_STR("123\u03a3\xf0\x9f\xaf\x85ß"))
+        );
+        
         cxJsonValueFree(obj);
     }
     cxJsonDestroy(&json);

mercurial