implement unicode escape sequences in json unescape_string function

Sat, 25 Jan 2025 16:13:28 +0100

author
Olaf Wintermann <olaf.wintermann@gmail.com>
date
Sat, 25 Jan 2025 16:13:28 +0100
changeset 1149
df5665de7344
parent 1139
7dfa5bcf39ee
child 1150
7b0bd5e76b5d

implement unicode escape sequences in json unescape_string function

src/json.c file | annotate | diff | comparison | revisions
tests/test_json.c file | annotate | diff | comparison | revisions
--- a/src/json.c	Wed Jan 22 21:02:46 2025 +0100
+++ b/src/json.c	Sat Jan 25 16:13:28 2025 +0100
@@ -353,6 +353,43 @@
     return CX_JSON_INCOMPLETE_DATA;
 }
 
+// Encodes a Unicode code point as UTF-8.
+//
+// codepoint:  the code point to encode
+// output_buf: destination with room for at least four bytes;
+//             no terminator is written
+//
+// Returns the number of bytes written (1 to 4), or zero when the
+// code point is greater than 0x10FFFF (nothing is written then).
+//
+// note: surrogate code points (0xD800 to 0xDFFF) are not rejected here
+//       and are encoded with the three-byte form like any other value
+static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
+    if (codepoint <= 0x7F) {
+        *output_buf = (char)codepoint;
+        return 1;
+    } else if (codepoint <= 0x7FF) {
+        output_buf[0] = (char)(0xC0 | ((codepoint >> 6) & 0x1F));
+        output_buf[1] = (char)(0x80 | (codepoint & 0x3F));
+        return 2;
+    } else if (codepoint <= 0xFFFF) {
+        output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F));
+        output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
+        output_buf[2] = (char)(0x80 | (codepoint & 0x3F));
+        return 3;
+    } else if (codepoint <= 0x10FFFF) {
+        // four-byte form for code points beyond the Basic Multilingual Plane
+        output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07));
+        output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
+        output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
+        output_buf[3] = (char)(0x80 | (codepoint & 0x3F));
+        return 4;
+    }
+
+    // larger than the highest Unicode code point
+    return 0;
+}
+
 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
     // note: this function expects that str contains the enclosing quotes!
 
@@ -380,8 +398,27 @@
                 c = '\f';
             } else if (c == 'b') {
                 c = '\b';
+            } else if (c == 'u') {
+                if (i+4 < str.length) {
+                    cxstring codepoint_str = { str.ptr + i + 1, 4};
+                    uint32_t codepoint;
+                    if(!cx_strtou32_lc_(codepoint_str, &codepoint, 16, "")) {
+                        char utf8buf[4];
+                        int utf8len = codepoint_to_utf8(codepoint, utf8buf);
+                        if(utf8len > 0) {
+                            // add all bytes from utf8buf except the last char
+                            // to the result
+                            utf8len--;
+                            c = utf8buf[utf8len];
+                            for(int i=0;i<utf8len;i++) {
+                                result.ptr[result.length++] = utf8buf[i];
+                            }
+                        }
+                        i += 4;
+                    }
+                }
             }
-            // TODO: support \uXXXX escape sequences
+            
             // TODO: discuss the behavior for unrecognized escape sequences
             //       most parsers throw an error here
             result.ptr[result.length++] = c;
--- a/tests/test_json.c	Wed Jan 22 21:02:46 2025 +0100
+++ b/tests/test_json.c	Sat Jan 25 16:13:28 2025 +0100
@@ -143,6 +143,58 @@
     cxJsonDestroy(&json);
 }
 
+CX_TEST(test_json_escaped_unicode_strings) { // unicode escapes must be unescaped to UTF-8
+    cxstring text = cx_str(
+            "{\n"
+            "\"ascii\":\"\\u0041\\u0053\\u0043\\u0049\\u0049\",\n"
+            "\"unicode\":\"\\u00df\\u00DF\",\n"
+            "\"mixed\":\"mixed ä ö \\u00e4 \\u00f6\",\n"
+            "\"wide\":\"\\u03a3\\u29b0\""
+            "}"
+    );
+    // the backslashes are doubled in C, so the JSON text holds the escape sequences themselves
+    CxJson json;
+    cxJsonInit(&json, NULL);
+    CX_TEST_DO {
+        cxJsonFill(&json, text);
+        CxJsonValue *obj;
+        CxJsonStatus result = cxJsonNext(&json, &obj);
+        CX_TEST_ASSERT(result == CX_JSON_NO_ERROR);
+        CX_TEST_ASSERT(cxJsonIsObject(obj));
+        // escapes for code points below 0x80 must yield plain ASCII characters
+        CxJsonValue *ascii = cxJsonObjGet(obj, "ascii");
+        CX_TEST_ASSERT(cxJsonIsString(ascii));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(ascii),
+            CX_STR("ASCII"))
+        );
+        // lower and upper case hex digits must name the same character
+        CxJsonValue *unicode = cxJsonObjGet(obj, "unicode");
+        CX_TEST_ASSERT(cxJsonIsString(unicode));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(unicode),
+            CX_STR("ßß"))
+        );
+        // raw non-ASCII text and escapes may be mixed within one string
+        CxJsonValue *mixed = cxJsonObjGet(obj, "mixed");
+        CX_TEST_ASSERT(cxJsonIsString(mixed));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(mixed),
+            CX_STR("mixed ä ö ä ö"))
+        );
+        // 0x03a3 needs two UTF-8 bytes, 0x29b0 three; assumes UTF-8 literals - TODO confirm
+        CxJsonValue *wide = cxJsonObjGet(obj, "wide");
+        CX_TEST_ASSERT(cxJsonIsString(wide));
+        CX_TEST_ASSERT(0 == cx_strcmp(
+            cxJsonAsCxString(wide),
+            CX_STR("\u03a3\u29b0"))
+        );
+        // cleanup of the value returned by cxJsonNext
+        cxJsonValueFree(obj);
+    }
+    cxJsonDestroy(&json);
+}
+
 CX_TEST(test_json_escaped_end_of_string) {
     CxJson json;
     cxJsonInit(&json, NULL);
@@ -1042,6 +1094,7 @@
     cx_test_register(suite, test_json_init_default);
     cx_test_register(suite, test_json_simple_object);
     cx_test_register(suite, test_json_escaped_strings);
+    cx_test_register(suite, test_json_escaped_unicode_strings);
     cx_test_register(suite, test_json_escaped_end_of_string);
     cx_test_register(suite, test_json_object_incomplete_token);
     cx_test_register(suite, test_json_token_wrongly_completed);

mercurial