351 } |
351 } |
352 |
352 |
353 return CX_JSON_INCOMPLETE_DATA; |
353 return CX_JSON_INCOMPLETE_DATA; |
354 } |
354 } |
355 |
355 |
|
// encodes a Unicode codepoint as utf8
//
// output_buf receives between 1 and 4 bytes, no terminator is written
//
// returns the number of bytes written, or 0 (writing nothing) when the
// codepoint is greater than 0x10FFFF
static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
    if (codepoint > 0x10FFFF) {
        return 0; // LCOV_EXCL_LINE
    }

    // choose the sequence length and the marker bits of the lead byte
    unsigned nbytes;
    unsigned lead;
    if (codepoint < 0x80) {
        nbytes = 1;
        lead = 0x00;
    } else if (codepoint < 0x800) {
        nbytes = 2;
        lead = 0xC0;
    } else if (codepoint < 0x10000) {
        nbytes = 3;
        lead = 0xE0;
    } else {
        nbytes = 4;
        lead = 0xF0;
    }

    // continuation bytes are filled back to front, six payload bits each
    uint32_t rest = codepoint;
    for (unsigned pos = nbytes - 1; pos > 0; pos--) {
        output_buf[pos] = (char)(0x80 | (rest & 0x3F));
        rest >>= 6;
    }

    // the remaining bits always fit into the payload area of the lead byte
    output_buf[0] = (char)(lead | rest);
    return nbytes;
}
|
380 |
|
// converts a utf16 surrogate pair to a Unicode codepoint
//
// c0 is expected to be a high surrogate (0xD800-0xDBFF) and
// c1 a low surrogate (0xDC00-0xDFFF); the ranges are not checked here
static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
    // compute in uint32_t: uint16_t operands would be promoted to signed int,
    // and shifting a negative difference left is undefined behavior
    const uint32_t high = (uint32_t)c0 - 0xD800u;
    const uint32_t low = (uint32_t)c1 - 0xDC00u;
    return (high << 10) + low + 0x10000u;
}
|
385 |
|
386 static unsigned unescape_unicode_string(cxstring str, char *utf8buf) { |
|
387 // str is supposed to start with "\uXXXX" or "\uXXXX\uXXXX" |
|
388 // remaining bytes in the string are ignored (str may be larger!) |
|
389 |
|
390 if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') { |
|
391 return 0; |
|
392 } |
|
393 |
|
394 unsigned utf8len = 0; |
|
395 cxstring ustr1 = { str.ptr + 2, 4}; |
|
396 uint16_t utf16a, utf16b; |
|
397 if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) { |
|
398 uint32_t codepoint; |
|
399 if (utf16a < 0xD800 || utf16a > 0xE000) { |
|
400 // character is in the Basic Multilingual Plane |
|
401 // and encoded as a single utf16 char |
|
402 codepoint = utf16a; |
|
403 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
|
404 } else if (utf16a >= 0xD800 && utf16a <= 0xDBFF) { |
|
405 // character is encoded as a surrogate pair |
|
406 // get next 6 bytes |
|
407 if (str.length >= 12) { |
|
408 if (str.ptr[6] == '\\' && str.ptr[7] == 'u') { |
|
409 cxstring ustr2 = { str.ptr+8, 4 }; |
|
410 if (!cx_strtou16_lc(ustr2, &utf16b, 16, "") |
|
411 && utf16b >= 0xDC00 && utf16b <= 0xDFFF) { |
|
412 codepoint = utf16pair_to_codepoint(utf16a, utf16b); |
|
413 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
|
414 } |
|
415 } |
|
416 } |
|
417 } |
|
418 } |
|
419 return utf8len; |
|
420 } |
|
421 |
356 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { |
422 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { |
357 // note: this function expects that str contains the enclosing quotes! |
423 // note: this function expects that str contains the enclosing quotes! |
358 |
424 |
359 cxmutstr result; |
425 cxmutstr result; |
360 result.length = 0; |
426 result.length = 0; |
378 c = '/'; // always unescape, we don't need settings here |
446 c = '/'; // always unescape, we don't need settings here |
379 } else if (c == 'f') { |
447 } else if (c == 'f') { |
380 c = '\f'; |
448 c = '\f'; |
381 } else if (c == 'b') { |
449 } else if (c == 'b') { |
382 c = '\b'; |
450 c = '\b'; |
383 } |
451 } else if (c == 'u') { |
384 // TODO: support \uXXXX escape sequences |
452 char utf8buf[4]; |
385 // TODO: discuss the behavior for unrecognized escape sequences |
453 unsigned utf8len = unescape_unicode_string( |
386 // most parsers throw an error here |
454 cx_strn(str.ptr + i - 1, str.length + 1 - i), |
|
455 utf8buf |
|
456 ); |
|
457 if(utf8len > 0) { |
|
458 i += utf8len < 4 ? 4 : 10; |
|
459 // add all bytes from utf8buf except the last char |
|
460 // to the result (last char will be added below) |
|
461 utf8len--; |
|
462 c = utf8buf[utf8len]; |
|
463 for (unsigned x = 0; x < utf8len; x++) { |
|
464 result.ptr[result.length++] = utf8buf[x]; |
|
465 } |
|
466 } else { |
|
467 // decoding failed, ignore the entire sequence |
|
468 result.ptr[result.length++] = '\\'; |
|
469 } |
|
470 } else { |
|
471 // TODO: discuss the behavior for unrecognized escape sequences |
|
472 // most parsers throw an error here - we just ignore it |
|
473 result.ptr[result.length++] = '\\'; |
|
474 } |
|
475 |
387 result.ptr[result.length++] = c; |
476 result.ptr[result.length++] = c; |
388 } else { |
477 } else { |
389 if (c == '\\') { |
478 if (c == '\\') { |
390 u = true; |
479 u = true; |
391 } else { |
480 } else { |