// Combines a UTF-16 surrogate pair into the Unicode code point it encodes.
//
// c0 is the high (leading) surrogate, c1 the low (trailing) surrogate.
// The arguments are not validated here; the formula is only meaningful for
// c0 in 0xD800..0xDBFF and c1 in 0xDC00..0xDFFF, which yields a code point
// in 0x10000..0x10FFFF.
static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
    // do the arithmetic in uint32_t: uint16_t operands would otherwise be
    // promoted to int, and left-shifting a negative int is undefined
    uint32_t high = (uint32_t) c0 - 0xD800u;
    uint32_t low = (uint32_t) c1 - 0xDC00u;
    return (high << 10) + low + 0x10000u;
}
385 |
385 |
|
386 static unsigned unescape_unicode_string(cxstring str, char *utf8buf) { |
|
387 // str is supposed to start with "\uXXXX" or "\uXXXX\uXXXX" |
|
388 // remaining bytes in the string are ignored (str may be larger!) |
|
389 |
|
390 if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') { |
|
391 return 0; |
|
392 } |
|
393 |
|
394 unsigned utf8len = 0; |
|
395 cxstring ustr1 = { str.ptr + 2, 4}; |
|
396 uint16_t utf16a, utf16b; |
|
397 if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) { |
|
398 uint32_t codepoint; |
|
399 if (utf16a < 0xD800 || utf16a > 0xE000) { |
|
400 // character is in the Basic Multilingual Plane |
|
401 // and encoded as a single utf16 char |
|
402 codepoint = utf16a; |
|
403 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
|
404 } else if (utf16a >= 0xD800 && utf16a <= 0xDBFF) { |
|
405 // character is encoded as a surrogate pair |
|
406 // get next 6 bytes |
|
407 if (str.length > 12) { |
|
408 if (*(str.ptr+6) == '\\' && *(str.ptr+7) == 'u') { |
|
409 cxstring ustr2 = { str.ptr+8, 4 }; |
|
410 if (!cx_strtou16_lc(ustr2, &utf16b, 16, "") |
|
411 && utf16b >= 0xDC00 && utf16b <= 0xDFFF) { |
|
412 codepoint = utf16pair_to_codepoint(utf16a, utf16b); |
|
413 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
|
414 } |
|
415 } |
|
416 } |
|
417 } |
|
418 } |
|
419 return utf8len; |
|
420 } |
|
421 |
386 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { |
422 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { |
387 // note: this function expects that str contains the enclosing quotes! |
423 // note: this function expects that str contains the enclosing quotes! |
388 |
424 |
389 cxmutstr result; |
425 cxmutstr result; |
390 result.length = 0; |
426 result.length = 0; |
411 } else if (c == 'f') { |
447 } else if (c == 'f') { |
412 c = '\f'; |
448 c = '\f'; |
413 } else if (c == 'b') { |
449 } else if (c == 'b') { |
414 c = '\b'; |
450 c = '\b'; |
415 } else if (c == 'u') { |
451 } else if (c == 'u') { |
416 if (i + 4 < str.length - 1) { |
452 char utf8buf[4]; |
417 cxstring ustr1 = { str.ptr + i + 1, 4}; |
453 unsigned utf8len = unescape_unicode_string( |
418 uint16_t utf16a, utf16b; |
454 cx_strn(str.ptr + i - 1, str.length + 1 - i), |
419 char utf8buf[4]; |
455 utf8buf |
420 unsigned utf8len = 0; |
456 ); |
421 if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) { |
457 if(utf8len > 0) { |
422 uint32_t codepoint; |
458 i += utf8len < 4 ? 4 : 10; |
423 if (utf16a >= 0xD800 && utf16a <= 0xDFFF) { |
459 // add all bytes from utf8buf except the last char |
424 // character is encoded as a surrogate pair |
460 // to the result (last char will be added below) |
425 // get next 6 bytes |
461 utf8len--; |
426 if (i + 10 < str.length - 1) { |
462 c = utf8buf[utf8len]; |
427 if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') { |
463 for (unsigned x = 0; x < utf8len; x++) { |
428 cxstring ustr2 = { str.ptr+i+7, 4 }; |
464 result.ptr[result.length++] = utf8buf[x]; |
429 if (!cx_strtou16_lc(ustr2, &utf16b, 16, "") |
|
430 && utf16b >= 0xDC00 && utf16b <= 0xDFFF) { |
|
431 codepoint = utf16pair_to_codepoint(utf16a, utf16b); |
|
432 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
|
433 i += 10; |
|
434 } |
|
435 } |
|
436 } |
|
437 } else { |
|
438 // character is in the Basic Multilingual Plane |
|
439 // and encoded as a single utf16 char |
|
440 codepoint = utf16a; |
|
441 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
|
442 i += 4; |
|
443 } |
|
444 } |
465 } |
445 if(utf8len > 0) { |
466 } else { |
446 // add all bytes from utf8buf except the last char |
467 // decoding failed, ignore the entire sequence |
447 // to the result (last char will be added below) |
468 result.ptr[result.length++] = '\\'; |
448 utf8len--; |
|
449 c = utf8buf[utf8len]; |
|
450 for (unsigned x = 0; x < utf8len; x++) { |
|
451 result.ptr[result.length++] = utf8buf[x]; |
|
452 } |
|
453 } else { |
|
454 // decoding failed, ignore the entire sequence |
|
455 result.ptr[result.length++] = '\\'; |
|
456 } |
|
457 } |
469 } |
458 } else { |
470 } else { |
459 // TODO: discuss the behavior for unrecognized escape sequences |
471 // TODO: discuss the behavior for unrecognized escape sequences |
460 // most parsers throw an error here - we just ignore it |
472 // most parsers throw an error here - we just ignore it |
461 result.ptr[result.length++] = '\\'; |
473 result.ptr[result.length++] = '\\'; |