351 } |
351 } |
352 |
352 |
353 return CX_JSON_INCOMPLETE_DATA; |
353 return CX_JSON_INCOMPLETE_DATA; |
354 } |
354 } |
355 |
355 |
356 // converts a unicode (up to U+FFFF) codepoint to utf8 |
356 // converts a Unicode codepoint to utf8 |
357 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { |
357 static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) { |
358 if (codepoint <= 0x7F) { |
358 if (codepoint <= 0x7F) { |
359 *output_buf = (char)codepoint; |
359 *output_buf = (char)codepoint; |
360 return 1; |
360 return 1; |
361 } else if (codepoint <= 0x7FF) { |
361 } else if (codepoint <= 0x7FF) { |
362 output_buf[0] = (char)(0xC0 | ((codepoint >> 6) & 0x1F)); |
362 output_buf[0] = (char)(0xC0 | ((codepoint >> 6) & 0x1F)); |
409 } else if (c == 'f') { |
411 } else if (c == 'f') { |
410 c = '\f'; |
412 c = '\f'; |
411 } else if (c == 'b') { |
413 } else if (c == 'b') { |
412 c = '\b'; |
414 c = '\b'; |
413 } else if (c == 'u') { |
415 } else if (c == 'u') { |
414 if (i+4 < str.length - 1) { |
416 if (i + 4 < str.length - 1) { |
415 cxstring codepoint_str = { str.ptr + i + 1, 4}; |
417 cxstring ustr1 = { str.ptr + i + 1, 4}; |
416 uint32_t codepoint; |
418 uint16_t utf16a, utf16b; |
417 if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { |
419 char utf8buf[4]; |
418 char utf8buf[4]; |
420 unsigned utf8len = 0; |
419 int utf8len = 0; |
421 if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) { |
420 if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { |
422 uint32_t codepoint; |
|
423 if (utf16a >= 0xD800 && utf16a <= 0xDFFF) { |
421 // character is encoded as a surrogate pair |
424 // character is encoded as a surrogate pair |
422 // get next 6 bytes |
425 // get next 6 bytes |
423 if (i + 10 < str.length - 1) { |
426 if (i + 10 < str.length - 1) { |
424 char *surrogate2 = str.ptr+i+5; |
427 if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') { |
425 if (surrogate2[0] == '\\' && surrogate2[1] == 'u') { |
428 cxstring ustr2 = { str.ptr+i+7, 4 }; |
426 cxstring c2_str = { surrogate2 + 2, 4 }; |
429 if (!cx_strtou16_lc(ustr2, &utf16b, 16, "") |
427 uint32_t c2; |
430 && utf16b >= 0xDC00 && utf16b <= 0xDFFF) { |
428 if (!cx_strtou32_lc(c2_str, &c2, 16, "")) { |
431 codepoint = utf16pair_to_codepoint(utf16a, utf16b); |
429 codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2); |
|
430 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
432 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
431 i += 6; |
433 i += 10; |
432 } |
434 } |
433 } |
435 } |
434 } |
436 } |
435 } else { |
437 } else { |
436 // character is in the Basic Multilingual Plane |
438 // character is in the Basic Multilingual Plane |
437 // and encoded as a single utf16 char |
439 // and encoded as a single utf16 char |
|
440 codepoint = utf16a; |
438 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
441 utf8len = codepoint_to_utf8(codepoint, utf8buf); |
|
442 i += 4; |
439 } |
443 } |
440 if(utf8len > 0) { |
444 } |
441 // add all bytes from utf8buf expect the last char |
445 if(utf8len > 0) { |
442 // to the result |
446 // add all bytes from utf8buf except the last char |
443 utf8len--; |
447 // to the result (last char will be added below) |
444 c = utf8buf[utf8len]; |
448 utf8len--; |
445 for(int x=0;x<utf8len;x++) { |
449 c = utf8buf[utf8len]; |
446 result.ptr[result.length++] = utf8buf[x]; |
450 for (unsigned x = 0; x < utf8len; x++) { |
447 } |
451 result.ptr[result.length++] = utf8buf[x]; |
448 } |
452 } |
449 i += 4; |
453 } else { |
|
454 // decoding failed, ignore the entire sequence |
|
455 result.ptr[result.length++] = '\\'; |
450 } |
456 } |
451 } |
457 } |
452 } |
458 } else { |
453 |
459 // TODO: discuss the behavior for unrecognized escape sequences |
454 // TODO: discuss the behavior for unrecognized escape sequences |
460 // most parsers throw an error here - we just ignore it |
455 // most parsers throw an error here |
461 result.ptr[result.length++] = '\\'; |
|
462 } |
|
463 |
456 result.ptr[result.length++] = c; |
464 result.ptr[result.length++] = c; |
457 } else { |
465 } else { |
458 if (c == '\\') { |
466 if (c == '\\') { |
459 u = true; |
467 u = true; |
460 } else { |
468 } else { |