src/json.c

changeset 1158
fa2811e9ab19
parent 1156
96f16b5a0029
child 1159
48279dbe4811
equal deleted inserted replaced
1157:3565ae61a5a2 1158:fa2811e9ab19
// Combines a UTF-16 surrogate pair into the Unicode code point it encodes.
// c0 is the high (leading) surrogate and c1 the low (trailing) surrogate;
// the caller is responsible for having validated both ranges.
static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
    // the high surrogate carries the upper ten bits of the offset
    const int high_bits = (c0 - 0xD800) << 10;
    // the low surrogate carries the lower ten bits of the offset
    const int low_bits = c1 - 0xDC00;
    // surrogate pairs address the planes starting at U+10000
    return high_bits + low_bits + 0x10000;
}
385 385
386 static unsigned unescape_unicode_string(cxstring str, char *utf8buf) {
387 // str is supposed to start with "\uXXXX" or "\uXXXX\uXXXX"
388 // remaining bytes in the string are ignored (str may be larger!)
389
390 if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') {
391 return 0;
392 }
393
394 unsigned utf8len = 0;
395 cxstring ustr1 = { str.ptr + 2, 4};
396 uint16_t utf16a, utf16b;
397 if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
398 uint32_t codepoint;
399 if (utf16a < 0xD800 || utf16a > 0xE000) {
400 // character is in the Basic Multilingual Plane
401 // and encoded as a single utf16 char
402 codepoint = utf16a;
403 utf8len = codepoint_to_utf8(codepoint, utf8buf);
404 } else if (utf16a >= 0xD800 && utf16a <= 0xDBFF) {
405 // character is encoded as a surrogate pair
406 // get next 6 bytes
407 if (str.length > 12) {
408 if (*(str.ptr+6) == '\\' && *(str.ptr+7) == 'u') {
409 cxstring ustr2 = { str.ptr+8, 4 };
410 if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
411 && utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
412 codepoint = utf16pair_to_codepoint(utf16a, utf16b);
413 utf8len = codepoint_to_utf8(codepoint, utf8buf);
414 }
415 }
416 }
417 }
418 }
419 return utf8len;
420 }
421
386 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { 422 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
387 // note: this function expects that str contains the enclosing quotes! 423 // note: this function expects that str contains the enclosing quotes!
388 424
389 cxmutstr result; 425 cxmutstr result;
390 result.length = 0; 426 result.length = 0;
411 } else if (c == 'f') { 447 } else if (c == 'f') {
412 c = '\f'; 448 c = '\f';
413 } else if (c == 'b') { 449 } else if (c == 'b') {
414 c = '\b'; 450 c = '\b';
415 } else if (c == 'u') { 451 } else if (c == 'u') {
416 if (i + 4 < str.length - 1) { 452 char utf8buf[4];
417 cxstring ustr1 = { str.ptr + i + 1, 4}; 453 unsigned utf8len = unescape_unicode_string(
418 uint16_t utf16a, utf16b; 454 cx_strn(str.ptr + i - 1, str.length + 1 - i),
419 char utf8buf[4]; 455 utf8buf
420 unsigned utf8len = 0; 456 );
421 if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) { 457 if(utf8len > 0) {
422 uint32_t codepoint; 458 i += utf8len < 4 ? 4 : 10;
423 if (utf16a >= 0xD800 && utf16a <= 0xDFFF) { 459 // add all bytes from utf8buf except the last char
424 // character is encoded as a surrogate pair 460 // to the result (last char will be added below)
425 // get next 6 bytes 461 utf8len--;
426 if (i + 10 < str.length - 1) { 462 c = utf8buf[utf8len];
427 if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') { 463 for (unsigned x = 0; x < utf8len; x++) {
428 cxstring ustr2 = { str.ptr+i+7, 4 }; 464 result.ptr[result.length++] = utf8buf[x];
429 if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
430 && utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
431 codepoint = utf16pair_to_codepoint(utf16a, utf16b);
432 utf8len = codepoint_to_utf8(codepoint, utf8buf);
433 i += 10;
434 }
435 }
436 }
437 } else {
438 // character is in the Basic Multilingual Plane
439 // and encoded as a single utf16 char
440 codepoint = utf16a;
441 utf8len = codepoint_to_utf8(codepoint, utf8buf);
442 i += 4;
443 }
444 } 465 }
445 if(utf8len > 0) { 466 } else {
446 // add all bytes from utf8buf except the last char 467 // decoding failed, ignore the entire sequence
447 // to the result (last char will be added below) 468 result.ptr[result.length++] = '\\';
448 utf8len--;
449 c = utf8buf[utf8len];
450 for (unsigned x = 0; x < utf8len; x++) {
451 result.ptr[result.length++] = utf8buf[x];
452 }
453 } else {
454 // decoding failed, ignore the entire sequence
455 result.ptr[result.length++] = '\\';
456 }
457 } 469 }
458 } else { 470 } else {
459 // TODO: discuss the behavior for unrecognized escape sequences 471 // TODO: discuss the behavior for unrecognized escape sequences
460 // most parsers throw an error here - we just ignore it 472 // most parsers throw an error here - we just ignore it
461 result.ptr[result.length++] = '\\'; 473 result.ptr[result.length++] = '\\';

mercurial