src/json.c

changeset 1156
96f16b5a0029
parent 1152
e4af44b488bc
child 1158
fa2811e9ab19
equal deleted inserted replaced
1155:b77d56a27e9c 1156:96f16b5a0029
351 } 351 }
352 352
353 return CX_JSON_INCOMPLETE_DATA; 353 return CX_JSON_INCOMPLETE_DATA;
354 } 354 }
355 355
356 // converts a unicode (up to U+FFFF) codepoint to utf8 356 // converts a Unicode codepoint to utf8
357 static int codepoint_to_utf8(uint32_t codepoint, char *output_buf) { 357 static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
358 if (codepoint <= 0x7F) { 358 if (codepoint <= 0x7F) {
359 *output_buf = (char)codepoint; 359 *output_buf = (char)codepoint;
360 return 1; 360 return 1;
361 } else if (codepoint <= 0x7FF) { 361 } else if (codepoint <= 0x7FF) {
362 output_buf[0] = (char)(0xC0 | ((codepoint >> 6) & 0x1F)); 362 output_buf[0] = (char)(0xC0 | ((codepoint >> 6) & 0x1F));
373 output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); 373 output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
374 output_buf[3] = (char)(0x80 | (codepoint & 0x3F)); 374 output_buf[3] = (char)(0x80 | (codepoint & 0x3F));
375 return 4; 375 return 4;
376 } 376 }
377 377
378 return 0; 378 return 0; // LCOV_EXCL_LINE
379 } 379 }
380 380
381 // converts a utf16 surrogate pair to a Unicode codepoint 381 // converts a utf16 surrogate pair to a Unicode codepoint
382 static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) { 382 static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
383 return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000; 383 return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000;
396 char c = str.ptr[i]; 396 char c = str.ptr[i];
397 if (u) { 397 if (u) {
398 u = false; 398 u = false;
399 if (c == 'n') { 399 if (c == 'n') {
400 c = '\n'; 400 c = '\n';
401 } else if (c == '"') {
402 c = '"';
401 } else if (c == 't') { 403 } else if (c == 't') {
402 c = '\t'; 404 c = '\t';
403 } else if (c == 'r') { 405 } else if (c == 'r') {
404 c = '\r'; 406 c = '\r';
405 } else if (c == '\\') { 407 } else if (c == '\\') {
409 } else if (c == 'f') { 411 } else if (c == 'f') {
410 c = '\f'; 412 c = '\f';
411 } else if (c == 'b') { 413 } else if (c == 'b') {
412 c = '\b'; 414 c = '\b';
413 } else if (c == 'u') { 415 } else if (c == 'u') {
414 if (i+4 < str.length - 1) { 416 if (i + 4 < str.length - 1) {
415 cxstring codepoint_str = { str.ptr + i + 1, 4}; 417 cxstring ustr1 = { str.ptr + i + 1, 4};
416 uint32_t codepoint; 418 uint16_t utf16a, utf16b;
417 if (!cx_strtou32_lc(codepoint_str, &codepoint, 16, "")) { 419 char utf8buf[4];
418 char utf8buf[4]; 420 unsigned utf8len = 0;
419 int utf8len = 0; 421 if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
420 if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { 422 uint32_t codepoint;
423 if (utf16a >= 0xD800 && utf16a <= 0xDFFF) {
421 // character is encoded as a surrogate pair 424 // character is encoded as a surrogate pair
422 // get next 6 bytes 425 // get next 6 bytes
423 if (i + 10 < str.length - 1) { 426 if (i + 10 < str.length - 1) {
424 char *surrogate2 = str.ptr+i+5; 427 if (*(str.ptr+i+5) == '\\' && *(str.ptr+i+6) == 'u') {
425 if (surrogate2[0] == '\\' && surrogate2[1] == 'u') { 428 cxstring ustr2 = { str.ptr+i+7, 4 };
426 cxstring c2_str = { surrogate2 + 2, 4 }; 429 if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
427 uint32_t c2; 430 && utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
428 if (!cx_strtou32_lc(c2_str, &c2, 16, "")) { 431 codepoint = utf16pair_to_codepoint(utf16a, utf16b);
429 codepoint = utf16pair_to_codepoint((uint16_t)codepoint, c2);
430 utf8len = codepoint_to_utf8(codepoint, utf8buf); 432 utf8len = codepoint_to_utf8(codepoint, utf8buf);
431 i += 6; 433 i += 10;
432 } 434 }
433 } 435 }
434 } 436 }
435 } else { 437 } else {
436 // character is in the Basic Multilingual Plane 438 // character is in the Basic Multilingual Plane
437 // and encoded as a single utf16 char 439 // and encoded as a single utf16 char
440 codepoint = utf16a;
438 utf8len = codepoint_to_utf8(codepoint, utf8buf); 441 utf8len = codepoint_to_utf8(codepoint, utf8buf);
442 i += 4;
439 } 443 }
440 if(utf8len > 0) { 444 }
441 // add all bytes from utf8buf expect the last char 445 if(utf8len > 0) {
442 // to the result 446 // add all bytes from utf8buf except the last char
443 utf8len--; 447 // to the result (last char will be added below)
444 c = utf8buf[utf8len]; 448 utf8len--;
445 for(int x=0;x<utf8len;x++) { 449 c = utf8buf[utf8len];
446 result.ptr[result.length++] = utf8buf[x]; 450 for (unsigned x = 0; x < utf8len; x++) {
447 } 451 result.ptr[result.length++] = utf8buf[x];
448 } 452 }
449 i += 4; 453 } else {
454 // decoding failed, ignore the entire sequence
455 result.ptr[result.length++] = '\\';
450 } 456 }
451 } 457 }
452 } 458 } else {
453 459 // TODO: discuss the behavior for unrecognized escape sequences
454 // TODO: discuss the behavior for unrecognized escape sequences 460 // most parsers throw an error here - we just ignore it
455 // most parsers throw an error here 461 result.ptr[result.length++] = '\\';
462 }
463
456 result.ptr[result.length++] = c; 464 result.ptr[result.length++] = c;
457 } else { 465 } else {
458 if (c == '\\') { 466 if (c == '\\') {
459 u = true; 467 u = true;
460 } else { 468 } else {

mercurial