src/json.c

branch
docs/3.1
changeset 1164
148b7c7ccaf9
parent 1160
4f02c1101f2e
equal deleted inserted replaced
1148:8ff82697f2c3 1164:148b7c7ccaf9
351 } 351 }
352 352
353 return CX_JSON_INCOMPLETE_DATA; 353 return CX_JSON_INCOMPLETE_DATA;
354 } 354 }
355 355
// Encodes a Unicode codepoint as utf8.
//
// Writes between one and four bytes to output_buf (no terminator is added)
// and returns the number of bytes written. Returns 0, without touching
// output_buf, when the codepoint is larger than 0x10FFFF.
static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) {
    unsigned nbytes;
    uint32_t lead_marker;

    if (codepoint < 0x80) {
        // plain ASCII, no marker bits required
        output_buf[0] = (char)codepoint;
        return 1;
    }

    if (codepoint < 0x800) {
        nbytes = 2;
        lead_marker = 0xC0;
    } else if (codepoint < 0x10000) {
        nbytes = 3;
        lead_marker = 0xE0;
    } else if (codepoint < 0x110000) {
        nbytes = 4;
        lead_marker = 0xF0;
    } else {
        return 0; // LCOV_EXCL_LINE
    }

    // fill the continuation bytes back to front, six payload bits each
    uint32_t remaining = codepoint;
    for (unsigned pos = nbytes - 1; pos > 0; pos--) {
        output_buf[pos] = (char)(0x80 | (remaining & 0x3F));
        remaining >>= 6;
    }

    // the bits that are left fit into the payload of the lead byte
    output_buf[0] = (char)(lead_marker | remaining);
    return nbytes;
}
380
// Combines a utf16 surrogate pair into the Unicode codepoint it encodes.
//
// c0 must be a high surrogate (0xD800-0xDBFF) and c1 a low surrogate
// (0xDC00-0xDFFF); the caller is responsible for checking those ranges.
// The result lies in the range 0x10000-0x10FFFF.
static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) {
    // widen explicitly: the promoted (signed) int may be too narrow for
    // the shifted value and must never be shifted while negative
    uint32_t high_bits = (uint32_t)c0 - 0xD800u;
    uint32_t low_bits = (uint32_t)c1 - 0xDC00u;
    return (high_bits << 10) + low_bits + 0x10000u;
}
385
386 static unsigned unescape_unicode_string(cxstring str, char *utf8buf) {
387 // str is supposed to start with "\uXXXX" or "\uXXXX\uXXXX"
388 // remaining bytes in the string are ignored (str may be larger!)
389
390 if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') {
391 return 0;
392 }
393
394 unsigned utf8len = 0;
395 cxstring ustr1 = { str.ptr + 2, 4};
396 uint16_t utf16a, utf16b;
397 if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) {
398 uint32_t codepoint;
399 if (utf16a < 0xD800 || utf16a > 0xE000) {
400 // character is in the Basic Multilingual Plane
401 // and encoded as a single utf16 char
402 codepoint = utf16a;
403 utf8len = codepoint_to_utf8(codepoint, utf8buf);
404 } else if (utf16a >= 0xD800 && utf16a <= 0xDBFF) {
405 // character is encoded as a surrogate pair
406 // get next 6 bytes
407 if (str.length >= 12) {
408 if (str.ptr[6] == '\\' && str.ptr[7] == 'u') {
409 cxstring ustr2 = { str.ptr+8, 4 };
410 if (!cx_strtou16_lc(ustr2, &utf16b, 16, "")
411 && utf16b >= 0xDC00 && utf16b <= 0xDFFF) {
412 codepoint = utf16pair_to_codepoint(utf16a, utf16b);
413 utf8len = codepoint_to_utf8(codepoint, utf8buf);
414 }
415 }
416 }
417 }
418 }
419 return utf8len;
420 }
421
356 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { 422 static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) {
357 // note: this function expects that str contains the enclosing quotes! 423 // note: this function expects that str contains the enclosing quotes!
358 424
359 cxmutstr result; 425 cxmutstr result;
360 result.length = 0; 426 result.length = 0;
366 char c = str.ptr[i]; 432 char c = str.ptr[i];
367 if (u) { 433 if (u) {
368 u = false; 434 u = false;
369 if (c == 'n') { 435 if (c == 'n') {
370 c = '\n'; 436 c = '\n';
437 } else if (c == '"') {
438 c = '"';
371 } else if (c == 't') { 439 } else if (c == 't') {
372 c = '\t'; 440 c = '\t';
373 } else if (c == 'r') { 441 } else if (c == 'r') {
374 c = '\r'; 442 c = '\r';
375 } else if (c == '\\') { 443 } else if (c == '\\') {
378 c = '/'; // always unescape, we don't need settings here 446 c = '/'; // always unescape, we don't need settings here
379 } else if (c == 'f') { 447 } else if (c == 'f') {
380 c = '\f'; 448 c = '\f';
381 } else if (c == 'b') { 449 } else if (c == 'b') {
382 c = '\b'; 450 c = '\b';
383 } 451 } else if (c == 'u') {
384 // TODO: support \uXXXX escape sequences 452 char utf8buf[4];
385 // TODO: discuss the behavior for unrecognized escape sequences 453 unsigned utf8len = unescape_unicode_string(
386 // most parsers throw an error here 454 cx_strn(str.ptr + i - 1, str.length + 1 - i),
455 utf8buf
456 );
457 if(utf8len > 0) {
458 i += utf8len < 4 ? 4 : 10;
459 // add all bytes from utf8buf except the last char
460 // to the result (last char will be added below)
461 utf8len--;
462 c = utf8buf[utf8len];
463 for (unsigned x = 0; x < utf8len; x++) {
464 result.ptr[result.length++] = utf8buf[x];
465 }
466 } else {
467 // decoding failed, ignore the entire sequence
468 result.ptr[result.length++] = '\\';
469 }
470 } else {
471 // TODO: discuss the behavior for unrecognized escape sequences
472 // most parsers throw an error here - we just ignore it
473 result.ptr[result.length++] = '\\';
474 }
475
387 result.ptr[result.length++] = c; 476 result.ptr[result.length++] = c;
388 } else { 477 } else {
389 if (c == '\\') { 478 if (c == '\\') {
390 u = true; 479 u = true;
391 } else { 480 } else {

mercurial