Mon, 18 Nov 2024 22:05:42 +0100
make ucx C++ compatible again (and add tests for it) - fixes #486
/* * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. * * Copyright 2024 Mike Becker, Olaf Wintermann All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <string.h> #include <ctype.h> #include "cx/json.h" /* * RFC 8259 * https://tools.ietf.org/html/rfc8259 */ #define PARSER_READVALUE_ALLOC 32 static CxJsonValue cx_json_value_nothing = {.type = CX_JSON_NOTHING}; static int token_append(CxJsonToken *token, const char *buf, size_t len) { if (len == 0) { return 0; } size_t newlen = token->length + len; if (token->alloc < newlen) { char *newbuf = realloc( token->alloc == 0 ? NULL : (char *) token->content, newlen); if (!newbuf) { return 1; } token->content = newbuf; token->alloc = newlen; } memcpy((char *) token->content + token->length, buf, len); token->length = newlen; return 0; } static CxJsonToken get_content(CxJson *p, size_t start, size_t end) { CxJsonToken token = {0}; size_t part2 = end - start; if (p->uncompleted.tokentype == CX_JSON_NO_TOKEN) { token.content = p->buffer + start; token.length = part2; } else if (part2 == 0) { token = p->uncompleted; } else { if (token_append(&p->uncompleted, p->buffer + start, end - start)) { // TODO: this does certainly not lead to correct error handling return (CxJsonToken){0}; } token = p->uncompleted; } p->uncompleted = (CxJsonToken){0}; return token; } static int token_isliteral(const char *content, size_t length) { if (length == 4) { if (!memcmp(content, "true", 4)) { return 1; } else if (!memcmp(content, "null", 4)) { return 1; } } else if (length == 5 && !memcmp(content, "false", 5)) { return 1; } return 0; } static int num_isexp(const char *content, size_t length, size_t pos) { if (pos >= length) { return 0; } int ok = 0; for (size_t i = pos; i < length; i++) { char c = content[i]; if (isdigit(c)) { ok = 1; } else if (i == pos) { if (!(c == '+' || c == '-')) { return 0; } } else { return 0; } } return ok; } static CxJsonTokenType token_numbertype(const char *content, size_t length) { if (length == 0) return CX_JSON_TOKEN_ERROR; if (content[0] != '-' && !isdigit(content[0])) { return CX_JSON_TOKEN_ERROR; } CxJsonTokenType type = CX_JSON_TOKEN_INTEGER; for (size_t i = 1; i < length; i++) { if (content[i] == '.') { if (type == CX_JSON_TOKEN_NUMBER) { return CX_JSON_TOKEN_ERROR; // more than one decimal separator } type = CX_JSON_TOKEN_NUMBER; } else if (content[i] == 'e' || content[i] == 'E') { return num_isexp(content, length, i + 1) ? CX_JSON_TOKEN_NUMBER : CX_JSON_TOKEN_ERROR; } else if (!isdigit(content[i])) { return CX_JSON_TOKEN_ERROR; // char is not a digit, decimal separator or exponent sep } } return type; } static CxJsonToken get_token(CxJson *p, size_t start, size_t end) { CxJsonToken token = get_content(p, start, end); if (token_isliteral(token.content, token.length)) { token.tokentype = CX_JSON_TOKEN_LITERAL; } else { token.tokentype = token_numbertype(token.content, token.length); } p->pos = end; return token; } static CxJsonTokenType char2ttype(char c) { switch (c) { case '[': { return CX_JSON_TOKEN_BEGIN_ARRAY; } case '{': { return CX_JSON_TOKEN_BEGIN_OBJECT; } case ']': { return CX_JSON_TOKEN_END_ARRAY; } case '}': { return CX_JSON_TOKEN_END_OBJECT; } case ':': { return CX_JSON_TOKEN_NAME_SEPARATOR; } case ',': { return CX_JSON_TOKEN_VALUE_SEPARATOR; } case '"': { return CX_JSON_TOKEN_STRING; } default: { if (isspace(c)) { return CX_JSON_TOKEN_SPACE; } } } return CX_JSON_NO_TOKEN; } static CxJsonToken json_parser_next_token(CxJson *p) { // current token type and start index CxJsonTokenType ttype = p->uncompleted.tokentype; size_t token_start = p->pos; for (size_t i = p->pos; i < p->size; i++) { char c = p->buffer[i]; if (ttype != CX_JSON_TOKEN_STRING) { // currently non-string token CxJsonTokenType ctype = char2ttype(c); // start of new token? if (ttype == CX_JSON_NO_TOKEN) { if (ctype == CX_JSON_TOKEN_SPACE) { continue; } else if (ctype == CX_JSON_TOKEN_STRING) { // begin string ttype = CX_JSON_TOKEN_STRING; token_start = i; } else if (ctype != CX_JSON_NO_TOKEN) { // single-char token p->pos = i + 1; CxJsonToken token = {ctype, NULL, 0, 0}; return token; } else { ttype = CX_JSON_TOKEN_LITERAL; // number or literal token_start = i; } } else { // finish token if (ctype != CX_JSON_NO_TOKEN) { return get_token(p, token_start, i); } } } else { // currently inside a string if (!p->tokenizer_escape) { if (c == '"') { CxJsonToken ret = get_content(p, token_start, i + 1); ret.tokentype = CX_JSON_TOKEN_STRING; p->pos = i + 1; return ret; } else if (c == '\\') { p->tokenizer_escape = 1; } } else { p->tokenizer_escape = 0; } } } if (ttype != CX_JSON_NO_TOKEN) { // uncompleted token size_t uncompeted_len = p->size - token_start; if (p->uncompleted.tokentype == CX_JSON_NO_TOKEN) { // current token is uncompleted // save current token content in p->uncompleted CxJsonToken uncompleted; uncompleted.tokentype = ttype; uncompleted.length = uncompeted_len; uncompleted.alloc = uncompeted_len + 16; char *tmp = malloc(uncompleted.alloc); if (tmp) { memcpy(tmp, p->buffer + token_start, uncompeted_len); uncompleted.content = tmp; p->uncompleted = uncompleted; } else { p->error = 1; } } else { // previously we also had an uncompleted token // combine the uncompleted token with the current token if (token_append(&p->uncompleted, p->buffer + token_start, uncompeted_len)) { p->error = 1; } } } CxJsonToken ret = {CX_JSON_NO_TOKEN, NULL, 0, 0}; return ret; } static cxmutstr unescape_string(const char *str, size_t len) { // TODO: support more escape sequences // we know that the unescaped string will be shorter by at least 2 chars cxmutstr result; result.length = 0; result.ptr = malloc(len - 1); if (result.ptr == NULL) { // TODO: check if this actually leads to correct error handling return result; } bool u = false; for (size_t i = 1; i < len - 1; i++) { char c = str[i]; if (u) { u = false; if (c == 'n') { c = '\n'; } else if (c == 't') { c = '\t'; } result.ptr[result.length++] = c; } else { if (c == '\\') { u = true; } else { result.ptr[result.length++] = c; } } } result.ptr[result.length] = 0; return result; } static int parse_number(const char *str, size_t len, void *value, bool asint) { char *endptr = NULL; char buf[32]; if (len > 30) { return 1; } // TODO: if we can guarantee that we are working on a copied string already, we can avoid this memcpy memcpy(buf, str, len); buf[len] = 0; if (asint) { long long v = strtoll(buf, &endptr, 10); *((int64_t*)value) = (int64_t) v; } else { // TODO: proper JSON spec number parser double v = strtod(buf, &endptr); *((double*)value) = v; } return (endptr != &buf[len]); } static int add_state(CxJson *p, int state) { CxArrayReallocator alloc = cx_array_reallocator(NULL, p->states_internal); size_t size = p->nstates + 1; size_t capacity = p->states_alloc; // TODO: fix that nstates does not denote the size of the array // TODO: replace with a 16 bit (or maybe even 8 bit) version of cx_array_add() int result = cx_array_add( &p->states, &size, &capacity, sizeof(int), &state, &alloc ); if (result == 0) { p->nstates = size - 1; p->states_alloc = capacity; } return result; } static void end_elm(CxJson *p, CxJsonReaderType type) { p->reader_type = type; p->nstates--; } #define JP_STATE_VALUE_BEGIN 0 #define JP_STATE_VALUE_BEGIN_OBJ 1 #define JP_STATE_VALUE_BEGIN_AR 2 #define JP_STATE_ARRAY_SEP_OR_CLOSE 3 #define JP_STATE_OBJ_NAME_OR_CLOSE 4 #define JP_STATE_OBJ_NAME 5 #define JP_STATE_OBJ_COLON 6 #define JP_STATE_OBJ_SEP_OR_CLOSE 7 static int next_state_after_value(int current) { switch (current) { default: return -1; // after value JSON complete, expect nothing case JP_STATE_VALUE_BEGIN: return -1; // after obj value, expect ',' or '}' case JP_STATE_VALUE_BEGIN_OBJ: return JP_STATE_OBJ_SEP_OR_CLOSE; // after array value, expect ',' or ']' case JP_STATE_VALUE_BEGIN_AR: return JP_STATE_ARRAY_SEP_OR_CLOSE; } } static void clear_valuename(CxJson *p) { free(p->value_name); p->value_name = NULL; p->value_name_len = 0; } static void clear_values(CxJson *p) { free(p->value_str); p->value_str = NULL; p->value_str_len = 0; p->value_int = 0; p->value_double = 0; } static int json_read(CxJson *p) { int state = p->states[p->nstates]; clear_values(p); CxJsonToken token = json_parser_next_token(p); p->reader_token = token; p->value_ready = 0; if (token.tokentype == CX_JSON_NO_TOKEN) { return 0; } int ret = 1; // 0 JP_STATE_VALUE_BEGIN value begin // 1 JP_STATE_VALUE_BEGIN_OBJ value begin (inside object) // 2 JP_STATE_VALUE_BEGIN_AR value begin (inside array) // 3 JP_STATE_ARRAY_SEP_OR_CLOSE array, expect separator or arrayclose // 4 JP_STATE_OBJ_NAME_OR_CLOSE object, expect name or objclose // 5 JP_STATE_OBJ_NAME object, expect name // 6 JP_STATE_OBJ_COLON object, expect ':' // 7 JP_STATE_OBJ_SEP_OR_CLOSE object, expect separator, objclose if (state == JP_STATE_VALUE_BEGIN_AR || state == JP_STATE_OBJ_SEP_OR_CLOSE) { clear_valuename(p); } if (state < 3) { // expect value p->states[p->nstates] = next_state_after_value(state); p->value_ready = 1; switch (token.tokentype) { case CX_JSON_TOKEN_BEGIN_ARRAY: { p->reader_type = CX_JSON_READER_ARRAY_BEGIN; ret = add_state(p, JP_STATE_VALUE_BEGIN_AR) ? -1 : 1; break; } case CX_JSON_TOKEN_BEGIN_OBJECT: { p->reader_type = CX_JSON_READER_OBJECT_BEGIN; ret = add_state(p, JP_STATE_OBJ_NAME_OR_CLOSE) ? -1 : 1; break; } case CX_JSON_TOKEN_END_ARRAY: { p->value_ready = 0; end_elm(p, CX_JSON_READER_ARRAY_END); break; } case CX_JSON_TOKEN_STRING: { p->reader_type = CX_JSON_READER_STRING; cxmutstr str = unescape_string(token.content, token.length); if (str.ptr) { p->value_str = str.ptr; p->value_str_len = str.length; } else { ret = -1; } break; } case CX_JSON_TOKEN_INTEGER: { p->reader_type = CX_JSON_READER_INTEGER; if (parse_number(token.content, token.length, &p->value_int, true)) { ret = -1; } break; } case CX_JSON_TOKEN_NUMBER: { p->reader_type = CX_JSON_READER_NUMBER; if (parse_number(token.content, token.length, &p->value_double, false)) { ret = -1; } break; } case CX_JSON_TOKEN_LITERAL: { p->reader_type = CX_JSON_READER_LITERAL; break; } default: ret = -1; } } else if (state == JP_STATE_ARRAY_SEP_OR_CLOSE) { // expect ',' or ']' if (token.tokentype == CX_JSON_TOKEN_VALUE_SEPARATOR) { p->states[p->nstates] = JP_STATE_VALUE_BEGIN_AR; ret = json_read(p); } else if (token.tokentype == CX_JSON_TOKEN_END_ARRAY) { end_elm(p, CX_JSON_READER_ARRAY_END); } else { ret = -1; } } else if (state == JP_STATE_OBJ_NAME_OR_CLOSE || state == JP_STATE_OBJ_NAME) { if (state == JP_STATE_OBJ_NAME_OR_CLOSE && token.tokentype == CX_JSON_TOKEN_END_OBJECT) { clear_valuename(p); end_elm(p, CX_JSON_READER_OBJECT_END); } else { // expect string if (token.tokentype != CX_JSON_TOKEN_STRING) return -1; if (p->value_name) free(p->value_name); cxmutstr valname = unescape_string(token.content, token.length); p->value_name = valname.ptr; p->value_name_len = valname.length; // next state p->states[p->nstates] = JP_STATE_OBJ_COLON; ret = json_read(p); } } else if (state == JP_STATE_OBJ_COLON) { // expect ':' if (token.tokentype != CX_JSON_TOKEN_NAME_SEPARATOR) return -1; // next state p->states[p->nstates] = JP_STATE_VALUE_BEGIN_OBJ; ret = json_read(p); } else if (state == JP_STATE_OBJ_SEP_OR_CLOSE) { // expect ',' or '}' if (token.tokentype == CX_JSON_TOKEN_VALUE_SEPARATOR) { p->states[p->nstates] = JP_STATE_OBJ_NAME; ret = json_read(p); } else if (token.tokentype == CX_JSON_TOKEN_END_OBJECT) { end_elm(p, CX_JSON_READER_OBJECT_END); } else { ret = -1; } } if (token.alloc > 0) { free((char*)token.content); } return ret; } static CxJsonLiteral json_reader_literal(CxJson *p) { const char *l = p->reader_token.content; size_t token_len = p->reader_token.length; if (token_len == 4 && !memcmp(l, "true", 4)) { return CX_JSON_TRUE; } else if (token_len == 5 && !memcmp(l, "false", 5)) { return CX_JSON_FALSE; } return CX_JSON_NULL; } /* -------------------- read value functions -------------------- */ static int setup_read_value(CxJson *p) { p->readvalue_alloc = PARSER_READVALUE_ALLOC; p->readvalue_nelm = 0; p->readvalue_stack = calloc(p->readvalue_alloc, sizeof(CxJsonValue *)); if (!p->readvalue_stack) return -1; p->read_value = NULL; p->readvalue_stack[0] = NULL; return 0; } static int add_to_parent(CxJson *p, CxJsonValue *parent, CxJsonValue *v) { if (!parent) { return -1; // shouldn't happen but who knows } if (parent->type == CX_JSON_OBJECT) { if (!p->value_name || p->value_name_len == 0) { return -1; } char *valuename = p->value_name; p->value_name = NULL; CxJsonObjValue newvalue; newvalue.name = valuename; newvalue.value = v; return cx_array_simple_add(parent->value.object.values, newvalue); } else if (parent->type == CX_JSON_ARRAY) { return cx_array_simple_add(parent->value.array.array, v); } else { return -1; // should also never happen } } static int readvaluestack_add(CxJson *p, CxJsonValue *v) { if (p->readvalue_nelm == p->readvalue_alloc) { p->readvalue_alloc *= 2; if (cx_reallocate(&p->readvalue_stack, sizeof(CxJsonValue *) * p->readvalue_alloc)) { return -1; } } p->readvalue_stack[p->readvalue_nelm++] = v; return 0; } void cxJsonInit(CxJson *json) { memset(json, 0, sizeof(CxJson)); json->states = json->states_internal; json->states_alloc = cx_nmemb(json->states_internal); // TODO: find better way to configure the initial allocation size for arrays and objects json->reader_array_alloc = 8; } void cxJsonDestroy(CxJson *p) { if (p->states != p->states_internal) { free(p->states); } free(p->readvalue_stack); cxJsonValueFree(p->read_value); free(p->value_name); free(p->value_str); } int cxJsonFilln(CxJson *p, const char *buf, size_t size) { // TODO: implement rescue buffer like in CxProperties to allow subsequent fills p->buffer = buf; p->size = size; p->pos = 0; return 0; } int cxJsonNext(CxJson *p, CxJsonValue **value) { // TODO: replace int with a status enum like in CxProperties *value = NULL; // TODO: maybe better initialize with NOTHING? if (!p->readvalue_stack) { if (setup_read_value(p)) return -1; } while (p->readvalue_nelm > 0 || !p->read_value) { if (p->value_ready) { // value available without another read CxJsonValue *v = calloc(1, sizeof(CxJsonValue)); if (!v) return -1; if (p->readvalue_nelm > 0) { if (add_to_parent(p, p->readvalue_stack[p->readvalue_nelm - 1], v)) { free(v); return -1; } } else { // set this value as root p->read_value = v; } switch (p->reader_type) { case CX_JSON_READER_OBJECT_BEGIN: { v->type = CX_JSON_OBJECT; if (readvaluestack_add(p, v)) { return -1; } break; } case CX_JSON_READER_OBJECT_END: return -1; // should not happen case CX_JSON_READER_ARRAY_BEGIN: { v->type = CX_JSON_ARRAY; if (readvaluestack_add(p, v)) { return -1; } break; } case CX_JSON_READER_ARRAY_END: return -1; // should not happen case CX_JSON_READER_STRING: { v->type = CX_JSON_STRING; if (p->value_str) { v->value.string.ptr = p->value_str; v->value.string.length = p->value_str_len; p->value_str = NULL; } break; } case CX_JSON_READER_INTEGER: { v->type = CX_JSON_INTEGER; v->value.integer = p->value_int; break; } case CX_JSON_READER_NUMBER: { v->type = CX_JSON_NUMBER; v->value.number = p->value_double; break; } case CX_JSON_READER_LITERAL: { v->type = CX_JSON_LITERAL; v->value.literal = json_reader_literal(p); break; } } } else if (p->readvalue_initialized) { CxJsonReaderType rt = p->reader_type; if (rt == CX_JSON_READER_OBJECT_END || rt == CX_JSON_READER_ARRAY_END) { p->readvalue_nelm--; } // else: p->value_ready is 1, this will be handled in the next run } if (p->readvalue_nelm > 0 || !p->read_value) { int r = json_read(p); if (r != 1) { p->readvalue_initialized = 0; return r; } p->readvalue_initialized = 1; } } *value = p->read_value; p->readvalue_initialized = 0; p->read_value = NULL; p->value_ready = 0; return 1; } void cxJsonValueFree(CxJsonValue *value) { if (value == NULL || value == &cx_json_value_nothing) return; // TODO: discuss if we should keep freeing the stuff recursively switch (value->type) { case CX_JSON_OBJECT: { CxJsonObject obj = value->value.object; for (size_t i = 0; i < obj.values_size; i++) { cxJsonValueFree(obj.values[i].value); free(obj.values[i].name); } free(obj.values); break; } case CX_JSON_ARRAY: { CxJsonArray array = value->value.array; for (size_t i = 0; i < array.array_size; i++) { cxJsonValueFree(array.array[i]); } free(array.array); break; } case CX_JSON_STRING: { free(value->value.string.ptr); break; } default: { break; } } free(value); } CxJsonValue *cxJsonArrGet(CxJsonValue *value, size_t index) { if (index >= value->value.array.array_size) { return &cx_json_value_nothing; } return value->value.array.array[index]; } CxJsonValue *cxJsonObjGet(CxJsonValue *value, const char *name) { CxJsonObject *obj = &(value->value.object); // TODO: think about sorting the object so that we can use binary search here for (size_t i = 0; i < obj->values_size; i++) { // TODO: we might want to store names as cxmutstr if (0 == strcmp(name, obj->values[i].name)) { return obj->values[i].value; } } return &cx_json_value_nothing; }