# HG changeset patch
# User Mike Becker
# Date 1675365934 -3600
# Node ID ec50abb285ad70b9742fc1754699a8ee0c934e98
# Parent fcaa0891ef28e4f5f05c477d7fc002fb9815ae80
add strtok API - fixes #220

diff -r fcaa0891ef28 -r ec50abb285ad src/cx/string.h
--- a/src/cx/string.h	Wed Feb 01 18:07:16 2023 +0100
+++ b/src/cx/string.h	Thu Feb 02 20:25:34 2023 +0100
@@ -79,6 +79,57 @@
 typedef struct cx_string_s cxstring;
 
 /**
+ * Context for string tokenizing.
+ */
+struct cx_strtok_ctx_s {
+    /**
+     * The string to tokenize.
+     */
+    cxstring str;
+    /**
+     * The primary delimiter.
+     */
+    cxstring delim;
+    /**
+     * Optional array of more delimiters.
+     */
+    cxstring const *delim_more;
+    /**
+     * Length of the array containing more delimiters.
+     */
+    size_t delim_more_count;
+    /**
+     * Position of the currently active token in the source string.
+     */
+    size_t pos;
+    /**
+     * Position of next delimiter in the source string.
+     *
+     * If the tokenizer has not yet returned a token, the content of this field
+     * is undefined. If the tokenizer reached the end of the string, this field
+     * contains the length of the source string.
+     */
+    size_t delim_pos;
+    /**
+     * The position of the next token in the source string.
+     */
+    size_t next_pos;
+    /**
+     * The number of already found tokens.
+     */
+    size_t found;
+    /**
+     * The maximum number of tokens that shall be returned.
+     */
+    size_t limit;
+};
+
+/**
+ * A string tokenizing context.
+ */
+typedef struct cx_strtok_ctx_s CxStrtokCtx;
+
+/**
  * A literal initializer for an UCX string structure.
  *
  * The argument MUST be a string (const char*) \em literal.
@@ -828,6 +879,85 @@
 #define cx_strreplace(str, pattern, replacement) \
 cx_strreplacen_a(cxDefaultAllocator, str, pattern, replacement, SIZE_MAX)
 
+/**
+ * Creates a string tokenization context.
+ *
+ * @param str the string to tokenize
+ * @param delim the delimiter (must not be empty)
+ * @param limit the maximum number of tokens that shall be returned
+ * @return a new string tokenization context
+ */
+__attribute__((__warn_unused_result__))
+CxStrtokCtx cx_strtok(
+        cxstring str,
+        cxstring delim,
+        size_t limit
+);
+
+/**
+ * Creates a string tokenization context for a mutable string.
+ *
+ * @param str the string to tokenize
+ * @param delim the delimiter (must not be empty)
+ * @param limit the maximum number of tokens that shall be returned
+ * @return a new string tokenization context
+ */
+__attribute__((__warn_unused_result__))
+CxStrtokCtx cx_strtok_m(
+        cxmutstr str,
+        cxstring delim,
+        size_t limit
+);
+
+/**
+ * Returns the next token.
+ *
+ * The token will point to the source string.
+ *
+ * @param ctx the tokenization context
+ * @param token a pointer to memory where the next token shall be stored
+ * @return true if successful, false if the limit or the end of the string
+ * has been reached
+ */
+__attribute__((__warn_unused_result__, __nonnull__))
+bool cx_strtok_next(
+        CxStrtokCtx *ctx,
+        cxstring *token
+);
+
+/**
+ * Returns the next token of a mutable string.
+ *
+ * The token will point to the source string.
+ * If the context was not initialized over a mutable string, modifying
+ * the data of the returned token is undefined behavior.
+ *
+ * @param ctx the tokenization context
+ * @param token a pointer to memory where the next token shall be stored
+ * @return true if successful, false if the limit or the end of the string
+ * has been reached
+ */
+__attribute__((__warn_unused_result__, __nonnull__))
+bool cx_strtok_next_m(
+        CxStrtokCtx *ctx,
+        cxmutstr *token
+);
+
+/**
+ * Defines an array of more delimiters for the specified tokenization context.
+ *
+ * @param ctx the tokenization context
+ * @param delim array of more delimiters
+ * @param count number of elements in the array
+ */
+__attribute__((__nonnull__))
+void cx_strtok_delim(
+        CxStrtokCtx *ctx,
+        cxstring const *delim,
+        size_t count
+);
+
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff -r fcaa0891ef28 -r ec50abb285ad src/string.c
--- a/src/string.c	Wed Feb 01 18:07:16 2023 +0100
+++ b/src/string.c	Thu Feb 02 20:25:34 2023 +0100
@@ -674,4 +674,96 @@
     return result;
 }
 
+CxStrtokCtx cx_strtok(
+        cxstring str,
+        cxstring delim,
+        size_t limit
+) {
+    CxStrtokCtx ctx;
+    ctx.str = str;
+    ctx.delim = delim;
+    ctx.limit = limit;
+    ctx.pos = 0;
+    ctx.next_pos = 0;
+    ctx.delim_pos = 0;
+    ctx.found = 0;
+    ctx.delim_more = NULL;
+    ctx.delim_more_count = 0;
+    return ctx;
+}
 
+CxStrtokCtx cx_strtok_m(
+        cxmutstr str,
+        cxstring delim,
+        size_t limit
+) {
+    return cx_strtok(cx_strcast(str), delim, limit);
+}
+
+bool cx_strtok_next(
+        CxStrtokCtx *ctx,
+        cxstring *token
+) {
+    // abort criteria
+    if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) {
+        return false;
+    }
+
+    // determine the search start
+    cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos);
+
+    // search the next delimiter
+    cxstring delim = cx_strstr(haystack, ctx->delim);
+
+    // if found, make delim capture exactly the delimiter
+    if (delim.length > 0) {
+        delim.length = ctx->delim.length;
+    }
+
+    // if more delimiters are specified, check them now
+    if (ctx->delim_more_count > 0) {
+        cx_for_n(i, ctx->delim_more_count) {
+            cxstring d = cx_strstr(haystack, ctx->delim_more[i]);
+            if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) {
+                delim.ptr = d.ptr;
+                delim.length = ctx->delim_more[i].length;
+            }
+        }
+    }
+
+    // store the token information and adjust the context
+    ctx->found++;
+    ctx->pos = ctx->next_pos;
+    token->ptr = &ctx->str.ptr[ctx->pos];
+    ctx->delim_pos = delim.length == 0 ?
+                     ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr);
+    token->length = ctx->delim_pos - ctx->pos;
+    ctx->next_pos = ctx->delim_pos + delim.length;
+
+    return true;
+}
+
+bool cx_strtok_next_m(
+        CxStrtokCtx *ctx,
+        cxmutstr *token
+) {
+    // do not alias the cxmutstr as a cxstring: these are distinct struct
+    // types, so fetch the token into a real cxstring and copy it over
+    cxstring tok;
+    if (!cx_strtok_next(ctx, &tok)) {
+        // like the immutable variant, leave *token untouched on failure
+        return false;
+    }
+    token->ptr = (char *) tok.ptr;
+    token->length = tok.length;
+    return true;
+}
+
+void cx_strtok_delim(
+        CxStrtokCtx *ctx,
+        cxstring const *delim,
+        size_t count
+) {
+    ctx->delim_more = delim;
+    ctx->delim_more_count = count;
+}
diff -r fcaa0891ef28 -r ec50abb285ad test/test_string.cpp
--- a/test/test_string.cpp	Wed Feb 01 18:07:16 2023 +0100
+++ b/test/test_string.cpp	Thu Feb 02 20:25:34 2023 +0100
@@ -653,3 +653,213 @@
     EXPECT_STREQ(str.ptr, "this 1s @ te$t");
     cx_strfree(&str);
 }
+
+TEST(String, strtok) {
+    cxstring str = cx_str("a,comma,separated,string");
+    cxstring delim = cx_str(",");
+    CxStrtokCtx ctx = cx_strtok(str, delim, 3);
+    EXPECT_EQ(ctx.str.ptr, str.ptr);
+    EXPECT_EQ(ctx.str.length, str.length);
+    EXPECT_EQ(ctx.delim.ptr, delim.ptr);
+    EXPECT_EQ(ctx.delim.length, delim.length);
+    EXPECT_EQ(ctx.limit, 3);
+    EXPECT_EQ(ctx.found, 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 0);
+    EXPECT_EQ(ctx.delim_more, nullptr);
+    EXPECT_EQ(ctx.delim_more_count, 0);
+}
+
+TEST(String, strtok_m) {
+    cxmutstr str = cx_strdup(cx_str("a,comma,separated,string"));
+    cxstring delim = cx_str(",");
+    CxStrtokCtx ctx = cx_strtok_m(str, delim, 3);
+    EXPECT_EQ(ctx.str.ptr, str.ptr);
+    EXPECT_EQ(ctx.str.length, str.length);
+    EXPECT_EQ(ctx.delim.ptr, delim.ptr);
+    EXPECT_EQ(ctx.delim.length, delim.length);
+    EXPECT_EQ(ctx.limit, 3);
+    EXPECT_EQ(ctx.found, 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 0);
+    EXPECT_EQ(ctx.delim_more, nullptr);
+    EXPECT_EQ(ctx.delim_more_count, 0);
+    cx_strfree(&str);
+}
+
+TEST(String, strtok_delim) {
+    cxstring str = cx_str("an,arbitrarily|separated;string");
+    cxstring delim = cx_str(",");
+    cxstring delim_more[2] = {CX_STR("|"), CX_STR(";")};
+    CxStrtokCtx ctx = cx_strtok(str, delim, 3);
+    cx_strtok_delim(&ctx, delim_more, 2);
+    EXPECT_EQ(ctx.str.ptr, str.ptr);
+    EXPECT_EQ(ctx.str.length, str.length);
+    EXPECT_EQ(ctx.delim.ptr, delim.ptr);
+    EXPECT_EQ(ctx.delim.length, delim.length);
+    EXPECT_EQ(ctx.limit, 3);
+    EXPECT_EQ(ctx.found, 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 0);
+    EXPECT_EQ(ctx.delim_more, delim_more);
+    EXPECT_EQ(ctx.delim_more_count, 2);
+}
+
+TEST(String, strtok_next_easy) {
+    cxstring str = cx_str("a,comma,separated,string");
+    cxstring delim = cx_str(",");
+    CxStrtokCtx ctx = cx_strtok(str, delim, 3);
+    bool ret;
+    cxstring tok;
+
+    ret = cx_strtok_next(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("a")), 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 2);
+    EXPECT_EQ(ctx.delim_pos, 1);
+    EXPECT_EQ(ctx.found, 1);
+
+    ret = cx_strtok_next(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("comma")), 0);
+    EXPECT_EQ(ctx.pos, 2);
+    EXPECT_EQ(ctx.next_pos, 8);
+    EXPECT_EQ(ctx.delim_pos, 7);
+    EXPECT_EQ(ctx.found, 2);
+
+    ret = cx_strtok_next(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("separated")), 0);
+    EXPECT_EQ(ctx.pos, 8);
+    EXPECT_EQ(ctx.next_pos, 18);
+    EXPECT_EQ(ctx.delim_pos, 17);
+    EXPECT_EQ(ctx.found, 3);
+
+    ret = cx_strtok_next(&ctx, &tok);
+    ASSERT_FALSE(ret);
+    EXPECT_EQ(ctx.pos, 8);
+    EXPECT_EQ(ctx.next_pos, 18);
+    EXPECT_EQ(ctx.delim_pos, 17);
+    EXPECT_EQ(ctx.found, 3);
+}
+
+TEST(String, strtok_next_unlimited) {
+    cxstring str = cx_str("some;-;otherwise;-;separated;-;string;-;");
+    cxstring delim = cx_str(";-;");
+    CxStrtokCtx ctx = cx_strtok(str, delim, SIZE_MAX);
+    bool ret;
+    cxstring tok;
+
+    ret = cx_strtok_next(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("some")), 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 7);
+    EXPECT_EQ(ctx.delim_pos, 4);
+    EXPECT_EQ(ctx.found, 1);
+
+    ret = cx_strtok_next(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("otherwise")), 0);
+    EXPECT_EQ(ctx.pos, 7);
+    EXPECT_EQ(ctx.next_pos, 19);
+    EXPECT_EQ(ctx.delim_pos, 16);
+    EXPECT_EQ(ctx.found, 2);
+
+    ret = cx_strtok_next(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("separated")), 0);
+    EXPECT_EQ(ctx.pos, 19);
+    EXPECT_EQ(ctx.next_pos, 31);
+    EXPECT_EQ(ctx.delim_pos, 28);
+    EXPECT_EQ(ctx.found, 3);
+
+    ret = cx_strtok_next(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("string")), 0);
+    EXPECT_EQ(ctx.pos, 31);
+    EXPECT_EQ(ctx.next_pos, 40);
+    EXPECT_EQ(ctx.delim_pos, 37);
+    EXPECT_EQ(ctx.found, 4);
+
+    ret = cx_strtok_next(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("")), 0);
+    EXPECT_EQ(ctx.pos, 40);
+    EXPECT_EQ(ctx.next_pos, 40);
+    EXPECT_EQ(ctx.delim_pos, 40);
+    EXPECT_EQ(ctx.found, 5);
+
+    ret = cx_strtok_next(&ctx, &tok);
+    ASSERT_FALSE(ret);
+    EXPECT_EQ(ctx.pos, 40);
+    EXPECT_EQ(ctx.delim_pos, 40);
+    EXPECT_EQ(ctx.found, 5);
+}
+
+TEST(String, strtok_next_advanced) {
+    cxmutstr str = cx_strdup(cx_str("an,arbitrarily;||separated;string"));
+    cxstring delim = cx_str(",");
+    cxstring delim_more[2] = {CX_STR("||"), CX_STR(";")};
+    CxStrtokCtx ctx = cx_strtok_m(str, delim, 10);
+    cx_strtok_delim(&ctx, delim_more, 2);
+    bool ret;
+    cxmutstr tok;
+
+    ret = cx_strtok_next_m(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("an")), 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 3);
+    EXPECT_EQ(ctx.delim_pos, 2);
+    EXPECT_EQ(ctx.found, 1);
+    cx_strupper(tok);
+
+    ret = cx_strtok_next_m(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("arbitrarily")), 0);
+    EXPECT_EQ(ctx.pos, 3);
+    EXPECT_EQ(ctx.next_pos, 15);
+    EXPECT_EQ(ctx.delim_pos, 14);
+    EXPECT_EQ(ctx.found, 2);
+    cx_strupper(tok);
+
+    ret = cx_strtok_next_m(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("")), 0);
+    EXPECT_EQ(ctx.pos, 15);
+    EXPECT_EQ(ctx.next_pos, 17);
+    EXPECT_EQ(ctx.delim_pos, 15);
+    EXPECT_EQ(ctx.found, 3);
+    cx_strupper(tok);
+
+    ret = cx_strtok_next_m(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("separated")), 0);
+    EXPECT_EQ(ctx.pos, 17);
+    EXPECT_EQ(ctx.next_pos, 27);
+    EXPECT_EQ(ctx.delim_pos, 26);
+    EXPECT_EQ(ctx.found, 4);
+    cx_strupper(tok);
+
+    ret = cx_strtok_next_m(&ctx, &tok);
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("string")), 0);
+    EXPECT_EQ(ctx.pos, 27);
+    EXPECT_EQ(ctx.next_pos, 33);
+    EXPECT_EQ(ctx.delim_pos, 33);
+    EXPECT_EQ(ctx.found, 5);
+    cx_strupper(tok);
+
+    ret = cx_strtok_next_m(&ctx, &tok);
+    ASSERT_FALSE(ret);
+    EXPECT_EQ(ctx.pos, 27);
+    EXPECT_EQ(ctx.next_pos, 33);
+    EXPECT_EQ(ctx.delim_pos, 33);
+    EXPECT_EQ(ctx.found, 5);
+
+    EXPECT_EQ(cx_strcmp(cx_strcast(str), cx_str("AN,ARBITRARILY;||SEPARATED;STRING")), 0);
+
+    cx_strfree(&str);
+}