add strtok API - fixes #220

24 months ago

author
Mike Becker <universe@uap-core.de>
date
Thu, 02 Feb 2023 20:25:34 +0100 (24 months ago)
changeset 645
ec50abb285ad
parent 644
fcaa0891ef28
child 646
dfd0403ff8b6
child 648
4e115b610b37

add strtok API - fixes #220

src/cx/string.h file | annotate | diff | comparison | revisions
src/string.c file | annotate | diff | comparison | revisions
test/test_string.cpp file | annotate | diff | comparison | revisions
--- a/src/cx/string.h	Wed Feb 01 18:07:16 2023 +0100
+++ b/src/cx/string.h	Thu Feb 02 20:25:34 2023 +0100
@@ -79,6 +79,57 @@
 typedef struct cx_string_s cxstring;
 
 /**
+ * Context for string tokenizing.
+ */
+struct cx_strtok_ctx_s {
+    /**
+     * The source string that is being tokenized.
+     */
+    cxstring str;
+    /**
+     * The primary delimiter (specified when the context is created).
+     */
+    cxstring delim;
+    /**
+     * Optional array of additional delimiters (NULL if none are set).
+     */
+    cxstring const *delim_more;
+    /**
+     * Number of elements in the array of additional delimiters.
+     */
+    size_t delim_more_count;
+    /**
+     * Start offset of the most recently returned token in the source string.
+     */
+    size_t pos;
+    /**
+     * Offset of the delimiter that terminated the most recent token.
+     *
+     * cx_strtok() initializes this field to zero. When no further delimiter
+     * is found, the field is set to the length of the source string and
+     * cx_strtok_next() returns false on every subsequent call.
+     */
+    size_t delim_pos;
+    /**
+     * Offset in the source string at which the next token starts.
+     */
+    size_t next_pos;
+    /**
+     * The number of tokens that have been returned so far.
+     */
+    size_t found;
+    /**
+     * The maximum number of tokens that shall be returned.
+     */
+    size_t limit;
+};
+
+/**
+ * Tokenization context type, created by cx_strtok() or cx_strtok_m().
+ */
+typedef struct cx_strtok_ctx_s CxStrtokCtx;
+
+/**
  * A literal initializer for an UCX string structure.
  *
  * The argument MUST be a string (const char*) \em literal.
@@ -828,6 +879,85 @@
 #define cx_strreplace(str, pattern, replacement) \
 cx_strreplacen_a(cxDefaultAllocator, str, pattern, replacement, SIZE_MAX)
 
+/**
+ * Creates a string tokenization context.
+ *
+ * @param str the string to tokenize (not copied, tokens will point into it)
+ * @param delim the primary delimiter (must not be empty)
+ * @param limit the maximum number of tokens that shall be returned
+ * @return a new tokenization context that starts at the beginning of \p str
+ */
+__attribute__((__warn_unused_result__))
+CxStrtokCtx cx_strtok(
+        cxstring str,
+        cxstring delim,
+        size_t limit
+);
+
+/**
+ * Creates a string tokenization context for a mutable string.
+ *
+ * @param str the string to tokenize (not copied, tokens will point into it)
+ * @param delim the primary delimiter (must not be empty)
+ * @param limit the maximum number of tokens that shall be returned
+ * @return a new tokenization context that starts at the beginning of \p str
+ */
+__attribute__((__warn_unused_result__))
+CxStrtokCtx cx_strtok_m(
+        cxmutstr str,
+        cxstring delim,
+        size_t limit
+);
+
+/**
+ * Returns the next token and advances the tokenization context.
+ *
+ * The token points into the source string, the token data is not copied.
+ *
+ * @param ctx the tokenization context
+ * @param token receives the next token (left untouched if false is returned)
+ * @return true if a token was stored, false if the limit or the end of the
+ * string has been reached
+ */
+__attribute__((__warn_unused_result__, __nonnull__))
+bool cx_strtok_next(
+        CxStrtokCtx *ctx,
+        cxstring *token
+);
+
+/**
+ * Returns the next token of a mutable string and advances the context.
+ *
+ * The token points into the source string, the token data is not copied.
+ * If the context was not initialized over a mutable string, modifying
+ * the data of the returned token is undefined behavior.
+ *
+ * @param ctx the tokenization context
+ * @param token receives the next token (left untouched if false is returned)
+ * @return true if a token was stored, false if the limit or the end of the
+ * string has been reached
+ */
+__attribute__((__warn_unused_result__, __nonnull__))
+bool cx_strtok_next_m(
+        CxStrtokCtx *ctx,
+        cxmutstr *token
+);
+
+/**
+ * Sets the array of additional delimiters for the tokenization context.
+ *
+ * @param ctx the tokenization context
+ * @param delim array of additional delimiters (stored by pointer, not copied)
+ * @param count number of elements in the array
+ */
+__attribute__((__nonnull__))
+void cx_strtok_delim(
+        CxStrtokCtx *ctx,
+        cxstring const *delim,
+        size_t count
+);
+
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
--- a/src/string.c	Wed Feb 01 18:07:16 2023 +0100
+++ b/src/string.c	Thu Feb 02 20:25:34 2023 +0100
@@ -674,4 +674,87 @@
     return result;
 }
 
+CxStrtokCtx cx_strtok(
+        cxstring str,
+        cxstring delim,
+        size_t limit
+) {
+    return (CxStrtokCtx) {
+            .str = str,
+            .delim = delim,
+            .delim_more = NULL, // no additional delimiters until requested
+            .delim_more_count = 0,
+            .pos = 0, // tokenizing starts at the beginning of str
+            .delim_pos = 0,
+            .next_pos = 0,
+            .found = 0,
+            .limit = limit
+    };
+}
 
+CxStrtokCtx cx_strtok_m(
+        cxmutstr str,
+        cxstring delim,
+        size_t limit
+) { // thin wrapper: the context stores a cxstring, so convert and delegate
+    return cx_strtok(cx_strcast(str), delim, limit);
+}
+
+bool cx_strtok_next(
+        CxStrtokCtx *ctx,
+        cxstring *token
+) {
+    // stop when the limit is reached or the source string is exhausted
+    if (ctx->found >= ctx->limit) return false;
+    if (ctx->delim_pos >= ctx->str.length) return false;
+
+    // only the part after the previous delimiter is searched
+    cxstring const rest = cx_strsubs(ctx->str, ctx->next_pos);
+
+    // look for the primary delimiter; a zero length means "not found"
+    cxstring hit = cx_strstr(rest, ctx->delim);
+    if (hit.length > 0) {
+        hit.length = ctx->delim.length;
+    }
+
+    // an additional delimiter replaces the hit only if it occurs earlier
+    for (size_t i = 0; i < ctx->delim_more_count; i++) {
+        cxstring const cand = cx_strstr(rest, ctx->delim_more[i]);
+        if (cand.length == 0) continue;
+        if (hit.length > 0 && cand.ptr >= hit.ptr) continue;
+        hit.ptr = cand.ptr;
+        hit.length = ctx->delim_more[i].length;
+    }
+
+    // the token ends at the delimiter or, without one, at the end of str
+    size_t const start = ctx->next_pos;
+    size_t end = ctx->str.length;
+    if (hit.length > 0) {
+        end = (size_t) (hit.ptr - ctx->str.ptr);
+    }
+    token->ptr = ctx->str.ptr + start;
+    token->length = end - start;
+
+    // store the progress in the context
+    ctx->found++;
+    ctx->pos = start;
+    ctx->delim_pos = end;
+    ctx->next_pos = end + hit.length;
+    return true;
+}
+
+bool cx_strtok_next_m(
+        CxStrtokCtx *ctx,
+        cxmutstr *token
+) { // thin wrapper: cx_strtok_next() fills in ptr and length of the token
+    return cx_strtok_next(ctx, (cxstring *) token); // NOTE(review): assumes cxmutstr is layout-compatible with cxstring - confirm
+}
+
+void cx_strtok_delim(
+        CxStrtokCtx *ctx,
+        cxstring const *delim,
+        size_t count
+) {
+    ctx->delim_more = delim; // the array is referenced, not copied
+    ctx->delim_more_count = count; // replaces any previously set array
+}
--- a/test/test_string.cpp	Wed Feb 01 18:07:16 2023 +0100
+++ b/test/test_string.cpp	Thu Feb 02 20:25:34 2023 +0100
@@ -653,3 +653,213 @@
     EXPECT_STREQ(str.ptr, "this 1s @ te$t");
     cx_strfree(&str);
 }
+
+TEST(String, strtok) { // a new context mirrors its arguments and starts at offset zero
+    cxstring str = cx_str("a,comma,separated,string");
+    cxstring delim = cx_str(",");
+    CxStrtokCtx ctx = cx_strtok(str, delim, 3);
+    EXPECT_EQ(ctx.str.ptr, str.ptr); // the source string is referenced, not copied
+    EXPECT_EQ(ctx.str.length, str.length);
+    EXPECT_EQ(ctx.delim.ptr, delim.ptr);
+    EXPECT_EQ(ctx.delim.length, delim.length);
+    EXPECT_EQ(ctx.limit, 3);
+    EXPECT_EQ(ctx.found, 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 0);
+    EXPECT_EQ(ctx.delim_more, nullptr); // no additional delimiters by default
+    EXPECT_EQ(ctx.delim_more_count, 0);
+}
+
+TEST(String, strtok_m) { // same expectations as String.strtok, but for a mutable source
+    cxmutstr str = cx_strdup(cx_str("a,comma,separated,string"));
+    cxstring delim = cx_str(",");
+    CxStrtokCtx ctx = cx_strtok_m(str, delim, 3);
+    EXPECT_EQ(ctx.str.ptr, str.ptr); // the context refers to the same buffer as str
+    EXPECT_EQ(ctx.str.length, str.length);
+    EXPECT_EQ(ctx.delim.ptr, delim.ptr);
+    EXPECT_EQ(ctx.delim.length, delim.length);
+    EXPECT_EQ(ctx.limit, 3);
+    EXPECT_EQ(ctx.found, 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 0);
+    EXPECT_EQ(ctx.delim_more, nullptr);
+    EXPECT_EQ(ctx.delim_more_count, 0);
+    cx_strfree(&str); // str was obtained from cx_strdup() and is released here
+}
+
+TEST(String, strtok_delim) { // setting more delimiters must not touch the other fields
+    cxstring str = cx_str("an,arbitrarily|separated;string");
+    cxstring delim = cx_str(",");
+    cxstring delim_more[2] = {CX_STR("|"), CX_STR(";")};
+    CxStrtokCtx ctx = cx_strtok(str, delim, 3);
+    cx_strtok_delim(&ctx, delim_more, 2);
+    EXPECT_EQ(ctx.str.ptr, str.ptr);
+    EXPECT_EQ(ctx.str.length, str.length);
+    EXPECT_EQ(ctx.delim.ptr, delim.ptr); // the primary delimiter is kept
+    EXPECT_EQ(ctx.delim.length, delim.length);
+    EXPECT_EQ(ctx.limit, 3);
+    EXPECT_EQ(ctx.found, 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 0);
+    EXPECT_EQ(ctx.delim_more, delim_more); // the array is stored by pointer
+    EXPECT_EQ(ctx.delim_more_count, 2);
+}
+
+TEST(String, strtok_next_easy) { // single-character delimiter, limit of three tokens
+    cxstring str = cx_str("a,comma,separated,string");
+    cxstring delim = cx_str(",");
+    CxStrtokCtx ctx = cx_strtok(str, delim, 3);
+    bool ret;
+    cxstring tok;
+
+    ret = cx_strtok_next(&ctx, &tok); // 1st token "a", delimiter at offset 1
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("a")), 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 2);
+    EXPECT_EQ(ctx.delim_pos, 1);
+    EXPECT_EQ(ctx.found, 1);
+
+    ret = cx_strtok_next(&ctx, &tok); // 2nd token "comma", delimiter at offset 7
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("comma")), 0);
+    EXPECT_EQ(ctx.pos, 2);
+    EXPECT_EQ(ctx.next_pos, 8);
+    EXPECT_EQ(ctx.delim_pos, 7);
+    EXPECT_EQ(ctx.found, 2);
+
+    ret = cx_strtok_next(&ctx, &tok); // 3rd token "separated", delimiter at offset 17
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("separated")), 0);
+    EXPECT_EQ(ctx.pos, 8);
+    EXPECT_EQ(ctx.next_pos, 18);
+    EXPECT_EQ(ctx.delim_pos, 17);
+    EXPECT_EQ(ctx.found, 3);
+
+    ret = cx_strtok_next(&ctx, &tok); // limit reached: "string" is never returned
+    ASSERT_FALSE(ret);
+    EXPECT_EQ(ctx.pos, 8); // the context is unchanged by the failed call
+    EXPECT_EQ(ctx.next_pos, 18);
+    EXPECT_EQ(ctx.delim_pos, 17);
+    EXPECT_EQ(ctx.found, 3);
+}
+
+TEST(String, strtok_next_unlimited) { // three-character delimiter, source ends with a delimiter
+    cxstring str = cx_str("some;-;otherwise;-;separated;-;string;-;");
+    cxstring delim = cx_str(";-;");
+    CxStrtokCtx ctx = cx_strtok(str, delim, SIZE_MAX);
+    bool ret;
+    cxstring tok;
+
+    ret = cx_strtok_next(&ctx, &tok); // 1st token "some", delimiter at offsets 4-6
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("some")), 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 7);
+    EXPECT_EQ(ctx.delim_pos, 4);
+    EXPECT_EQ(ctx.found, 1);
+
+    ret = cx_strtok_next(&ctx, &tok); // 2nd token "otherwise", delimiter at offsets 16-18
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("otherwise")), 0);
+    EXPECT_EQ(ctx.pos, 7);
+    EXPECT_EQ(ctx.next_pos, 19);
+    EXPECT_EQ(ctx.delim_pos, 16);
+    EXPECT_EQ(ctx.found, 2);
+
+    ret = cx_strtok_next(&ctx, &tok); // 3rd token "separated", delimiter at offsets 28-30
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("separated")), 0);
+    EXPECT_EQ(ctx.pos, 19);
+    EXPECT_EQ(ctx.next_pos, 31);
+    EXPECT_EQ(ctx.delim_pos, 28);
+    EXPECT_EQ(ctx.found, 3);
+
+    ret = cx_strtok_next(&ctx, &tok); // 4th token "string", delimiter at offsets 37-39
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("string")), 0);
+    EXPECT_EQ(ctx.pos, 31);
+    EXPECT_EQ(ctx.next_pos, 40);
+    EXPECT_EQ(ctx.delim_pos, 37);
+    EXPECT_EQ(ctx.found, 4);
+
+    ret = cx_strtok_next(&ctx, &tok); // the trailing delimiter yields a final empty token
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(tok, cx_str("")), 0);
+    EXPECT_EQ(ctx.pos, 40);
+    EXPECT_EQ(ctx.next_pos, 40);
+    EXPECT_EQ(ctx.delim_pos, 40); // equals the length of the source string
+    EXPECT_EQ(ctx.found, 5);
+
+    ret = cx_strtok_next(&ctx, &tok); // end of string reached, no more tokens
+    ASSERT_FALSE(ret);
+    EXPECT_EQ(ctx.pos, 40);
+    EXPECT_EQ(ctx.delim_pos, 40);
+    EXPECT_EQ(ctx.found, 5);
+}
+
+TEST(String, strtok_next_advanced) { // mutable source with three different delimiters
+    cxmutstr str = cx_strdup(cx_str("an,arbitrarily;||separated;string"));
+    cxstring delim = cx_str(",");
+    cxstring delim_more[2] = {CX_STR("||"), CX_STR(";")};
+    CxStrtokCtx ctx = cx_strtok_m(str, delim, 10);
+    cx_strtok_delim(&ctx, delim_more, 2);
+    bool ret;
+    cxmutstr tok;
+
+    ret = cx_strtok_next_m(&ctx, &tok); // 1st token "an", ended by "," at offset 2
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("an")), 0);
+    EXPECT_EQ(ctx.pos, 0);
+    EXPECT_EQ(ctx.next_pos, 3);
+    EXPECT_EQ(ctx.delim_pos, 2);
+    EXPECT_EQ(ctx.found, 1);
+    cx_strupper(tok); // modifies the token inside str, see the final comparison
+
+    ret = cx_strtok_next_m(&ctx, &tok); // 2nd token "arbitrarily", ended by ";" at offset 14
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("arbitrarily")), 0);
+    EXPECT_EQ(ctx.pos, 3);
+    EXPECT_EQ(ctx.next_pos, 15);
+    EXPECT_EQ(ctx.delim_pos, 14);
+    EXPECT_EQ(ctx.found, 2);
+    cx_strupper(tok);
+
+    ret = cx_strtok_next_m(&ctx, &tok); // 3rd token is empty: "||" directly follows ";"
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("")), 0);
+    EXPECT_EQ(ctx.pos, 15);
+    EXPECT_EQ(ctx.next_pos, 17); // the two-character delimiter is skipped completely
+    EXPECT_EQ(ctx.delim_pos, 15);
+    EXPECT_EQ(ctx.found, 3);
+    cx_strupper(tok);
+
+    ret = cx_strtok_next_m(&ctx, &tok); // 4th token "separated", ended by ";" at offset 26
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("separated")), 0);
+    EXPECT_EQ(ctx.pos, 17);
+    EXPECT_EQ(ctx.next_pos, 27);
+    EXPECT_EQ(ctx.delim_pos, 26);
+    EXPECT_EQ(ctx.found, 4);
+    cx_strupper(tok);
+
+    ret = cx_strtok_next_m(&ctx, &tok); // 5th token "string" runs to the end of the source
+    ASSERT_TRUE(ret);
+    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("string")), 0);
+    EXPECT_EQ(ctx.pos, 27);
+    EXPECT_EQ(ctx.next_pos, 33);
+    EXPECT_EQ(ctx.delim_pos, 33); // no delimiter found: length of the source string
+    EXPECT_EQ(ctx.found, 5);
+    cx_strupper(tok);
+
+    ret = cx_strtok_next_m(&ctx, &tok); // end of string reached before the limit of 10
+    ASSERT_FALSE(ret);
+    EXPECT_EQ(ctx.pos, 27);
+    EXPECT_EQ(ctx.next_pos, 33);
+    EXPECT_EQ(ctx.delim_pos, 33);
+    EXPECT_EQ(ctx.found, 5);
+
+    EXPECT_EQ(cx_strcmp(cx_strcast(str), cx_str("AN,ARBITRARILY;||SEPARATED;STRING")), 0); // tokens were upper-cased in place, delimiters are untouched
+
+    cx_strfree(&str);
+}

mercurial