add strtok API - fixes #220

Thu, 02 Feb 2023 20:25:34 +0100

author
Mike Becker <universe@uap-core.de>
date
Thu, 02 Feb 2023 20:25:34 +0100
changeset 645
ec50abb285ad
parent 644
fcaa0891ef28
child 646
dfd0403ff8b6
child 648
4e115b610b37

add strtok API - fixes #220

src/cx/string.h file | annotate | diff | comparison | revisions
src/string.c file | annotate | diff | comparison | revisions
test/test_string.cpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/cx/string.h	Wed Feb 01 18:07:16 2023 +0100
     1.2 +++ b/src/cx/string.h	Thu Feb 02 20:25:34 2023 +0100
     1.3 @@ -79,6 +79,57 @@
     1.4  typedef struct cx_string_s cxstring;
     1.5  
     1.6  /**
     1.7 + * Context for string tokenizing.
     1.8 + */
     1.9 +struct cx_strtok_ctx_s {
    1.10 +    /**
    1.11 +     * The string to tokenize.
    1.12 +     */
    1.13 +    cxstring str;
    1.14 +    /**
    1.15 +     * The primary delimiter.
    1.16 +     */
    1.17 +    cxstring delim;
    1.18 +    /**
    1.19 +     * Optional array of more delimiters.
    1.20 +     */
    1.21 +    cxstring const *delim_more;
    1.22 +    /**
    1.23 +     * Length of the array containing more delimiters.
    1.24 +     */
    1.25 +    size_t delim_more_count;
    1.26 +    /**
    1.27 +     * Position of the currently active token in the source string.
    1.28 +     */
    1.29 +    size_t pos;
    1.30 +    /**
    1.31 +     * Position of next delimiter in the source string.
    1.32 +     *
    1.33 +     * If the tokenizer has not yet returned a token, the content of this field
    1.34 +     * is undefined. If the tokenizer reached the end of the string, this field
    1.35 +     * contains the length of the source string.
    1.36 +     */
    1.37 +    size_t delim_pos;
    1.38 +    /**
    1.39 +     * The position of the next token in the source string.
    1.40 +     */
    1.41 +    size_t next_pos;
    1.42 +    /**
    1.43 +     * The number of already found tokens.
    1.44 +     */
    1.45 +    size_t found;
    1.46 +    /**
    1.47 +     * The maximum number of tokens that shall be returned.
    1.48 +     */
    1.49 +    size_t limit;
    1.50 +};
    1.51 +
    1.52 +/**
    1.53 + * A string tokenizing context.
    1.54 + */
    1.55 +typedef struct cx_strtok_ctx_s CxStrtokCtx;
    1.56 +
    1.57 +/**
    1.58   * A literal initializer for an UCX string structure.
    1.59   *
    1.60   * The argument MUST be a string (const char*) \em literal.
    1.61 @@ -828,6 +879,85 @@
    1.62  #define cx_strreplace(str, pattern, replacement) \
    1.63  cx_strreplacen_a(cxDefaultAllocator, str, pattern, replacement, SIZE_MAX)
    1.64  
    1.65 +/**
    1.66 + * Creates a string tokenization context.
    1.67 + *
    1.68 + * @param str the string to tokenize
    1.69 + * @param delim the delimiter (must not be empty)
    1.70 + * @param limit the maximum number of tokens that shall be returned
    1.71 + * @return a new string tokenization context
    1.72 + */
    1.73 +__attribute__((__warn_unused_result__))
    1.74 +CxStrtokCtx cx_strtok(
    1.75 +        cxstring str,
    1.76 +        cxstring delim,
    1.77 +        size_t limit
    1.78 +);
    1.79 +
    1.80 +/**
    1.81 + * Creates a string tokenization context for a mutable string.
    1.82 + *
    1.83 + * @param str the string to tokenize
    1.84 + * @param delim the delimiter (must not be empty)
    1.85 + * @param limit the maximum number of tokens that shall be returned
    1.86 + * @return a new string tokenization context
    1.87 + */
    1.88 +__attribute__((__warn_unused_result__))
    1.89 +CxStrtokCtx cx_strtok_m(
    1.90 +        cxmutstr str,
    1.91 +        cxstring delim,
    1.92 +        size_t limit
    1.93 +);
    1.94 +
    1.95 +/**
    1.96 + * Returns the next token.
    1.97 + *
    1.98 + * The token will point to the source string.
    1.99 + *
   1.100 + * @param ctx the tokenization context
   1.101 + * @param token a pointer to memory where the next token shall be stored
   1.102 + * @return true if successful, false if the limit or the end of the string
   1.103 + * has been reached
   1.104 + */
   1.105 +__attribute__((__warn_unused_result__, __nonnull__))
   1.106 +bool cx_strtok_next(
   1.107 +        CxStrtokCtx *ctx,
   1.108 +        cxstring *token
   1.109 +);
   1.110 +
   1.111 +/**
   1.112 + * Returns the next token of a mutable string.
   1.113 + *
   1.114 + * The token will point to the source string.
   1.115 + * If the context was not initialized over a mutable string, modifying
   1.116 + * the data of the returned token is undefined behavior.
   1.117 + *
   1.118 + * @param ctx the tokenization context
   1.119 + * @param token a pointer to memory where the next token shall be stored
   1.120 + * @return true if successful, false if the limit or the end of the string
   1.121 + * has been reached
   1.122 + */
   1.123 +__attribute__((__warn_unused_result__, __nonnull__))
   1.124 +bool cx_strtok_next_m(
   1.125 +        CxStrtokCtx *ctx,
   1.126 +        cxmutstr *token
   1.127 +);
   1.128 +
   1.129 +/**
   1.130 + * Defines an array of more delimiters for the specified tokenization context.
   1.131 + *
   1.132 + * @param ctx the tokenization context
   1.133 + * @param delim array of more delimiters
   1.134 + * @param count number of elements in the array
   1.135 + */
   1.136 +__attribute__((__nonnull__))
   1.137 +void cx_strtok_delim(
   1.138 +        CxStrtokCtx *ctx,
   1.139 +        cxstring const *delim,
   1.140 +        size_t count
   1.141 +);
   1.142 +
   1.143 +
   1.144  #ifdef __cplusplus
   1.145  } // extern "C"
   1.146  #endif
     2.1 --- a/src/string.c	Wed Feb 01 18:07:16 2023 +0100
     2.2 +++ b/src/string.c	Thu Feb 02 20:25:34 2023 +0100
     2.3 @@ -674,4 +674,87 @@
     2.4      return result;
     2.5  }
     2.6  
     2.7 +CxStrtokCtx cx_strtok(
     2.8 +        cxstring str,
     2.9 +        cxstring delim,
    2.10 +        size_t limit
    2.11 +) {
    2.12 +    CxStrtokCtx ctx;
    2.13 +    ctx.str = str;
    2.14 +    ctx.delim = delim;
    2.15 +    ctx.limit = limit;
    2.16 +    ctx.pos = 0;
    2.17 +    ctx.next_pos = 0;
    2.18 +    ctx.delim_pos = 0;
    2.19 +    ctx.found = 0;
    2.20 +    ctx.delim_more = NULL;
    2.21 +    ctx.delim_more_count = 0;
    2.22 +    return ctx;
    2.23 +}
    2.24  
    2.25 +CxStrtokCtx cx_strtok_m(
    2.26 +        cxmutstr str,
    2.27 +        cxstring delim,
    2.28 +        size_t limit
    2.29 +) {
    2.30 +    return cx_strtok(cx_strcast(str), delim, limit);
    2.31 +}
    2.32 +
    2.33 +bool cx_strtok_next(
    2.34 +        CxStrtokCtx *ctx,
    2.35 +        cxstring *token
    2.36 +) {
    2.37 +    // termination criteria: token limit reached or end of string reached
    2.38 +    if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) {
    2.39 +        return false;
    2.40 +    }
    2.41 +
    2.42 +    // determine the search start
    2.43 +    cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos);
    2.44 +
    2.45 +    // search the next delimiter
    2.46 +    cxstring delim = cx_strstr(haystack, ctx->delim);
    2.47 +
    2.48 +    // if found, make delim capture exactly the delimiter
    2.49 +    if (delim.length > 0) {
    2.50 +        delim.length = ctx->delim.length;
    2.51 +    }
    2.52 +
    2.53 +    // if more delimiters are specified, check them now
    2.54 +    if (ctx->delim_more_count > 0) {
    2.55 +        cx_for_n(i, ctx->delim_more_count) {
    2.56 +            cxstring d = cx_strstr(haystack, ctx->delim_more[i]);
    2.57 +            if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) {
    2.58 +                delim.ptr = d.ptr;
    2.59 +                delim.length = ctx->delim_more[i].length;
    2.60 +            }
    2.61 +        }
    2.62 +    }
    2.63 +
    2.64 +    // store the token information and adjust the context
    2.65 +    ctx->found++;
    2.66 +    ctx->pos = ctx->next_pos;
    2.67 +    token->ptr = &ctx->str.ptr[ctx->pos];
    2.68 +    ctx->delim_pos = delim.length == 0 ?
    2.69 +                     ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr);
    2.70 +    token->length = ctx->delim_pos - ctx->pos;
    2.71 +    ctx->next_pos = ctx->delim_pos + delim.length;
    2.72 +
    2.73 +    return true;
    2.74 +}
    2.75 +
    2.76 +bool cx_strtok_next_m(
    2.77 +        CxStrtokCtx *ctx,
    2.78 +        cxmutstr *token
    2.79 +) {
    2.80 +    return cx_strtok_next(ctx, (cxstring *) token);
    2.81 +}
    2.82 +
    2.83 +void cx_strtok_delim(
    2.84 +        CxStrtokCtx *ctx,
    2.85 +        cxstring const *delim,
    2.86 +        size_t count
    2.87 +) {
    2.88 +    ctx->delim_more = delim;
    2.89 +    ctx->delim_more_count = count;
    2.90 +}
     3.1 --- a/test/test_string.cpp	Wed Feb 01 18:07:16 2023 +0100
     3.2 +++ b/test/test_string.cpp	Thu Feb 02 20:25:34 2023 +0100
     3.3 @@ -653,3 +653,213 @@
     3.4      EXPECT_STREQ(str.ptr, "this 1s @ te$t");
     3.5      cx_strfree(&str);
     3.6  }
     3.7 +
     3.8 +TEST(String, strtok) {
     3.9 +    cxstring str = cx_str("a,comma,separated,string");
    3.10 +    cxstring delim = cx_str(",");
    3.11 +    CxStrtokCtx ctx = cx_strtok(str, delim, 3);
    3.12 +    EXPECT_EQ(ctx.str.ptr, str.ptr);
    3.13 +    EXPECT_EQ(ctx.str.length, str.length);
    3.14 +    EXPECT_EQ(ctx.delim.ptr, delim.ptr);
    3.15 +    EXPECT_EQ(ctx.delim.length, delim.length);
    3.16 +    EXPECT_EQ(ctx.limit, 3);
    3.17 +    EXPECT_EQ(ctx.found, 0);
    3.18 +    EXPECT_EQ(ctx.pos, 0);
    3.19 +    EXPECT_EQ(ctx.next_pos, 0);
    3.20 +    EXPECT_EQ(ctx.delim_more, nullptr);
    3.21 +    EXPECT_EQ(ctx.delim_more_count, 0);
    3.22 +}
    3.23 +
    3.24 +TEST(String, strtok_m) {
    3.25 +    cxmutstr str = cx_strdup(cx_str("a,comma,separated,string"));
    3.26 +    cxstring delim = cx_str(",");
    3.27 +    CxStrtokCtx ctx = cx_strtok_m(str, delim, 3);
    3.28 +    EXPECT_EQ(ctx.str.ptr, str.ptr);
    3.29 +    EXPECT_EQ(ctx.str.length, str.length);
    3.30 +    EXPECT_EQ(ctx.delim.ptr, delim.ptr);
    3.31 +    EXPECT_EQ(ctx.delim.length, delim.length);
    3.32 +    EXPECT_EQ(ctx.limit, 3);
    3.33 +    EXPECT_EQ(ctx.found, 0);
    3.34 +    EXPECT_EQ(ctx.pos, 0);
    3.35 +    EXPECT_EQ(ctx.next_pos, 0);
    3.36 +    EXPECT_EQ(ctx.delim_more, nullptr);
    3.37 +    EXPECT_EQ(ctx.delim_more_count, 0);
    3.38 +    cx_strfree(&str);
    3.39 +}
    3.40 +
    3.41 +TEST(String, strtok_delim) {
    3.42 +    cxstring str = cx_str("an,arbitrarily|separated;string");
    3.43 +    cxstring delim = cx_str(",");
    3.44 +    cxstring delim_more[2] = {CX_STR("|"), CX_STR(";")};
    3.45 +    CxStrtokCtx ctx = cx_strtok(str, delim, 3);
    3.46 +    cx_strtok_delim(&ctx, delim_more, 2);
    3.47 +    EXPECT_EQ(ctx.str.ptr, str.ptr);
    3.48 +    EXPECT_EQ(ctx.str.length, str.length);
    3.49 +    EXPECT_EQ(ctx.delim.ptr, delim.ptr);
    3.50 +    EXPECT_EQ(ctx.delim.length, delim.length);
    3.51 +    EXPECT_EQ(ctx.limit, 3);
    3.52 +    EXPECT_EQ(ctx.found, 0);
    3.53 +    EXPECT_EQ(ctx.pos, 0);
    3.54 +    EXPECT_EQ(ctx.next_pos, 0);
    3.55 +    EXPECT_EQ(ctx.delim_more, delim_more);
    3.56 +    EXPECT_EQ(ctx.delim_more_count, 2);
    3.57 +}
    3.58 +
    3.59 +TEST(String, strtok_next_easy) {
    3.60 +    cxstring str = cx_str("a,comma,separated,string");
    3.61 +    cxstring delim = cx_str(",");
    3.62 +    CxStrtokCtx ctx = cx_strtok(str, delim, 3);
    3.63 +    bool ret;
    3.64 +    cxstring tok;
    3.65 +
    3.66 +    ret = cx_strtok_next(&ctx, &tok);
    3.67 +    ASSERT_TRUE(ret);
    3.68 +    EXPECT_EQ(cx_strcmp(tok, cx_str("a")), 0);
    3.69 +    EXPECT_EQ(ctx.pos, 0);
    3.70 +    EXPECT_EQ(ctx.next_pos, 2);
    3.71 +    EXPECT_EQ(ctx.delim_pos, 1);
    3.72 +    EXPECT_EQ(ctx.found, 1);
    3.73 +
    3.74 +    ret = cx_strtok_next(&ctx, &tok);
    3.75 +    ASSERT_TRUE(ret);
    3.76 +    EXPECT_EQ(cx_strcmp(tok, cx_str("comma")), 0);
    3.77 +    EXPECT_EQ(ctx.pos, 2);
    3.78 +    EXPECT_EQ(ctx.next_pos, 8);
    3.79 +    EXPECT_EQ(ctx.delim_pos, 7);
    3.80 +    EXPECT_EQ(ctx.found, 2);
    3.81 +
    3.82 +    ret = cx_strtok_next(&ctx, &tok);
    3.83 +    ASSERT_TRUE(ret);
    3.84 +    EXPECT_EQ(cx_strcmp(tok, cx_str("separated")), 0);
    3.85 +    EXPECT_EQ(ctx.pos, 8);
    3.86 +    EXPECT_EQ(ctx.next_pos, 18);
    3.87 +    EXPECT_EQ(ctx.delim_pos, 17);
    3.88 +    EXPECT_EQ(ctx.found, 3);
    3.89 +
    3.90 +    ret = cx_strtok_next(&ctx, &tok);
    3.91 +    ASSERT_FALSE(ret);
    3.92 +    EXPECT_EQ(ctx.pos, 8);
    3.93 +    EXPECT_EQ(ctx.next_pos, 18);
    3.94 +    EXPECT_EQ(ctx.delim_pos, 17);
    3.95 +    EXPECT_EQ(ctx.found, 3);
    3.96 +}
    3.97 +
    3.98 +TEST(String, strtok_next_unlimited) {
    3.99 +    cxstring str = cx_str("some;-;otherwise;-;separated;-;string;-;");
   3.100 +    cxstring delim = cx_str(";-;");
   3.101 +    CxStrtokCtx ctx = cx_strtok(str, delim, SIZE_MAX);
   3.102 +    bool ret;
   3.103 +    cxstring tok;
   3.104 +
   3.105 +    ret = cx_strtok_next(&ctx, &tok);
   3.106 +    ASSERT_TRUE(ret);
   3.107 +    EXPECT_EQ(cx_strcmp(tok, cx_str("some")), 0);
   3.108 +    EXPECT_EQ(ctx.pos, 0);
   3.109 +    EXPECT_EQ(ctx.next_pos, 7);
   3.110 +    EXPECT_EQ(ctx.delim_pos, 4);
   3.111 +    EXPECT_EQ(ctx.found, 1);
   3.112 +
   3.113 +    ret = cx_strtok_next(&ctx, &tok);
   3.114 +    ASSERT_TRUE(ret);
   3.115 +    EXPECT_EQ(cx_strcmp(tok, cx_str("otherwise")), 0);
   3.116 +    EXPECT_EQ(ctx.pos, 7);
   3.117 +    EXPECT_EQ(ctx.next_pos, 19);
   3.118 +    EXPECT_EQ(ctx.delim_pos, 16);
   3.119 +    EXPECT_EQ(ctx.found, 2);
   3.120 +
   3.121 +    ret = cx_strtok_next(&ctx, &tok);
   3.122 +    ASSERT_TRUE(ret);
   3.123 +    EXPECT_EQ(cx_strcmp(tok, cx_str("separated")), 0);
   3.124 +    EXPECT_EQ(ctx.pos, 19);
   3.125 +    EXPECT_EQ(ctx.next_pos, 31);
   3.126 +    EXPECT_EQ(ctx.delim_pos, 28);
   3.127 +    EXPECT_EQ(ctx.found, 3);
   3.128 +
   3.129 +    ret = cx_strtok_next(&ctx, &tok);
   3.130 +    ASSERT_TRUE(ret);
   3.131 +    EXPECT_EQ(cx_strcmp(tok, cx_str("string")), 0);
   3.132 +    EXPECT_EQ(ctx.pos, 31);
   3.133 +    EXPECT_EQ(ctx.next_pos, 40);
   3.134 +    EXPECT_EQ(ctx.delim_pos, 37);
   3.135 +    EXPECT_EQ(ctx.found, 4);
   3.136 +
   3.137 +    ret = cx_strtok_next(&ctx, &tok);
   3.138 +    ASSERT_TRUE(ret);
   3.139 +    EXPECT_EQ(cx_strcmp(tok, cx_str("")), 0);
   3.140 +    EXPECT_EQ(ctx.pos, 40);
   3.141 +    EXPECT_EQ(ctx.next_pos, 40);
   3.142 +    EXPECT_EQ(ctx.delim_pos, 40);
   3.143 +    EXPECT_EQ(ctx.found, 5);
   3.144 +
   3.145 +    ret = cx_strtok_next(&ctx, &tok);
   3.146 +    ASSERT_FALSE(ret);
   3.147 +    EXPECT_EQ(ctx.pos, 40);
   3.148 +    EXPECT_EQ(ctx.delim_pos, 40);
   3.149 +    EXPECT_EQ(ctx.found, 5);
   3.150 +}
   3.151 +
   3.152 +TEST(String, strtok_next_advanced) {
   3.153 +    cxmutstr str = cx_strdup(cx_str("an,arbitrarily;||separated;string"));
   3.154 +    cxstring delim = cx_str(",");
   3.155 +    cxstring delim_more[2] = {CX_STR("||"), CX_STR(";")};
   3.156 +    CxStrtokCtx ctx = cx_strtok_m(str, delim, 10);
   3.157 +    cx_strtok_delim(&ctx, delim_more, 2);
   3.158 +    bool ret;
   3.159 +    cxmutstr tok;
   3.160 +
   3.161 +    ret = cx_strtok_next_m(&ctx, &tok);
   3.162 +    ASSERT_TRUE(ret);
   3.163 +    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("an")), 0);
   3.164 +    EXPECT_EQ(ctx.pos, 0);
   3.165 +    EXPECT_EQ(ctx.next_pos, 3);
   3.166 +    EXPECT_EQ(ctx.delim_pos, 2);
   3.167 +    EXPECT_EQ(ctx.found, 1);
   3.168 +    cx_strupper(tok);
   3.169 +
   3.170 +    ret = cx_strtok_next_m(&ctx, &tok);
   3.171 +    ASSERT_TRUE(ret);
   3.172 +    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("arbitrarily")), 0);
   3.173 +    EXPECT_EQ(ctx.pos, 3);
   3.174 +    EXPECT_EQ(ctx.next_pos, 15);
   3.175 +    EXPECT_EQ(ctx.delim_pos, 14);
   3.176 +    EXPECT_EQ(ctx.found, 2);
   3.177 +    cx_strupper(tok);
   3.178 +
   3.179 +    ret = cx_strtok_next_m(&ctx, &tok);
   3.180 +    ASSERT_TRUE(ret);
   3.181 +    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("")), 0);
   3.182 +    EXPECT_EQ(ctx.pos, 15);
   3.183 +    EXPECT_EQ(ctx.next_pos, 17);
   3.184 +    EXPECT_EQ(ctx.delim_pos, 15);
   3.185 +    EXPECT_EQ(ctx.found, 3);
   3.186 +    cx_strupper(tok);
   3.187 +
   3.188 +    ret = cx_strtok_next_m(&ctx, &tok);
   3.189 +    ASSERT_TRUE(ret);
   3.190 +    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("separated")), 0);
   3.191 +    EXPECT_EQ(ctx.pos, 17);
   3.192 +    EXPECT_EQ(ctx.next_pos, 27);
   3.193 +    EXPECT_EQ(ctx.delim_pos, 26);
   3.194 +    EXPECT_EQ(ctx.found, 4);
   3.195 +    cx_strupper(tok);
   3.196 +
   3.197 +    ret = cx_strtok_next_m(&ctx, &tok);
   3.198 +    ASSERT_TRUE(ret);
   3.199 +    EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("string")), 0);
   3.200 +    EXPECT_EQ(ctx.pos, 27);
   3.201 +    EXPECT_EQ(ctx.next_pos, 33);
   3.202 +    EXPECT_EQ(ctx.delim_pos, 33);
   3.203 +    EXPECT_EQ(ctx.found, 5);
   3.204 +    cx_strupper(tok);
   3.205 +
   3.206 +    ret = cx_strtok_next_m(&ctx, &tok);
   3.207 +    ASSERT_FALSE(ret);
   3.208 +    EXPECT_EQ(ctx.pos, 27);
   3.209 +    EXPECT_EQ(ctx.next_pos, 33);
   3.210 +    EXPECT_EQ(ctx.delim_pos, 33);
   3.211 +    EXPECT_EQ(ctx.found, 5);
   3.212 +
   3.213 +    EXPECT_EQ(cx_strcmp(cx_strcast(str), cx_str("AN,ARBITRARILY;||SEPARATED;STRING")), 0);
   3.214 +
   3.215 +    cx_strfree(&str);
   3.216 +}

mercurial