Thu, 02 Feb 2023 20:25:34 +0100
add strtok API - fixes #220
src/cx/string.h | file | annotate | diff | comparison | revisions | |
src/string.c | file | annotate | diff | comparison | revisions | |
test/test_string.cpp | file | annotate | diff | comparison | revisions |
1.1 --- a/src/cx/string.h Wed Feb 01 18:07:16 2023 +0100 1.2 +++ b/src/cx/string.h Thu Feb 02 20:25:34 2023 +0100 1.3 @@ -79,6 +79,57 @@ 1.4 typedef struct cx_string_s cxstring; 1.5 1.6 /** 1.7 + * Context for string tokenizing. 1.8 + */ 1.9 +struct cx_strtok_ctx_s { 1.10 + /** 1.11 + * The string to tokenize. 1.12 + */ 1.13 + cxstring str; 1.14 + /** 1.15 + * The primary delimiter. 1.16 + */ 1.17 + cxstring delim; 1.18 + /** 1.19 + * Optional array of more delimiters. 1.20 + */ 1.21 + cxstring const *delim_more; 1.22 + /** 1.23 + * Length of the array containing more delimiters. 1.24 + */ 1.25 + size_t delim_more_count; 1.26 + /** 1.27 + * Position of the currently active token in the source string. 1.28 + */ 1.29 + size_t pos; 1.30 + /** 1.31 + * Position of next delimiter in the source string. 1.32 + * 1.33 + * If the tokenizer has not yet returned a token, the content of this field 1.34 + * is undefined. If the tokenizer reached the end of the string, this field 1.35 + * contains the length of the source string. 1.36 + */ 1.37 + size_t delim_pos; 1.38 + /** 1.39 + * The position of the next token in the source string. 1.40 + */ 1.41 + size_t next_pos; 1.42 + /** 1.43 + * The number of already found tokens. 1.44 + */ 1.45 + size_t found; 1.46 + /** 1.47 + * The maximum number of tokens that shall be returned. 1.48 + */ 1.49 + size_t limit; 1.50 +}; 1.51 + 1.52 +/** 1.53 + * A string tokenizing context. 1.54 + */ 1.55 +typedef struct cx_strtok_ctx_s CxStrtokCtx; 1.56 + 1.57 +/** 1.58 * A literal initializer for an UCX string structure. 1.59 * 1.60 * The argument MUST be a string (const char*) \em literal. 1.61 @@ -828,6 +879,85 @@ 1.62 #define cx_strreplace(str, pattern, replacement) \ 1.63 cx_strreplacen_a(cxDefaultAllocator, str, pattern, replacement, SIZE_MAX) 1.64 1.65 +/** 1.66 + * Creates a string tokenization context. 1.67 + * 1.68 + * @param str the string to tokenize 1.69 + * @param delim the delimiter (must not be empty) 1.70 + * @param limit the maximum number of tokens that shall be returned 1.71 + * @return a new string tokenization context 1.72 + */ 1.73 +__attribute__((__warn_unused_result__)) 1.74 +CxStrtokCtx cx_strtok( 1.75 + cxstring str, 1.76 + cxstring delim, 1.77 + size_t limit 1.78 +); 1.79 + 1.80 +/** 1.81 +* Creates a string tokenization context for a mutable string. 1.82 +* 1.83 +* @param str the string to tokenize 1.84 +* @param delim the delimiter (must not be empty) 1.85 +* @param limit the maximum number of tokens that shall be returned 1.86 +* @return a new string tokenization context 1.87 +*/ 1.88 +__attribute__((__warn_unused_result__)) 1.89 +CxStrtokCtx cx_strtok_m( 1.90 + cxmutstr str, 1.91 + cxstring delim, 1.92 + size_t limit 1.93 +); 1.94 + 1.95 +/** 1.96 + * Returns the next token. 1.97 + * 1.98 + * The token will point to the source string. 1.99 + * 1.100 + * @param ctx the tokenization context 1.101 + * @param token a pointer to memory where the next token shall be stored 1.102 + * @return true if successful, false if the limit or the end of the string 1.103 + * has been reached 1.104 + */ 1.105 +__attribute__((__warn_unused_result__, __nonnull__)) 1.106 +bool cx_strtok_next( 1.107 + CxStrtokCtx *ctx, 1.108 + cxstring *token 1.109 +); 1.110 + 1.111 +/** 1.112 + * Returns the next token of a mutable string. 1.113 + * 1.114 + * The token will point to the source string. 1.115 + * If the context was not initialized over a mutable string, modifying 1.116 + * the data of the returned token is undefined behavior. 1.117 + * 1.118 + * @param ctx the tokenization context 1.119 + * @param token a pointer to memory where the next token shall be stored 1.120 + * @return true if successful, false if the limit or the end of the string 1.121 + * has been reached 1.122 + */ 1.123 +__attribute__((__warn_unused_result__, __nonnull__)) 1.124 +bool cx_strtok_next_m( 1.125 + CxStrtokCtx *ctx, 1.126 + cxmutstr *token 1.127 +); 1.128 + 1.129 +/** 1.130 + * Defines an array of more delimiters for the specified tokenization context. 1.131 + * 1.132 + * @param ctx the tokenization context 1.133 + * @param delim array of more delimiters 1.134 + * @param count number of elements in the array 1.135 + */ 1.136 +__attribute__((__nonnull__)) 1.137 +void cx_strtok_delim( 1.138 + CxStrtokCtx *ctx, 1.139 + cxstring const *delim, 1.140 + size_t count 1.141 +); 1.142 + 1.143 + 1.144 #ifdef __cplusplus 1.145 } // extern "C" 1.146 #endif
2.1 --- a/src/string.c Wed Feb 01 18:07:16 2023 +0100 2.2 +++ b/src/string.c Thu Feb 02 20:25:34 2023 +0100 2.3 @@ -674,4 +674,87 @@ 2.4 return result; 2.5 } 2.6 2.7 +CxStrtokCtx cx_strtok( 2.8 + cxstring str, 2.9 + cxstring delim, 2.10 + size_t limit 2.11 +) { 2.12 + CxStrtokCtx ctx; 2.13 + ctx.str = str; 2.14 + ctx.delim = delim; 2.15 + ctx.limit = limit; 2.16 + ctx.pos = 0; 2.17 + ctx.next_pos = 0; 2.18 + ctx.delim_pos = 0; 2.19 + ctx.found = 0; 2.20 + ctx.delim_more = NULL; 2.21 + ctx.delim_more_count = 0; 2.22 + return ctx; 2.23 +} 2.24 2.25 +CxStrtokCtx cx_strtok_m( 2.26 + cxmutstr str, 2.27 + cxstring delim, 2.28 + size_t limit 2.29 +) { 2.30 + return cx_strtok(cx_strcast(str), delim, limit); 2.31 +} 2.32 + 2.33 +bool cx_strtok_next( 2.34 + CxStrtokCtx *ctx, 2.35 + cxstring *token 2.36 +) { 2.37 + // abortion criteria 2.38 + if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) { 2.39 + return false; 2.40 + } 2.41 + 2.42 + // determine the search start 2.43 + cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos); 2.44 + 2.45 + // search the next delimiter 2.46 + cxstring delim = cx_strstr(haystack, ctx->delim); 2.47 + 2.48 + // if found, make delim capture exactly the delimiter 2.49 + if (delim.length > 0) { 2.50 + delim.length = ctx->delim.length; 2.51 + } 2.52 + 2.53 + // if more delimiters are specified, check them now 2.54 + if (ctx->delim_more_count > 0) { 2.55 + cx_for_n(i, ctx->delim_more_count) { 2.56 + cxstring d = cx_strstr(haystack, ctx->delim_more[i]); 2.57 + if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) { 2.58 + delim.ptr = d.ptr; 2.59 + delim.length = ctx->delim_more[i].length; 2.60 + } 2.61 + } 2.62 + } 2.63 + 2.64 + // store the token information and adjust the context 2.65 + ctx->found++; 2.66 + ctx->pos = ctx->next_pos; 2.67 + token->ptr = &ctx->str.ptr[ctx->pos]; 2.68 + ctx->delim_pos = delim.length == 0 ? 2.69 + ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr); 2.70 + token->length = ctx->delim_pos - ctx->pos; 2.71 + ctx->next_pos = ctx->delim_pos + delim.length; 2.72 + 2.73 + return true; 2.74 +} 2.75 + 2.76 +bool cx_strtok_next_m( 2.77 + CxStrtokCtx *ctx, 2.78 + cxmutstr *token 2.79 +) { 2.80 + return cx_strtok_next(ctx, (cxstring *) token); 2.81 +} 2.82 + 2.83 +void cx_strtok_delim( 2.84 + CxStrtokCtx *ctx, 2.85 + cxstring const *delim, 2.86 + size_t count 2.87 +) { 2.88 + ctx->delim_more = delim; 2.89 + ctx->delim_more_count = count; 2.90 +}
3.1 --- a/test/test_string.cpp Wed Feb 01 18:07:16 2023 +0100 3.2 +++ b/test/test_string.cpp Thu Feb 02 20:25:34 2023 +0100 3.3 @@ -653,3 +653,213 @@ 3.4 EXPECT_STREQ(str.ptr, "this 1s @ te$t"); 3.5 cx_strfree(&str); 3.6 } 3.7 + 3.8 +TEST(String, strtok) { 3.9 + cxstring str = cx_str("a,comma,separated,string"); 3.10 + cxstring delim = cx_str(","); 3.11 + CxStrtokCtx ctx = cx_strtok(str, delim, 3); 3.12 + EXPECT_EQ(ctx.str.ptr, str.ptr); 3.13 + EXPECT_EQ(ctx.str.length, str.length); 3.14 + EXPECT_EQ(ctx.delim.ptr, delim.ptr); 3.15 + EXPECT_EQ(ctx.delim.length, delim.length); 3.16 + EXPECT_EQ(ctx.limit, 3); 3.17 + EXPECT_EQ(ctx.found, 0); 3.18 + EXPECT_EQ(ctx.pos, 0); 3.19 + EXPECT_EQ(ctx.next_pos, 0); 3.20 + EXPECT_EQ(ctx.delim_more, nullptr); 3.21 + EXPECT_EQ(ctx.delim_more_count, 0); 3.22 +} 3.23 + 3.24 +TEST(String, strtok_m) { 3.25 + cxmutstr str = cx_strdup(cx_str("a,comma,separated,string")); 3.26 + cxstring delim = cx_str(","); 3.27 + CxStrtokCtx ctx = cx_strtok_m(str, delim, 3); 3.28 + EXPECT_EQ(ctx.str.ptr, str.ptr); 3.29 + EXPECT_EQ(ctx.str.length, str.length); 3.30 + EXPECT_EQ(ctx.delim.ptr, delim.ptr); 3.31 + EXPECT_EQ(ctx.delim.length, delim.length); 3.32 + EXPECT_EQ(ctx.limit, 3); 3.33 + EXPECT_EQ(ctx.found, 0); 3.34 + EXPECT_EQ(ctx.pos, 0); 3.35 + EXPECT_EQ(ctx.next_pos, 0); 3.36 + EXPECT_EQ(ctx.delim_more, nullptr); 3.37 + EXPECT_EQ(ctx.delim_more_count, 0); 3.38 + cx_strfree(&str); 3.39 +} 3.40 + 3.41 +TEST(String, strtok_delim) { 3.42 + cxstring str = cx_str("an,arbitrarily|separated;string"); 3.43 + cxstring delim = cx_str(","); 3.44 + cxstring delim_more[2] = {CX_STR("|"), CX_STR(";")}; 3.45 + CxStrtokCtx ctx = cx_strtok(str, delim, 3); 3.46 + cx_strtok_delim(&ctx, delim_more, 2); 3.47 + EXPECT_EQ(ctx.str.ptr, str.ptr); 3.48 + EXPECT_EQ(ctx.str.length, str.length); 3.49 + EXPECT_EQ(ctx.delim.ptr, delim.ptr); 3.50 + EXPECT_EQ(ctx.delim.length, delim.length); 3.51 + EXPECT_EQ(ctx.limit, 3); 3.52 + EXPECT_EQ(ctx.found, 0); 3.53 + EXPECT_EQ(ctx.pos, 0); 3.54 + EXPECT_EQ(ctx.next_pos, 0); 3.55 + EXPECT_EQ(ctx.delim_more, delim_more); 3.56 + EXPECT_EQ(ctx.delim_more_count, 2); 3.57 +} 3.58 + 3.59 +TEST(String, strtok_next_easy) { 3.60 + cxstring str = cx_str("a,comma,separated,string"); 3.61 + cxstring delim = cx_str(","); 3.62 + CxStrtokCtx ctx = cx_strtok(str, delim, 3); 3.63 + bool ret; 3.64 + cxstring tok; 3.65 + 3.66 + ret = cx_strtok_next(&ctx, &tok); 3.67 + ASSERT_TRUE(ret); 3.68 + EXPECT_EQ(cx_strcmp(tok, cx_str("a")), 0); 3.69 + EXPECT_EQ(ctx.pos, 0); 3.70 + EXPECT_EQ(ctx.next_pos, 2); 3.71 + EXPECT_EQ(ctx.delim_pos, 1); 3.72 + EXPECT_EQ(ctx.found, 1); 3.73 + 3.74 + ret = cx_strtok_next(&ctx, &tok); 3.75 + ASSERT_TRUE(ret); 3.76 + EXPECT_EQ(cx_strcmp(tok, cx_str("comma")), 0); 3.77 + EXPECT_EQ(ctx.pos, 2); 3.78 + EXPECT_EQ(ctx.next_pos, 8); 3.79 + EXPECT_EQ(ctx.delim_pos, 7); 3.80 + EXPECT_EQ(ctx.found, 2); 3.81 + 3.82 + ret = cx_strtok_next(&ctx, &tok); 3.83 + ASSERT_TRUE(ret); 3.84 + EXPECT_EQ(cx_strcmp(tok, cx_str("separated")), 0); 3.85 + EXPECT_EQ(ctx.pos, 8); 3.86 + EXPECT_EQ(ctx.next_pos, 18); 3.87 + EXPECT_EQ(ctx.delim_pos, 17); 3.88 + EXPECT_EQ(ctx.found, 3); 3.89 + 3.90 + ret = cx_strtok_next(&ctx, &tok); 3.91 + ASSERT_FALSE(ret); 3.92 + EXPECT_EQ(ctx.pos, 8); 3.93 + EXPECT_EQ(ctx.next_pos, 18); 3.94 + EXPECT_EQ(ctx.delim_pos, 17); 3.95 + EXPECT_EQ(ctx.found, 3); 3.96 +} 3.97 + 3.98 +TEST(String, strtok_next_unlimited) { 3.99 + cxstring str = cx_str("some;-;otherwise;-;separated;-;string;-;"); 3.100 + cxstring delim = cx_str(";-;"); 3.101 + CxStrtokCtx ctx = cx_strtok(str, delim, SIZE_MAX); 3.102 + bool ret; 3.103 + cxstring tok; 3.104 + 3.105 + ret = cx_strtok_next(&ctx, &tok); 3.106 + ASSERT_TRUE(ret); 3.107 + EXPECT_EQ(cx_strcmp(tok, cx_str("some")), 0); 3.108 + EXPECT_EQ(ctx.pos, 0); 3.109 + EXPECT_EQ(ctx.next_pos, 7); 3.110 + EXPECT_EQ(ctx.delim_pos, 4); 3.111 + EXPECT_EQ(ctx.found, 1); 3.112 + 3.113 + ret = cx_strtok_next(&ctx, &tok); 3.114 + ASSERT_TRUE(ret); 3.115 + EXPECT_EQ(cx_strcmp(tok, cx_str("otherwise")), 0); 3.116 + EXPECT_EQ(ctx.pos, 7); 3.117 + EXPECT_EQ(ctx.next_pos, 19); 3.118 + EXPECT_EQ(ctx.delim_pos, 16); 3.119 + EXPECT_EQ(ctx.found, 2); 3.120 + 3.121 + ret = cx_strtok_next(&ctx, &tok); 3.122 + ASSERT_TRUE(ret); 3.123 + EXPECT_EQ(cx_strcmp(tok, cx_str("separated")), 0); 3.124 + EXPECT_EQ(ctx.pos, 19); 3.125 + EXPECT_EQ(ctx.next_pos, 31); 3.126 + EXPECT_EQ(ctx.delim_pos, 28); 3.127 + EXPECT_EQ(ctx.found, 3); 3.128 + 3.129 + ret = cx_strtok_next(&ctx, &tok); 3.130 + ASSERT_TRUE(ret); 3.131 + EXPECT_EQ(cx_strcmp(tok, cx_str("string")), 0); 3.132 + EXPECT_EQ(ctx.pos, 31); 3.133 + EXPECT_EQ(ctx.next_pos, 40); 3.134 + EXPECT_EQ(ctx.delim_pos, 37); 3.135 + EXPECT_EQ(ctx.found, 4); 3.136 + 3.137 + ret = cx_strtok_next(&ctx, &tok); 3.138 + ASSERT_TRUE(ret); 3.139 + EXPECT_EQ(cx_strcmp(tok, cx_str("")), 0); 3.140 + EXPECT_EQ(ctx.pos, 40); 3.141 + EXPECT_EQ(ctx.next_pos, 40); 3.142 + EXPECT_EQ(ctx.delim_pos, 40); 3.143 + EXPECT_EQ(ctx.found, 5); 3.144 + 3.145 + ret = cx_strtok_next(&ctx, &tok); 3.146 + ASSERT_FALSE(ret); 3.147 + EXPECT_EQ(ctx.pos, 40); 3.148 + EXPECT_EQ(ctx.delim_pos, 40); 3.149 + EXPECT_EQ(ctx.found, 5); 3.150 +} 3.151 + 3.152 +TEST(String, strtok_next_advanced) { 3.153 + cxmutstr str = cx_strdup(cx_str("an,arbitrarily;||separated;string")); 3.154 + cxstring delim = cx_str(","); 3.155 + cxstring delim_more[2] = {CX_STR("||"), CX_STR(";")}; 3.156 + CxStrtokCtx ctx = cx_strtok_m(str, delim, 10); 3.157 + cx_strtok_delim(&ctx, delim_more, 2); 3.158 + bool ret; 3.159 + cxmutstr tok; 3.160 + 3.161 + ret = cx_strtok_next_m(&ctx, &tok); 3.162 + ASSERT_TRUE(ret); 3.163 + EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("an")), 0); 3.164 + EXPECT_EQ(ctx.pos, 0); 3.165 + EXPECT_EQ(ctx.next_pos, 3); 3.166 + EXPECT_EQ(ctx.delim_pos, 2); 3.167 + EXPECT_EQ(ctx.found, 1); 3.168 + cx_strupper(tok); 3.169 + 3.170 + ret = cx_strtok_next_m(&ctx, &tok); 3.171 + ASSERT_TRUE(ret); 3.172 + EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("arbitrarily")), 0); 3.173 + EXPECT_EQ(ctx.pos, 3); 3.174 + EXPECT_EQ(ctx.next_pos, 15); 3.175 + EXPECT_EQ(ctx.delim_pos, 14); 3.176 + EXPECT_EQ(ctx.found, 2); 3.177 + cx_strupper(tok); 3.178 + 3.179 + ret = cx_strtok_next_m(&ctx, &tok); 3.180 + ASSERT_TRUE(ret); 3.181 + EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("")), 0); 3.182 + EXPECT_EQ(ctx.pos, 15); 3.183 + EXPECT_EQ(ctx.next_pos, 17); 3.184 + EXPECT_EQ(ctx.delim_pos, 15); 3.185 + EXPECT_EQ(ctx.found, 3); 3.186 + cx_strupper(tok); 3.187 + 3.188 + ret = cx_strtok_next_m(&ctx, &tok); 3.189 + ASSERT_TRUE(ret); 3.190 + EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("separated")), 0); 3.191 + EXPECT_EQ(ctx.pos, 17); 3.192 + EXPECT_EQ(ctx.next_pos, 27); 3.193 + EXPECT_EQ(ctx.delim_pos, 26); 3.194 + EXPECT_EQ(ctx.found, 4); 3.195 + cx_strupper(tok); 3.196 + 3.197 + ret = cx_strtok_next_m(&ctx, &tok); 3.198 + ASSERT_TRUE(ret); 3.199 + EXPECT_EQ(cx_strcmp(cx_strcast(tok), cx_str("string")), 0); 3.200 + EXPECT_EQ(ctx.pos, 27); 3.201 + EXPECT_EQ(ctx.next_pos, 33); 3.202 + EXPECT_EQ(ctx.delim_pos, 33); 3.203 + EXPECT_EQ(ctx.found, 5); 3.204 + cx_strupper(tok); 3.205 + 3.206 + ret = cx_strtok_next_m(&ctx, &tok); 3.207 + ASSERT_FALSE(ret); 3.208 + EXPECT_EQ(ctx.pos, 27); 3.209 + EXPECT_EQ(ctx.next_pos, 33); 3.210 + EXPECT_EQ(ctx.delim_pos, 33); 3.211 + EXPECT_EQ(ctx.found, 5); 3.212 + 3.213 + EXPECT_EQ(cx_strcmp(cx_strcast(str), cx_str("AN,ARBITRARILY;||SEPARATED;STRING")), 0); 3.214 + 3.215 + cx_strfree(&str); 3.216 +}