Thu, 02 Feb 2023 20:25:34 +0100
add strtok API - fixes #220
1 /*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
3 *
4 * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
29 #include "cx/string.h"
30 #include "cx/utils.h"
32 #include <string.h>
33 #include <stdarg.h>
34 #include <ctype.h>
36 #ifndef _WIN32
38 #include <strings.h> // for strncasecmp()
40 #endif // _WIN32
42 cxmutstr cx_mutstr(char *cstring) {
43 return (cxmutstr) {cstring, strlen(cstring)};
44 }
46 cxmutstr cx_mutstrn(
47 char *cstring,
48 size_t length
49 ) {
50 return (cxmutstr) {cstring, length};
51 }
53 cxstring cx_str(const char *cstring) {
54 return (cxstring) {cstring, strlen(cstring)};
55 }
57 cxstring cx_strn(
58 const char *cstring,
59 size_t length
60 ) {
61 return (cxstring) {cstring, length};
62 }
64 cxstring cx_strcast(cxmutstr str) {
65 return (cxstring) {str.ptr, str.length};
66 }
68 void cx_strfree(cxmutstr *str) {
69 free(str->ptr);
70 str->ptr = NULL;
71 str->length = 0;
72 }
74 void cx_strfree_a(
75 CxAllocator *alloc,
76 cxmutstr *str
77 ) {
78 cxFree(alloc, str->ptr);
79 str->ptr = NULL;
80 str->length = 0;
81 }
83 size_t cx_strlen(
84 size_t count,
85 ...
86 ) {
87 if (count == 0) return 0;
89 va_list ap;
90 va_start(ap, count);
91 size_t size = 0;
92 cx_for_n(i, count) {
93 cxstring str = va_arg(ap, cxstring);
94 size += str.length;
95 }
96 va_end(ap);
98 return size;
99 }
101 cxmutstr cx_strcat_a(
102 CxAllocator *alloc,
103 size_t count,
104 ...
105 ) {
106 cxstring *strings = calloc(count, sizeof(cxstring));
107 if (!strings) abort();
109 va_list ap;
110 va_start(ap, count);
112 // get all args and overall length
113 size_t slen = 0;
114 cx_for_n(i, count) {
115 cxstring s = va_arg (ap, cxstring);
116 strings[i] = s;
117 slen += s.length;
118 }
120 // create new string
121 cxmutstr result;
122 result.ptr = cxMalloc(alloc, slen + 1);
123 result.length = slen;
124 if (result.ptr == NULL) abort();
126 // concatenate strings
127 size_t pos = 0;
128 cx_for_n(i, count) {
129 cxstring s = strings[i];
130 memcpy(result.ptr + pos, s.ptr, s.length);
131 pos += s.length;
132 }
134 // terminate string
135 result.ptr[result.length] = '\0';
137 // free temporary array
138 free(strings);
140 return result;
141 }
143 cxstring cx_strsubs(
144 cxstring string,
145 size_t start
146 ) {
147 return cx_strsubsl(string, start, string.length - start);
148 }
150 cxmutstr cx_strsubs_m(
151 cxmutstr string,
152 size_t start
153 ) {
154 return cx_strsubsl_m(string, start, string.length - start);
155 }
157 cxstring cx_strsubsl(
158 cxstring string,
159 size_t start,
160 size_t length
161 ) {
162 if (start > string.length) {
163 return (cxstring) {NULL, 0};
164 }
166 size_t rem_len = string.length - start;
167 if (length > rem_len) {
168 length = rem_len;
169 }
171 return (cxstring) {string.ptr + start, length};
172 }
174 cxmutstr cx_strsubsl_m(
175 cxmutstr string,
176 size_t start,
177 size_t length
178 ) {
179 cxstring result = cx_strsubsl(cx_strcast(string), start, length);
180 return (cxmutstr) {(char *) result.ptr, result.length};
181 }
183 cxstring cx_strchr(
184 cxstring string,
185 int chr
186 ) {
187 chr = 0xFF & chr;
188 // TODO: improve by comparing multiple bytes at once
189 cx_for_n(i, string.length) {
190 if (string.ptr[i] == chr) {
191 return cx_strsubs(string, i);
192 }
193 }
194 return (cxstring) {NULL, 0};
195 }
197 cxmutstr cx_strchr_m(
198 cxmutstr string,
199 int chr
200 ) {
201 cxstring result = cx_strchr(cx_strcast(string), chr);
202 return (cxmutstr) {(char *) result.ptr, result.length};
203 }
205 cxstring cx_strrchr(
206 cxstring string,
207 int chr
208 ) {
209 chr = 0xFF & chr;
210 size_t i = string.length;
211 while (i > 0) {
212 i--;
213 // TODO: improve by comparing multiple bytes at once
214 if (string.ptr[i] == chr) {
215 return cx_strsubs(string, i);
216 }
217 }
218 return (cxstring) {NULL, 0};
219 }
221 cxmutstr cx_strrchr_m(
222 cxmutstr string,
223 int chr
224 ) {
225 cxstring result = cx_strrchr(cx_strcast(string), chr);
226 return (cxmutstr) {(char *) result.ptr, result.length};
227 }
229 #ifndef CX_STRSTR_SBO_SIZE
230 #define CX_STRSTR_SBO_SIZE 512
231 #endif
233 cxstring cx_strstr(
234 cxstring haystack,
235 cxstring needle
236 ) {
237 if (needle.length == 0) {
238 return haystack;
239 }
241 // optimize for single-char needles
242 if (needle.length == 1) {
243 return cx_strchr(haystack, *needle.ptr);
244 }
246 /*
247 * IMPORTANT:
248 * Our prefix table contains the prefix length PLUS ONE
249 * this is our decision, because we want to use the full range of size_t.
250 * The original algorithm needs a (-1) at one single place,
251 * and we want to avoid that.
252 */
254 // local prefix table
255 size_t s_prefix_table[CX_STRSTR_SBO_SIZE];
257 // check needle length and use appropriate prefix table
258 // if the pattern exceeds static prefix table, allocate on the heap
259 bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;
260 register size_t *ptable = useheap ? calloc(needle.length + 1,
261 sizeof(size_t)) : s_prefix_table;
263 // keep counter in registers
264 register size_t i, j;
266 // fill prefix table
267 i = 0;
268 j = 0;
269 ptable[i] = j;
270 while (i < needle.length) {
271 while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
272 j = ptable[j - 1];
273 }
274 i++;
275 j++;
276 ptable[i] = j;
277 }
279 // search
280 cxstring result = {NULL, 0};
281 i = 0;
282 j = 1;
283 while (i < haystack.length) {
284 while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
285 j = ptable[j - 1];
286 }
287 i++;
288 j++;
289 if (j - 1 == needle.length) {
290 size_t start = i - needle.length;
291 result.ptr = haystack.ptr + start;
292 result.length = haystack.length - start;
293 break;
294 }
295 }
297 // if prefix table was allocated on the heap, free it
298 if (ptable != s_prefix_table) {
299 free(ptable);
300 }
302 return result;
303 }
305 cxmutstr cx_strstr_m(
306 cxmutstr haystack,
307 cxstring needle
308 ) {
309 cxstring result = cx_strstr(cx_strcast(haystack), needle);
310 return (cxmutstr) {(char *) result.ptr, result.length};
311 }
313 size_t cx_strsplit(
314 cxstring string,
315 cxstring delim,
316 size_t limit,
317 cxstring *output
318 ) {
319 // special case: output limit is zero
320 if (limit == 0) return 0;
322 // special case: delimiter is empty
323 if (delim.length == 0) {
324 output[0] = string;
325 return 1;
326 }
328 // special cases: delimiter is at least as large as the string
329 if (delim.length >= string.length) {
330 // exact match
331 if (cx_strcmp(string, delim) == 0) {
332 output[0] = cx_strn(string.ptr, 0);
333 output[1] = cx_strn(string.ptr + string.length, 0);
334 return 2;
335 } else {
336 // no match possible
337 output[0] = string;
338 return 1;
339 }
340 }
342 size_t n = 0;
343 cxstring curpos = string;
344 while (1) {
345 ++n;
346 cxstring match = cx_strstr(curpos, delim);
347 if (match.length > 0) {
348 // is the limit reached?
349 if (n < limit) {
350 // copy the current string to the array
351 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
352 output[n - 1] = item;
353 size_t processed = item.length + delim.length;
354 curpos.ptr += processed;
355 curpos.length -= processed;
356 } else {
357 // limit reached, copy the _full_ remaining string
358 output[n - 1] = curpos;
359 break;
360 }
361 } else {
362 // no more matches, copy last string
363 output[n - 1] = curpos;
364 break;
365 }
366 }
368 return n;
369 }
371 size_t cx_strsplit_a(
372 CxAllocator *allocator,
373 cxstring string,
374 cxstring delim,
375 size_t limit,
376 cxstring **output
377 ) {
378 // find out how many splits we're going to make and allocate memory
379 size_t n = 0;
380 cxstring curpos = string;
381 while (1) {
382 ++n;
383 cxstring match = cx_strstr(curpos, delim);
384 if (match.length > 0) {
385 // is the limit reached?
386 if (n < limit) {
387 size_t processed = match.ptr - curpos.ptr + delim.length;
388 curpos.ptr += processed;
389 curpos.length -= processed;
390 } else {
391 // limit reached
392 break;
393 }
394 } else {
395 // no more matches
396 break;
397 }
398 }
399 *output = cxCalloc(allocator, n, sizeof(cxstring));
400 return cx_strsplit(string, delim, n, *output);
401 }
403 size_t cx_strsplit_m(
404 cxmutstr string,
405 cxstring delim,
406 size_t limit,
407 cxmutstr *output
408 ) {
409 return cx_strsplit(cx_strcast(string),
410 delim, limit, (cxstring *) output);
411 }
413 size_t cx_strsplit_ma(
414 CxAllocator *allocator,
415 cxmutstr string,
416 cxstring delim,
417 size_t limit,
418 cxmutstr **output
419 ) {
420 return cx_strsplit_a(allocator, cx_strcast(string),
421 delim, limit, (cxstring **) output);
422 }
424 int cx_strcmp(
425 cxstring s1,
426 cxstring s2
427 ) {
428 if (s1.length == s2.length) {
429 return memcmp(s1.ptr, s2.ptr, s1.length);
430 } else if (s1.length > s2.length) {
431 return 1;
432 } else {
433 return -1;
434 }
435 }
437 int cx_strcasecmp(
438 cxstring s1,
439 cxstring s2
440 ) {
441 if (s1.length == s2.length) {
442 #ifdef _WIN32
443 return _strnicmp(s1.ptr, s2.ptr, s1.length);
444 #else
445 return strncasecmp(s1.ptr, s2.ptr, s1.length);
446 #endif
447 } else if (s1.length > s2.length) {
448 return 1;
449 } else {
450 return -1;
451 }
452 }
454 cxmutstr cx_strdup_a(
455 CxAllocator *allocator,
456 cxstring string
457 ) {
458 cxmutstr result = {
459 cxMalloc(allocator, string.length + 1),
460 string.length
461 };
462 if (result.ptr == NULL) {
463 result.length = 0;
464 return result;
465 }
466 memcpy(result.ptr, string.ptr, string.length);
467 result.ptr[string.length] = '\0';
468 return result;
469 }
471 cxstring cx_strtrim(cxstring string) {
472 cxstring result = string;
473 // TODO: optimize by comparing multiple bytes at once
474 while (result.length > 0 && isspace(*result.ptr)) {
475 result.ptr++;
476 result.length--;
477 }
478 while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
479 result.length--;
480 }
481 return result;
482 }
484 cxmutstr cx_strtrim_m(cxmutstr string) {
485 cxstring result = cx_strtrim(cx_strcast(string));
486 return (cxmutstr) {(char *) result.ptr, result.length};
487 }
489 bool cx_strprefix(
490 cxstring string,
491 cxstring prefix
492 ) {
493 if (string.length < prefix.length) return false;
494 return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
495 }
497 bool cx_strsuffix(
498 cxstring string,
499 cxstring suffix
500 ) {
501 if (string.length < suffix.length) return false;
502 return memcmp(string.ptr + string.length - suffix.length,
503 suffix.ptr, suffix.length) == 0;
504 }
506 bool cx_strcaseprefix(
507 cxstring string,
508 cxstring prefix
509 ) {
510 if (string.length < prefix.length) return false;
511 #ifdef _WIN32
512 return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
513 #else
514 return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
515 #endif
516 }
518 bool cx_strcasesuffix(
519 cxstring string,
520 cxstring suffix
521 ) {
522 if (string.length < suffix.length) return false;
523 #ifdef _WIN32
524 return _strnicmp(string.ptr+string.length-suffix.length,
525 suffix.ptr, suffix.length) == 0;
526 #else
527 return strncasecmp(string.ptr + string.length - suffix.length,
528 suffix.ptr, suffix.length) == 0;
529 #endif
530 }
532 void cx_strlower(cxmutstr string) {
533 cx_for_n(i, string.length) {
534 string.ptr[i] = (char) tolower(string.ptr[i]);
535 }
536 }
538 void cx_strupper(cxmutstr string) {
539 cx_for_n(i, string.length) {
540 string.ptr[i] = (char) toupper(string.ptr[i]);
541 }
542 }
544 #ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE
545 #define CX_STRREPLACE_INDEX_BUFFER_SIZE 64
546 #endif
548 struct cx_strreplace_ibuf {
549 size_t *buf;
550 struct cx_strreplace_ibuf *next;
551 unsigned int len;
552 };
554 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
555 while (buf) {
556 struct cx_strreplace_ibuf *next = buf->next;
557 free(buf->buf);
558 free(buf);
559 buf = next;
560 }
561 }
563 cxmutstr cx_strreplacen_a(
564 CxAllocator *allocator,
565 cxstring str,
566 cxstring pattern,
567 cxstring replacement,
568 size_t replmax
569 ) {
571 if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
572 return cx_strdup_a(allocator, str);
574 // Compute expected buffer length
575 size_t ibufmax = str.length / pattern.length;
576 size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
577 if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) {
578 ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE;
579 }
581 // Allocate first index buffer
582 struct cx_strreplace_ibuf *firstbuf, *curbuf;
583 firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
584 if (!firstbuf) return cx_mutstrn(NULL, 0);
585 firstbuf->buf = calloc(ibuflen, sizeof(size_t));
586 if (!firstbuf->buf) {
587 free(firstbuf);
588 return cx_mutstrn(NULL, 0);
589 }
591 // Search occurrences
592 cxstring searchstr = str;
593 size_t found = 0;
594 do {
595 cxstring match = cx_strstr(searchstr, pattern);
596 if (match.length > 0) {
597 // Allocate next buffer in chain, if required
598 if (curbuf->len == ibuflen) {
599 struct cx_strreplace_ibuf *nextbuf =
600 calloc(1, sizeof(struct cx_strreplace_ibuf));
601 if (!nextbuf) {
602 cx_strrepl_free_ibuf(firstbuf);
603 return cx_mutstrn(NULL, 0);
604 }
605 nextbuf->buf = calloc(ibuflen, sizeof(size_t));
606 if (!nextbuf->buf) {
607 free(nextbuf);
608 cx_strrepl_free_ibuf(firstbuf);
609 return cx_mutstrn(NULL, 0);
610 }
611 curbuf->next = nextbuf;
612 curbuf = nextbuf;
613 }
615 // Record match index
616 found++;
617 size_t idx = match.ptr - str.ptr;
618 curbuf->buf[curbuf->len++] = idx;
619 searchstr.ptr = match.ptr + pattern.length;
620 searchstr.length = str.length - idx - pattern.length;
621 } else {
622 break;
623 }
624 } while (searchstr.length > 0 && found < replmax);
626 // Allocate result string
627 cxmutstr result;
628 {
629 ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
630 size_t rcount = 0;
631 curbuf = firstbuf;
632 do {
633 rcount += curbuf->len;
634 curbuf = curbuf->next;
635 } while (curbuf);
636 result.length = str.length + rcount * adjlen;
637 result.ptr = cxMalloc(allocator, result.length + 1);
638 if (!result.ptr) {
639 cx_strrepl_free_ibuf(firstbuf);
640 return cx_mutstrn(NULL, 0);
641 }
642 }
644 // Build result string
645 curbuf = firstbuf;
646 size_t srcidx = 0;
647 char *destptr = result.ptr;
648 do {
649 for (size_t i = 0; i < curbuf->len; i++) {
650 // Copy source part up to next match
651 size_t idx = curbuf->buf[i];
652 size_t srclen = idx - srcidx;
653 if (srclen > 0) {
654 memcpy(destptr, str.ptr + srcidx, srclen);
655 destptr += srclen;
656 srcidx += srclen;
657 }
659 // Copy the replacement and skip the source pattern
660 srcidx += pattern.length;
661 memcpy(destptr, replacement.ptr, replacement.length);
662 destptr += replacement.length;
663 }
664 curbuf = curbuf->next;
665 } while (curbuf);
666 memcpy(destptr, str.ptr + srcidx, str.length - srcidx);
668 // Result is guaranteed to be zero-terminated
669 result.ptr[result.length] = '\0';
671 // Free index buffer
672 cx_strrepl_free_ibuf(firstbuf);
674 return result;
675 }
677 CxStrtokCtx cx_strtok(
678 cxstring str,
679 cxstring delim,
680 size_t limit
681 ) {
682 CxStrtokCtx ctx;
683 ctx.str = str;
684 ctx.delim = delim;
685 ctx.limit = limit;
686 ctx.pos = 0;
687 ctx.next_pos = 0;
688 ctx.delim_pos = 0;
689 ctx.found = 0;
690 ctx.delim_more = NULL;
691 ctx.delim_more_count = 0;
692 return ctx;
693 }
695 CxStrtokCtx cx_strtok_m(
696 cxmutstr str,
697 cxstring delim,
698 size_t limit
699 ) {
700 return cx_strtok(cx_strcast(str), delim, limit);
701 }
703 bool cx_strtok_next(
704 CxStrtokCtx *ctx,
705 cxstring *token
706 ) {
707 // abortion criteria
708 if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) {
709 return false;
710 }
712 // determine the search start
713 cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos);
715 // search the next delimiter
716 cxstring delim = cx_strstr(haystack, ctx->delim);
718 // if found, make delim capture exactly the delimiter
719 if (delim.length > 0) {
720 delim.length = ctx->delim.length;
721 }
723 // if more delimiters are specified, check them now
724 if (ctx->delim_more_count > 0) {
725 cx_for_n(i, ctx->delim_more_count) {
726 cxstring d = cx_strstr(haystack, ctx->delim_more[i]);
727 if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) {
728 delim.ptr = d.ptr;
729 delim.length = ctx->delim_more[i].length;
730 }
731 }
732 }
734 // store the token information and adjust the context
735 ctx->found++;
736 ctx->pos = ctx->next_pos;
737 token->ptr = &ctx->str.ptr[ctx->pos];
738 ctx->delim_pos = delim.length == 0 ?
739 ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr);
740 token->length = ctx->delim_pos - ctx->pos;
741 ctx->next_pos = ctx->delim_pos + delim.length;
743 return true;
744 }
746 bool cx_strtok_next_m(
747 CxStrtokCtx *ctx,
748 cxmutstr *token
749 ) {
750 return cx_strtok_next(ctx, (cxstring *) token);
751 }
753 void cx_strtok_delim(
754 CxStrtokCtx *ctx,
755 cxstring const *delim,
756 size_t count
757 ) {
758 ctx->delim_more = delim;
759 ctx->delim_more_count = count;
760 }