Tue, 04 Oct 2022 19:25:07 +0200
fix over-optimization of strstr
1. it's actually less performant to frequently read bytes
from an array instead of using the native word length
2. the SBO buffer should be local and not static to allow
multi-threading usage
1 /*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
3 *
4 * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
29 #include "cx/string.h"
30 #include "cx/utils.h"
32 #include <string.h>
33 #include <stdarg.h>
34 #include <stdint.h>
35 #include <ctype.h>
37 #ifndef _WIN32
39 #include <strings.h> /* for strncasecmp() */
41 #endif /* _WIN32 */
43 cxmutstr cx_mutstr(char *cstring) {
44 return (cxmutstr) {cstring, strlen(cstring)};
45 }
47 cxmutstr cx_mutstrn(
48 char *cstring,
49 size_t length
50 ) {
51 return (cxmutstr) {cstring, length};
52 }
54 cxstring cx_str(const char *cstring) {
55 return (cxstring) {cstring, strlen(cstring)};
56 }
58 cxstring cx_strn(
59 const char *cstring,
60 size_t length
61 ) {
62 return (cxstring) {cstring, length};
63 }
65 cxstring cx_strcast(cxmutstr str) {
66 return (cxstring) {str.ptr, str.length};
67 }
69 void cx_strfree(cxmutstr *str) {
70 free(str->ptr);
71 str->ptr = NULL;
72 str->length = 0;
73 }
75 void cx_strfree_a(
76 CxAllocator *alloc,
77 cxmutstr *str
78 ) {
79 cxFree(alloc, str->ptr);
80 str->ptr = NULL;
81 str->length = 0;
82 }
84 size_t cx_strlen(
85 size_t count,
86 ...
87 ) {
88 if (count == 0) return 0;
90 va_list ap;
91 va_start(ap, count);
92 size_t size = 0;
93 cx_for_n(i, count) {
94 cxstring str = va_arg(ap, cxstring);
95 size += str.length;
96 }
97 va_end(ap);
99 return size;
100 }
102 cxmutstr cx_strcat_a(
103 CxAllocator *alloc,
104 size_t count,
105 ...
106 ) {
107 cxstring *strings = calloc(count, sizeof(cxstring));
108 if (!strings) abort();
110 va_list ap;
111 va_start(ap, count);
113 // get all args and overall length
114 size_t slen = 0;
115 cx_for_n(i, count) {
116 cxstring s = va_arg (ap, cxstring);
117 strings[i] = s;
118 slen += s.length;
119 }
121 // create new string
122 cxmutstr result;
123 result.ptr = cxMalloc(alloc, slen + 1);
124 result.length = slen;
125 if (result.ptr == NULL) abort();
127 // concatenate strings
128 size_t pos = 0;
129 cx_for_n(i, count) {
130 cxstring s = strings[i];
131 memcpy(result.ptr + pos, s.ptr, s.length);
132 pos += s.length;
133 }
135 // terminate string
136 result.ptr[result.length] = '\0';
138 // free temporary array
139 free(strings);
141 return result;
142 }
144 cxstring cx_strsubs(
145 cxstring string,
146 size_t start
147 ) {
148 return cx_strsubsl(string, start, string.length - start);
149 }
151 cxmutstr cx_strsubs_m(
152 cxmutstr string,
153 size_t start
154 ) {
155 return cx_strsubsl_m(string, start, string.length - start);
156 }
158 cxstring cx_strsubsl(
159 cxstring string,
160 size_t start,
161 size_t length
162 ) {
163 if (start > string.length) {
164 return (cxstring) {NULL, 0};
165 }
167 size_t rem_len = string.length - start;
168 if (length > rem_len) {
169 length = rem_len;
170 }
172 return (cxstring) {string.ptr + start, length};
173 }
175 cxmutstr cx_strsubsl_m(
176 cxmutstr string,
177 size_t start,
178 size_t length
179 ) {
180 cxstring result = cx_strsubsl(cx_strcast(string), start, length);
181 return (cxmutstr) {(char *) result.ptr, result.length};
182 }
184 cxstring cx_strchr(
185 cxstring string,
186 int chr
187 ) {
188 chr = 0xFF & chr;
189 // TODO: improve by comparing multiple bytes at once
190 cx_for_n(i, string.length) {
191 if (string.ptr[i] == chr) {
192 return cx_strsubs(string, i);
193 }
194 }
195 return (cxstring) {NULL, 0};
196 }
198 cxmutstr cx_strchr_m(
199 cxmutstr string,
200 int chr
201 ) {
202 cxstring result = cx_strchr(cx_strcast(string), chr);
203 return (cxmutstr) {(char *) result.ptr, result.length};
204 }
206 cxstring cx_strrchr(
207 cxstring string,
208 int chr
209 ) {
210 chr = 0xFF & chr;
211 size_t i = string.length;
212 while (i > 0) {
213 i--;
214 // TODO: improve by comparing multiple bytes at once
215 if (string.ptr[i] == chr) {
216 return cx_strsubs(string, i);
217 }
218 }
219 return (cxstring) {NULL, 0};
220 }
222 cxmutstr cx_strrchr_m(
223 cxmutstr string,
224 int chr
225 ) {
226 cxstring result = cx_strrchr(cx_strcast(string), chr);
227 return (cxmutstr) {(char *) result.ptr, result.length};
228 }
230 #define STRSTR_SBO_BUFLEN 512
232 cxstring cx_strstr(
233 cxstring haystack,
234 cxstring needle
235 ) {
236 if (needle.length == 0) {
237 return haystack;
238 }
240 /* optimize for single-char needles */
241 if (needle.length == 1) {
242 return cx_strchr(haystack, *needle.ptr);
243 }
245 /*
246 * IMPORTANT:
247 * Our prefix table contains the prefix length PLUS ONE
248 * this is our decision, because we want to use the full range of size_t.
249 * The original algorithm needs a (-1) at one single place,
250 * and we want to avoid that.
251 */
253 /* local prefix table */
254 size_t s_prefix_table[STRSTR_SBO_BUFLEN];
256 /* check needle length and use appropriate prefix table */
257 /* if the pattern exceeds static prefix table, allocate on the heap */
258 bool useheap = needle.length >= STRSTR_SBO_BUFLEN;
259 register size_t *ptable = useheap ? calloc(needle.length + 1,
260 sizeof(size_t)) : s_prefix_table;
262 /* keep counter in registers */
263 register size_t i, j;
265 /* fill prefix table */
266 i = 0;
267 j = 0;
268 ptable[i] = j;
269 while (i < needle.length) {
270 while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
271 j = ptable[j - 1];
272 }
273 i++;
274 j++;
275 ptable[i] = j;
276 }
278 /* search */
279 cxstring result = {NULL, 0};
280 i = 0;
281 j = 1;
282 while (i < haystack.length) {
283 while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
284 j = ptable[j - 1];
285 }
286 i++;
287 j++;
288 if (j - 1 == needle.length) {
289 size_t start = i - needle.length;
290 result.ptr = haystack.ptr + start;
291 result.length = haystack.length - start;
292 break;
293 }
294 }
296 /* if prefix table was allocated on the heap, free it */
297 if (ptable != s_prefix_table) {
298 free(ptable);
299 }
301 return result;
302 }
304 cxmutstr cx_strstr_m(
305 cxmutstr haystack,
306 cxstring needle
307 ) {
308 cxstring result = cx_strstr(cx_strcast(haystack), needle);
309 return (cxmutstr) {(char *) result.ptr, result.length};
310 }
312 size_t cx_strsplit(
313 cxstring string,
314 cxstring delim,
315 size_t limit,
316 cxstring *output
317 ) {
318 /* special case: output limit is zero */
319 if (limit == 0) return 0;
321 /* special case: delimiter is empty */
322 if (delim.length == 0) {
323 output[0] = string;
324 return 1;
325 }
327 /* special cases: delimiter is at least as large as the string */
328 if (delim.length >= string.length) {
329 /* exact match */
330 if (cx_strcmp(string, delim) == 0) {
331 output[0] = cx_strn(string.ptr, 0);
332 output[1] = cx_strn(string.ptr + string.length, 0);
333 return 2;
334 } else /* no match possible */ {
335 output[0] = string;
336 return 1;
337 }
338 }
340 size_t n = 0;
341 cxstring curpos = string;
342 while (1) {
343 ++n;
344 cxstring match = cx_strstr(curpos, delim);
345 if (match.length > 0) {
346 /* is the limit reached? */
347 if (n < limit) {
348 /* copy the current string to the array */
349 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
350 output[n - 1] = item;
351 size_t processed = item.length + delim.length;
352 curpos.ptr += processed;
353 curpos.length -= processed;
354 } else {
355 /* limit reached, copy the _full_ remaining string */
356 output[n - 1] = curpos;
357 break;
358 }
359 } else {
360 /* no more matches, copy last string */
361 output[n - 1] = curpos;
362 break;
363 }
364 }
366 return n;
367 }
369 size_t cx_strsplit_a(
370 CxAllocator *allocator,
371 cxstring string,
372 cxstring delim,
373 size_t limit,
374 cxstring **output
375 ) {
376 /* find out how many splits we're going to make and allocate memory */
377 size_t n = 0;
378 cxstring curpos = string;
379 while (1) {
380 ++n;
381 cxstring match = cx_strstr(curpos, delim);
382 if (match.length > 0) {
383 /* is the limit reached? */
384 if (n < limit) {
385 size_t processed = match.ptr - curpos.ptr + delim.length;
386 curpos.ptr += processed;
387 curpos.length -= processed;
388 } else {
389 /* limit reached */
390 break;
391 }
392 } else {
393 /* no more matches */
394 break;
395 }
396 }
397 *output = cxCalloc(allocator, n, sizeof(cxstring));
398 return cx_strsplit(string, delim, n, *output);
399 }
401 size_t cx_strsplit_m(
402 cxmutstr string,
403 cxstring delim,
404 size_t limit,
405 cxmutstr *output
406 ) {
407 return cx_strsplit(cx_strcast(string),
408 delim, limit, (cxstring *) output);
409 }
411 size_t cx_strsplit_ma(
412 CxAllocator *allocator,
413 cxmutstr string,
414 cxstring delim,
415 size_t limit,
416 cxmutstr **output
417 ) {
418 return cx_strsplit_a(allocator, cx_strcast(string),
419 delim, limit, (cxstring **) output);
420 }
422 int cx_strcmp(
423 cxstring s1,
424 cxstring s2
425 ) {
426 if (s1.length == s2.length) {
427 return memcmp(s1.ptr, s2.ptr, s1.length);
428 } else if (s1.length > s2.length) {
429 return 1;
430 } else {
431 return -1;
432 }
433 }
435 int cx_strcasecmp(
436 cxstring s1,
437 cxstring s2
438 ) {
439 if (s1.length == s2.length) {
440 #ifdef _WIN32
441 return _strnicmp(s1.ptr, s2.ptr, s1.length);
442 #else
443 return strncasecmp(s1.ptr, s2.ptr, s1.length);
444 #endif
445 } else if (s1.length > s2.length) {
446 return 1;
447 } else {
448 return -1;
449 }
450 }
452 cxmutstr cx_strdup_a(
453 CxAllocator *allocator,
454 cxstring string
455 ) {
456 cxmutstr result = {
457 cxMalloc(allocator, string.length + 1),
458 string.length
459 };
460 if (result.ptr == NULL) {
461 result.length = 0;
462 return result;
463 }
464 memcpy(result.ptr, string.ptr, string.length);
465 result.ptr[string.length] = '\0';
466 return result;
467 }
469 cxstring cx_strtrim(cxstring string) {
470 cxstring result = string;
471 // TODO: optimize by comparing multiple bytes at once
472 while (result.length > 0 && isspace(*result.ptr)) {
473 result.ptr++;
474 result.length--;
475 }
476 while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
477 result.length--;
478 }
479 return result;
480 }
482 cxmutstr cx_strtrim_m(cxmutstr string) {
483 cxstring result = cx_strtrim(cx_strcast(string));
484 return (cxmutstr) {(char *) result.ptr, result.length};
485 }
487 bool cx_strprefix(
488 cxstring string,
489 cxstring prefix
490 ) {
491 if (string.length < prefix.length) return false;
492 return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
493 }
495 bool cx_strsuffix(
496 cxstring string,
497 cxstring suffix
498 ) {
499 if (string.length < suffix.length) return false;
500 return memcmp(string.ptr + string.length - suffix.length,
501 suffix.ptr, suffix.length) == 0;
502 }
504 bool cx_strcaseprefix(
505 cxstring string,
506 cxstring prefix
507 ) {
508 if (string.length < prefix.length) return false;
509 #ifdef _WIN32
510 return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
511 #else
512 return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
513 #endif
514 }
516 bool cx_strcasesuffix(
517 cxstring string,
518 cxstring suffix
519 ) {
520 if (string.length < suffix.length) return false;
521 #ifdef _WIN32
522 return _strnicmp(string.ptr+string.length-suffix.length,
523 suffix.ptr, suffix.length) == 0;
524 #else
525 return strncasecmp(string.ptr + string.length - suffix.length,
526 suffix.ptr, suffix.length) == 0;
527 #endif
528 }
530 void cx_strlower(cxmutstr string) {
531 cx_for_n(i, string.length) {
532 string.ptr[i] = tolower(string.ptr[i]);
533 }
534 }
536 void cx_strupper(cxmutstr string) {
537 cx_for_n(i, string.length) {
538 string.ptr[i] = toupper(string.ptr[i]);
539 }
540 }
542 #define REPLACE_INDEX_BUFFER_MAX 100
544 struct cx_strreplace_ibuf {
545 size_t *buf;
546 struct cx_strreplace_ibuf *next;
547 unsigned int len;
548 };
550 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
551 while (buf) {
552 struct cx_strreplace_ibuf *next = buf->next;
553 free(buf->buf);
554 free(buf);
555 buf = next;
556 }
557 }
559 cxmutstr cx_strreplacen_a(
560 CxAllocator *allocator,
561 cxstring str,
562 cxstring pattern,
563 cxstring replacement,
564 size_t replmax
565 ) {
567 if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
568 return cx_strdup_a(allocator, str);
570 /* Compute expected buffer length */
571 size_t ibufmax = str.length / pattern.length;
572 size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
573 if (ibuflen > REPLACE_INDEX_BUFFER_MAX) {
574 ibuflen = REPLACE_INDEX_BUFFER_MAX;
575 }
577 /* Allocate first index buffer */
578 struct cx_strreplace_ibuf *firstbuf, *curbuf;
579 firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
580 if (!firstbuf) return cx_mutstrn(NULL, 0);
581 firstbuf->buf = calloc(ibuflen, sizeof(size_t));
582 if (!firstbuf->buf) {
583 free(firstbuf);
584 return cx_mutstrn(NULL, 0);
585 }
587 /* Search occurrences */
588 cxstring searchstr = str;
589 size_t found = 0;
590 do {
591 cxstring match = cx_strstr(searchstr, pattern);
592 if (match.length > 0) {
593 /* Allocate next buffer in chain, if required */
594 if (curbuf->len == ibuflen) {
595 struct cx_strreplace_ibuf *nextbuf =
596 calloc(1, sizeof(struct cx_strreplace_ibuf));
597 if (!nextbuf) {
598 cx_strrepl_free_ibuf(firstbuf);
599 return cx_mutstrn(NULL, 0);
600 }
601 nextbuf->buf = calloc(ibuflen, sizeof(size_t));
602 if (!nextbuf->buf) {
603 free(nextbuf);
604 cx_strrepl_free_ibuf(firstbuf);
605 return cx_mutstrn(NULL, 0);
606 }
607 curbuf->next = nextbuf;
608 curbuf = nextbuf;
609 }
611 /* Record match index */
612 found++;
613 size_t idx = match.ptr - str.ptr;
614 curbuf->buf[curbuf->len++] = idx;
615 searchstr.ptr = match.ptr + pattern.length;
616 searchstr.length = str.length - idx - pattern.length;
617 } else {
618 break;
619 }
620 } while (searchstr.length > 0 && found < replmax);
622 /* Allocate result string */
623 cxmutstr result;
624 {
625 ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
626 size_t rcount = 0;
627 curbuf = firstbuf;
628 do {
629 rcount += curbuf->len;
630 curbuf = curbuf->next;
631 } while (curbuf);
632 result.length = str.length + rcount * adjlen;
633 result.ptr = cxMalloc(allocator, result.length + 1);
634 if (!result.ptr) {
635 cx_strrepl_free_ibuf(firstbuf);
636 return cx_mutstrn(NULL, 0);
637 }
638 }
640 /* Build result string */
641 curbuf = firstbuf;
642 size_t srcidx = 0;
643 char *destptr = result.ptr;
644 do {
645 for (size_t i = 0; i < curbuf->len; i++) {
646 /* Copy source part up to next match*/
647 size_t idx = curbuf->buf[i];
648 size_t srclen = idx - srcidx;
649 if (srclen > 0) {
650 memcpy(destptr, str.ptr + srcidx, srclen);
651 destptr += srclen;
652 srcidx += srclen;
653 }
655 /* Copy the replacement and skip the source pattern */
656 srcidx += pattern.length;
657 memcpy(destptr, replacement.ptr, replacement.length);
658 destptr += replacement.length;
659 }
660 curbuf = curbuf->next;
661 } while (curbuf);
662 memcpy(destptr, str.ptr + srcidx, str.length - srcidx);
664 /* Result is guaranteed to be zero-terminated */
665 result.ptr[result.length] = '\0';
667 /* Free index buffer */
668 cx_strrepl_free_ibuf(firstbuf);
670 return result;
671 }