Sun, 13 Nov 2022 13:22:03 +0100
proposal for a low level array copy
1 /*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
3 *
4 * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
29 #include "cx/string.h"
30 #include "cx/utils.h"
32 #include <string.h>
33 #include <stdarg.h>
34 #include <ctype.h>
36 #ifndef _WIN32
38 #include <strings.h> /* for strncasecmp() */
40 #endif /* _WIN32 */
42 cxmutstr cx_mutstr(char *cstring) {
43 return (cxmutstr) {cstring, strlen(cstring)};
44 }
46 cxmutstr cx_mutstrn(
47 char *cstring,
48 size_t length
49 ) {
50 return (cxmutstr) {cstring, length};
51 }
53 cxstring cx_str(const char *cstring) {
54 return (cxstring) {cstring, strlen(cstring)};
55 }
57 cxstring cx_strn(
58 const char *cstring,
59 size_t length
60 ) {
61 return (cxstring) {cstring, length};
62 }
64 cxstring cx_strcast(cxmutstr str) {
65 return (cxstring) {str.ptr, str.length};
66 }
68 void cx_strfree(cxmutstr *str) {
69 free(str->ptr);
70 str->ptr = NULL;
71 str->length = 0;
72 }
74 void cx_strfree_a(
75 CxAllocator *alloc,
76 cxmutstr *str
77 ) {
78 cxFree(alloc, str->ptr);
79 str->ptr = NULL;
80 str->length = 0;
81 }
83 size_t cx_strlen(
84 size_t count,
85 ...
86 ) {
87 if (count == 0) return 0;
89 va_list ap;
90 va_start(ap, count);
91 size_t size = 0;
92 cx_for_n(i, count) {
93 cxstring str = va_arg(ap, cxstring);
94 size += str.length;
95 }
96 va_end(ap);
98 return size;
99 }
101 cxmutstr cx_strcat_a(
102 CxAllocator *alloc,
103 size_t count,
104 ...
105 ) {
106 cxstring *strings = calloc(count, sizeof(cxstring));
107 if (!strings) abort();
109 va_list ap;
110 va_start(ap, count);
112 // get all args and overall length
113 size_t slen = 0;
114 cx_for_n(i, count) {
115 cxstring s = va_arg (ap, cxstring);
116 strings[i] = s;
117 slen += s.length;
118 }
120 // create new string
121 cxmutstr result;
122 result.ptr = cxMalloc(alloc, slen + 1);
123 result.length = slen;
124 if (result.ptr == NULL) abort();
126 // concatenate strings
127 size_t pos = 0;
128 cx_for_n(i, count) {
129 cxstring s = strings[i];
130 memcpy(result.ptr + pos, s.ptr, s.length);
131 pos += s.length;
132 }
134 // terminate string
135 result.ptr[result.length] = '\0';
137 // free temporary array
138 free(strings);
140 return result;
141 }
143 cxstring cx_strsubs(
144 cxstring string,
145 size_t start
146 ) {
147 return cx_strsubsl(string, start, string.length - start);
148 }
150 cxmutstr cx_strsubs_m(
151 cxmutstr string,
152 size_t start
153 ) {
154 return cx_strsubsl_m(string, start, string.length - start);
155 }
157 cxstring cx_strsubsl(
158 cxstring string,
159 size_t start,
160 size_t length
161 ) {
162 if (start > string.length) {
163 return (cxstring) {NULL, 0};
164 }
166 size_t rem_len = string.length - start;
167 if (length > rem_len) {
168 length = rem_len;
169 }
171 return (cxstring) {string.ptr + start, length};
172 }
174 cxmutstr cx_strsubsl_m(
175 cxmutstr string,
176 size_t start,
177 size_t length
178 ) {
179 cxstring result = cx_strsubsl(cx_strcast(string), start, length);
180 return (cxmutstr) {(char *) result.ptr, result.length};
181 }
183 cxstring cx_strchr(
184 cxstring string,
185 int chr
186 ) {
187 chr = 0xFF & chr;
188 // TODO: improve by comparing multiple bytes at once
189 cx_for_n(i, string.length) {
190 if (string.ptr[i] == chr) {
191 return cx_strsubs(string, i);
192 }
193 }
194 return (cxstring) {NULL, 0};
195 }
197 cxmutstr cx_strchr_m(
198 cxmutstr string,
199 int chr
200 ) {
201 cxstring result = cx_strchr(cx_strcast(string), chr);
202 return (cxmutstr) {(char *) result.ptr, result.length};
203 }
205 cxstring cx_strrchr(
206 cxstring string,
207 int chr
208 ) {
209 chr = 0xFF & chr;
210 size_t i = string.length;
211 while (i > 0) {
212 i--;
213 // TODO: improve by comparing multiple bytes at once
214 if (string.ptr[i] == chr) {
215 return cx_strsubs(string, i);
216 }
217 }
218 return (cxstring) {NULL, 0};
219 }
221 cxmutstr cx_strrchr_m(
222 cxmutstr string,
223 int chr
224 ) {
225 cxstring result = cx_strrchr(cx_strcast(string), chr);
226 return (cxmutstr) {(char *) result.ptr, result.length};
227 }
229 #define STRSTR_SBO_BUFLEN 512
231 cxstring cx_strstr(
232 cxstring haystack,
233 cxstring needle
234 ) {
235 if (needle.length == 0) {
236 return haystack;
237 }
239 /* optimize for single-char needles */
240 if (needle.length == 1) {
241 return cx_strchr(haystack, *needle.ptr);
242 }
244 /*
245 * IMPORTANT:
246 * Our prefix table contains the prefix length PLUS ONE
247 * this is our decision, because we want to use the full range of size_t.
248 * The original algorithm needs a (-1) at one single place,
249 * and we want to avoid that.
250 */
252 /* local prefix table */
253 size_t s_prefix_table[STRSTR_SBO_BUFLEN];
255 /* check needle length and use appropriate prefix table */
256 /* if the pattern exceeds static prefix table, allocate on the heap */
257 bool useheap = needle.length >= STRSTR_SBO_BUFLEN;
258 register size_t *ptable = useheap ? calloc(needle.length + 1,
259 sizeof(size_t)) : s_prefix_table;
261 /* keep counter in registers */
262 register size_t i, j;
264 /* fill prefix table */
265 i = 0;
266 j = 0;
267 ptable[i] = j;
268 while (i < needle.length) {
269 while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
270 j = ptable[j - 1];
271 }
272 i++;
273 j++;
274 ptable[i] = j;
275 }
277 /* search */
278 cxstring result = {NULL, 0};
279 i = 0;
280 j = 1;
281 while (i < haystack.length) {
282 while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
283 j = ptable[j - 1];
284 }
285 i++;
286 j++;
287 if (j - 1 == needle.length) {
288 size_t start = i - needle.length;
289 result.ptr = haystack.ptr + start;
290 result.length = haystack.length - start;
291 break;
292 }
293 }
295 /* if prefix table was allocated on the heap, free it */
296 if (ptable != s_prefix_table) {
297 free(ptable);
298 }
300 return result;
301 }
303 cxmutstr cx_strstr_m(
304 cxmutstr haystack,
305 cxstring needle
306 ) {
307 cxstring result = cx_strstr(cx_strcast(haystack), needle);
308 return (cxmutstr) {(char *) result.ptr, result.length};
309 }
311 size_t cx_strsplit(
312 cxstring string,
313 cxstring delim,
314 size_t limit,
315 cxstring *output
316 ) {
317 /* special case: output limit is zero */
318 if (limit == 0) return 0;
320 /* special case: delimiter is empty */
321 if (delim.length == 0) {
322 output[0] = string;
323 return 1;
324 }
326 /* special cases: delimiter is at least as large as the string */
327 if (delim.length >= string.length) {
328 /* exact match */
329 if (cx_strcmp(string, delim) == 0) {
330 output[0] = cx_strn(string.ptr, 0);
331 output[1] = cx_strn(string.ptr + string.length, 0);
332 return 2;
333 } else /* no match possible */ {
334 output[0] = string;
335 return 1;
336 }
337 }
339 size_t n = 0;
340 cxstring curpos = string;
341 while (1) {
342 ++n;
343 cxstring match = cx_strstr(curpos, delim);
344 if (match.length > 0) {
345 /* is the limit reached? */
346 if (n < limit) {
347 /* copy the current string to the array */
348 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
349 output[n - 1] = item;
350 size_t processed = item.length + delim.length;
351 curpos.ptr += processed;
352 curpos.length -= processed;
353 } else {
354 /* limit reached, copy the _full_ remaining string */
355 output[n - 1] = curpos;
356 break;
357 }
358 } else {
359 /* no more matches, copy last string */
360 output[n - 1] = curpos;
361 break;
362 }
363 }
365 return n;
366 }
368 size_t cx_strsplit_a(
369 CxAllocator *allocator,
370 cxstring string,
371 cxstring delim,
372 size_t limit,
373 cxstring **output
374 ) {
375 /* find out how many splits we're going to make and allocate memory */
376 size_t n = 0;
377 cxstring curpos = string;
378 while (1) {
379 ++n;
380 cxstring match = cx_strstr(curpos, delim);
381 if (match.length > 0) {
382 /* is the limit reached? */
383 if (n < limit) {
384 size_t processed = match.ptr - curpos.ptr + delim.length;
385 curpos.ptr += processed;
386 curpos.length -= processed;
387 } else {
388 /* limit reached */
389 break;
390 }
391 } else {
392 /* no more matches */
393 break;
394 }
395 }
396 *output = cxCalloc(allocator, n, sizeof(cxstring));
397 return cx_strsplit(string, delim, n, *output);
398 }
400 size_t cx_strsplit_m(
401 cxmutstr string,
402 cxstring delim,
403 size_t limit,
404 cxmutstr *output
405 ) {
406 return cx_strsplit(cx_strcast(string),
407 delim, limit, (cxstring *) output);
408 }
410 size_t cx_strsplit_ma(
411 CxAllocator *allocator,
412 cxmutstr string,
413 cxstring delim,
414 size_t limit,
415 cxmutstr **output
416 ) {
417 return cx_strsplit_a(allocator, cx_strcast(string),
418 delim, limit, (cxstring **) output);
419 }
421 int cx_strcmp(
422 cxstring s1,
423 cxstring s2
424 ) {
425 if (s1.length == s2.length) {
426 return memcmp(s1.ptr, s2.ptr, s1.length);
427 } else if (s1.length > s2.length) {
428 return 1;
429 } else {
430 return -1;
431 }
432 }
434 int cx_strcasecmp(
435 cxstring s1,
436 cxstring s2
437 ) {
438 if (s1.length == s2.length) {
439 #ifdef _WIN32
440 return _strnicmp(s1.ptr, s2.ptr, s1.length);
441 #else
442 return strncasecmp(s1.ptr, s2.ptr, s1.length);
443 #endif
444 } else if (s1.length > s2.length) {
445 return 1;
446 } else {
447 return -1;
448 }
449 }
451 cxmutstr cx_strdup_a(
452 CxAllocator *allocator,
453 cxstring string
454 ) {
455 cxmutstr result = {
456 cxMalloc(allocator, string.length + 1),
457 string.length
458 };
459 if (result.ptr == NULL) {
460 result.length = 0;
461 return result;
462 }
463 memcpy(result.ptr, string.ptr, string.length);
464 result.ptr[string.length] = '\0';
465 return result;
466 }
468 cxstring cx_strtrim(cxstring string) {
469 cxstring result = string;
470 // TODO: optimize by comparing multiple bytes at once
471 while (result.length > 0 && isspace(*result.ptr)) {
472 result.ptr++;
473 result.length--;
474 }
475 while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
476 result.length--;
477 }
478 return result;
479 }
481 cxmutstr cx_strtrim_m(cxmutstr string) {
482 cxstring result = cx_strtrim(cx_strcast(string));
483 return (cxmutstr) {(char *) result.ptr, result.length};
484 }
486 bool cx_strprefix(
487 cxstring string,
488 cxstring prefix
489 ) {
490 if (string.length < prefix.length) return false;
491 return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
492 }
494 bool cx_strsuffix(
495 cxstring string,
496 cxstring suffix
497 ) {
498 if (string.length < suffix.length) return false;
499 return memcmp(string.ptr + string.length - suffix.length,
500 suffix.ptr, suffix.length) == 0;
501 }
503 bool cx_strcaseprefix(
504 cxstring string,
505 cxstring prefix
506 ) {
507 if (string.length < prefix.length) return false;
508 #ifdef _WIN32
509 return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
510 #else
511 return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
512 #endif
513 }
515 bool cx_strcasesuffix(
516 cxstring string,
517 cxstring suffix
518 ) {
519 if (string.length < suffix.length) return false;
520 #ifdef _WIN32
521 return _strnicmp(string.ptr+string.length-suffix.length,
522 suffix.ptr, suffix.length) == 0;
523 #else
524 return strncasecmp(string.ptr + string.length - suffix.length,
525 suffix.ptr, suffix.length) == 0;
526 #endif
527 }
529 void cx_strlower(cxmutstr string) {
530 cx_for_n(i, string.length) {
531 string.ptr[i] = (char) tolower(string.ptr[i]);
532 }
533 }
535 void cx_strupper(cxmutstr string) {
536 cx_for_n(i, string.length) {
537 string.ptr[i] = (char) toupper(string.ptr[i]);
538 }
539 }
541 #define REPLACE_INDEX_BUFFER_MAX 100
543 struct cx_strreplace_ibuf {
544 size_t *buf;
545 struct cx_strreplace_ibuf *next;
546 unsigned int len;
547 };
549 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
550 while (buf) {
551 struct cx_strreplace_ibuf *next = buf->next;
552 free(buf->buf);
553 free(buf);
554 buf = next;
555 }
556 }
558 cxmutstr cx_strreplacen_a(
559 CxAllocator *allocator,
560 cxstring str,
561 cxstring pattern,
562 cxstring replacement,
563 size_t replmax
564 ) {
566 if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
567 return cx_strdup_a(allocator, str);
569 /* Compute expected buffer length */
570 size_t ibufmax = str.length / pattern.length;
571 size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
572 if (ibuflen > REPLACE_INDEX_BUFFER_MAX) {
573 ibuflen = REPLACE_INDEX_BUFFER_MAX;
574 }
576 /* Allocate first index buffer */
577 struct cx_strreplace_ibuf *firstbuf, *curbuf;
578 firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
579 if (!firstbuf) return cx_mutstrn(NULL, 0);
580 firstbuf->buf = calloc(ibuflen, sizeof(size_t));
581 if (!firstbuf->buf) {
582 free(firstbuf);
583 return cx_mutstrn(NULL, 0);
584 }
586 /* Search occurrences */
587 cxstring searchstr = str;
588 size_t found = 0;
589 do {
590 cxstring match = cx_strstr(searchstr, pattern);
591 if (match.length > 0) {
592 /* Allocate next buffer in chain, if required */
593 if (curbuf->len == ibuflen) {
594 struct cx_strreplace_ibuf *nextbuf =
595 calloc(1, sizeof(struct cx_strreplace_ibuf));
596 if (!nextbuf) {
597 cx_strrepl_free_ibuf(firstbuf);
598 return cx_mutstrn(NULL, 0);
599 }
600 nextbuf->buf = calloc(ibuflen, sizeof(size_t));
601 if (!nextbuf->buf) {
602 free(nextbuf);
603 cx_strrepl_free_ibuf(firstbuf);
604 return cx_mutstrn(NULL, 0);
605 }
606 curbuf->next = nextbuf;
607 curbuf = nextbuf;
608 }
610 /* Record match index */
611 found++;
612 size_t idx = match.ptr - str.ptr;
613 curbuf->buf[curbuf->len++] = idx;
614 searchstr.ptr = match.ptr + pattern.length;
615 searchstr.length = str.length - idx - pattern.length;
616 } else {
617 break;
618 }
619 } while (searchstr.length > 0 && found < replmax);
621 /* Allocate result string */
622 cxmutstr result;
623 {
624 ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
625 size_t rcount = 0;
626 curbuf = firstbuf;
627 do {
628 rcount += curbuf->len;
629 curbuf = curbuf->next;
630 } while (curbuf);
631 result.length = str.length + rcount * adjlen;
632 result.ptr = cxMalloc(allocator, result.length + 1);
633 if (!result.ptr) {
634 cx_strrepl_free_ibuf(firstbuf);
635 return cx_mutstrn(NULL, 0);
636 }
637 }
639 /* Build result string */
640 curbuf = firstbuf;
641 size_t srcidx = 0;
642 char *destptr = result.ptr;
643 do {
644 for (size_t i = 0; i < curbuf->len; i++) {
645 /* Copy source part up to next match*/
646 size_t idx = curbuf->buf[i];
647 size_t srclen = idx - srcidx;
648 if (srclen > 0) {
649 memcpy(destptr, str.ptr + srcidx, srclen);
650 destptr += srclen;
651 srcidx += srclen;
652 }
654 /* Copy the replacement and skip the source pattern */
655 srcidx += pattern.length;
656 memcpy(destptr, replacement.ptr, replacement.length);
657 destptr += replacement.length;
658 }
659 curbuf = curbuf->next;
660 } while (curbuf);
661 memcpy(destptr, str.ptr + srcidx, str.length - srcidx);
663 /* Result is guaranteed to be zero-terminated */
664 result.ptr[result.length] = '\0';
666 /* Free index buffer */
667 cx_strrepl_free_ibuf(firstbuf);
669 return result;
670 }