src/string.c

Tue, 04 Oct 2022 19:25:07 +0200

author
Mike Becker <universe@uap-core.de>
date
Tue, 04 Oct 2022 19:25:07 +0200
changeset 591
7df0bcaecffa
parent 590
02a56701a5cb
child 593
ea9b41b5ebbc
permissions
-rw-r--r--

fix over-optimization of strstr

1. it's actually less performant to frequently read bytes
from an array instead of using the native word length
2. the SBO buffer should be local and not static to allow
multi-threading usage

     1 /*
     2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
     3  *
     4  * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
     5  *
     6  * Redistribution and use in source and binary forms, with or without
     7  * modification, are permitted provided that the following conditions are met:
     8  *
     9  *   1. Redistributions of source code must retain the above copyright
    10  *      notice, this list of conditions and the following disclaimer.
    11  *
    12  *   2. Redistributions in binary form must reproduce the above copyright
    13  *      notice, this list of conditions and the following disclaimer in the
    14  *      documentation and/or other materials provided with the distribution.
    15  *
    16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
    20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    26  * POSSIBILITY OF SUCH DAMAGE.
    27  */
    29 #include "cx/string.h"
    30 #include "cx/utils.h"
    32 #include <string.h>
    33 #include <stdarg.h>
    34 #include <stdint.h>
    35 #include <ctype.h>
    37 #ifndef _WIN32
    39 #include <strings.h> /* for strncasecmp() */
    41 #endif /* _WIN32 */
    43 cxmutstr cx_mutstr(char *cstring) {
    44     return (cxmutstr) {cstring, strlen(cstring)};
    45 }
    47 cxmutstr cx_mutstrn(
    48         char *cstring,
    49         size_t length
    50 ) {
    51     return (cxmutstr) {cstring, length};
    52 }
    54 cxstring cx_str(const char *cstring) {
    55     return (cxstring) {cstring, strlen(cstring)};
    56 }
    58 cxstring cx_strn(
    59         const char *cstring,
    60         size_t length
    61 ) {
    62     return (cxstring) {cstring, length};
    63 }
    65 cxstring cx_strcast(cxmutstr str) {
    66     return (cxstring) {str.ptr, str.length};
    67 }
    69 void cx_strfree(cxmutstr *str) {
    70     free(str->ptr);
    71     str->ptr = NULL;
    72     str->length = 0;
    73 }
    75 void cx_strfree_a(
    76         CxAllocator *alloc,
    77         cxmutstr *str
    78 ) {
    79     cxFree(alloc, str->ptr);
    80     str->ptr = NULL;
    81     str->length = 0;
    82 }
    84 size_t cx_strlen(
    85         size_t count,
    86         ...
    87 ) {
    88     if (count == 0) return 0;
    90     va_list ap;
    91     va_start(ap, count);
    92     size_t size = 0;
    93     cx_for_n(i, count) {
    94         cxstring str = va_arg(ap, cxstring);
    95         size += str.length;
    96     }
    97     va_end(ap);
    99     return size;
   100 }
   102 cxmutstr cx_strcat_a(
   103         CxAllocator *alloc,
   104         size_t count,
   105         ...
   106 ) {
   107     cxstring *strings = calloc(count, sizeof(cxstring));
   108     if (!strings) abort();
   110     va_list ap;
   111     va_start(ap, count);
   113     // get all args and overall length
   114     size_t slen = 0;
   115     cx_for_n(i, count) {
   116         cxstring s = va_arg (ap, cxstring);
   117         strings[i] = s;
   118         slen += s.length;
   119     }
   121     // create new string
   122     cxmutstr result;
   123     result.ptr = cxMalloc(alloc, slen + 1);
   124     result.length = slen;
   125     if (result.ptr == NULL) abort();
   127     // concatenate strings
   128     size_t pos = 0;
   129     cx_for_n(i, count) {
   130         cxstring s = strings[i];
   131         memcpy(result.ptr + pos, s.ptr, s.length);
   132         pos += s.length;
   133     }
   135     // terminate string
   136     result.ptr[result.length] = '\0';
   138     // free temporary array
   139     free(strings);
   141     return result;
   142 }
   144 cxstring cx_strsubs(
   145         cxstring string,
   146         size_t start
   147 ) {
   148     return cx_strsubsl(string, start, string.length - start);
   149 }
   151 cxmutstr cx_strsubs_m(
   152         cxmutstr string,
   153         size_t start
   154 ) {
   155     return cx_strsubsl_m(string, start, string.length - start);
   156 }
   158 cxstring cx_strsubsl(
   159         cxstring string,
   160         size_t start,
   161         size_t length
   162 ) {
   163     if (start > string.length) {
   164         return (cxstring) {NULL, 0};
   165     }
   167     size_t rem_len = string.length - start;
   168     if (length > rem_len) {
   169         length = rem_len;
   170     }
   172     return (cxstring) {string.ptr + start, length};
   173 }
   175 cxmutstr cx_strsubsl_m(
   176         cxmutstr string,
   177         size_t start,
   178         size_t length
   179 ) {
   180     cxstring result = cx_strsubsl(cx_strcast(string), start, length);
   181     return (cxmutstr) {(char *) result.ptr, result.length};
   182 }
   184 cxstring cx_strchr(
   185         cxstring string,
   186         int chr
   187 ) {
   188     chr = 0xFF & chr;
   189     // TODO: improve by comparing multiple bytes at once
   190     cx_for_n(i, string.length) {
   191         if (string.ptr[i] == chr) {
   192             return cx_strsubs(string, i);
   193         }
   194     }
   195     return (cxstring) {NULL, 0};
   196 }
   198 cxmutstr cx_strchr_m(
   199         cxmutstr string,
   200         int chr
   201 ) {
   202     cxstring result = cx_strchr(cx_strcast(string), chr);
   203     return (cxmutstr) {(char *) result.ptr, result.length};
   204 }
   206 cxstring cx_strrchr(
   207         cxstring string,
   208         int chr
   209 ) {
   210     chr = 0xFF & chr;
   211     size_t i = string.length;
   212     while (i > 0) {
   213         i--;
   214         // TODO: improve by comparing multiple bytes at once
   215         if (string.ptr[i] == chr) {
   216             return cx_strsubs(string, i);
   217         }
   218     }
   219     return (cxstring) {NULL, 0};
   220 }
   222 cxmutstr cx_strrchr_m(
   223         cxmutstr string,
   224         int chr
   225 ) {
   226     cxstring result = cx_strrchr(cx_strcast(string), chr);
   227     return (cxmutstr) {(char *) result.ptr, result.length};
   228 }
   230 #define STRSTR_SBO_BUFLEN 512
   232 cxstring cx_strstr(
   233         cxstring haystack,
   234         cxstring needle
   235 ) {
   236     if (needle.length == 0) {
   237         return haystack;
   238     }
   240     /* optimize for single-char needles */
   241     if (needle.length == 1) {
   242         return cx_strchr(haystack, *needle.ptr);
   243     }
   245     /*
   246      * IMPORTANT:
   247      * Our prefix table contains the prefix length PLUS ONE
   248      * this is our decision, because we want to use the full range of size_t.
   249      * The original algorithm needs a (-1) at one single place,
   250      * and we want to avoid that.
   251      */
   253     /* local prefix table */
   254     size_t s_prefix_table[STRSTR_SBO_BUFLEN];
   256     /* check needle length and use appropriate prefix table */
   257     /* if the pattern exceeds static prefix table, allocate on the heap */
   258     bool useheap = needle.length >= STRSTR_SBO_BUFLEN;
   259     register size_t *ptable = useheap ? calloc(needle.length + 1,
   260                                                sizeof(size_t)) : s_prefix_table;
   262     /* keep counter in registers */
   263     register size_t i, j;
   265     /* fill prefix table */
   266     i = 0;
   267     j = 0;
   268     ptable[i] = j;
   269     while (i < needle.length) {
   270         while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
   271             j = ptable[j - 1];
   272         }
   273         i++;
   274         j++;
   275         ptable[i] = j;
   276     }
   278     /* search */
   279     cxstring result = {NULL, 0};
   280     i = 0;
   281     j = 1;
   282     while (i < haystack.length) {
   283         while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
   284             j = ptable[j - 1];
   285         }
   286         i++;
   287         j++;
   288         if (j - 1 == needle.length) {
   289             size_t start = i - needle.length;
   290             result.ptr = haystack.ptr + start;
   291             result.length = haystack.length - start;
   292             break;
   293         }
   294     }
   296     /* if prefix table was allocated on the heap, free it */
   297     if (ptable != s_prefix_table) {
   298         free(ptable);
   299     }
   301     return result;
   302 }
   304 cxmutstr cx_strstr_m(
   305         cxmutstr haystack,
   306         cxstring needle
   307 ) {
   308     cxstring result = cx_strstr(cx_strcast(haystack), needle);
   309     return (cxmutstr) {(char *) result.ptr, result.length};
   310 }
   312 size_t cx_strsplit(
   313         cxstring string,
   314         cxstring delim,
   315         size_t limit,
   316         cxstring *output
   317 ) {
   318     /* special case: output limit is zero */
   319     if (limit == 0) return 0;
   321     /* special case: delimiter is empty */
   322     if (delim.length == 0) {
   323         output[0] = string;
   324         return 1;
   325     }
   327     /* special cases: delimiter is at least as large as the string */
   328     if (delim.length >= string.length) {
   329         /* exact match */
   330         if (cx_strcmp(string, delim) == 0) {
   331             output[0] = cx_strn(string.ptr, 0);
   332             output[1] = cx_strn(string.ptr + string.length, 0);
   333             return 2;
   334         } else /* no match possible */ {
   335             output[0] = string;
   336             return 1;
   337         }
   338     }
   340     size_t n = 0;
   341     cxstring curpos = string;
   342     while (1) {
   343         ++n;
   344         cxstring match = cx_strstr(curpos, delim);
   345         if (match.length > 0) {
   346             /* is the limit reached? */
   347             if (n < limit) {
   348                 /* copy the current string to the array */
   349                 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
   350                 output[n - 1] = item;
   351                 size_t processed = item.length + delim.length;
   352                 curpos.ptr += processed;
   353                 curpos.length -= processed;
   354             } else {
   355                 /* limit reached, copy the _full_ remaining string */
   356                 output[n - 1] = curpos;
   357                 break;
   358             }
   359         } else {
   360             /* no more matches, copy last string */
   361             output[n - 1] = curpos;
   362             break;
   363         }
   364     }
   366     return n;
   367 }
   369 size_t cx_strsplit_a(
   370         CxAllocator *allocator,
   371         cxstring string,
   372         cxstring delim,
   373         size_t limit,
   374         cxstring **output
   375 ) {
   376     /* find out how many splits we're going to make and allocate memory */
   377     size_t n = 0;
   378     cxstring curpos = string;
   379     while (1) {
   380         ++n;
   381         cxstring match = cx_strstr(curpos, delim);
   382         if (match.length > 0) {
   383             /* is the limit reached? */
   384             if (n < limit) {
   385                 size_t processed = match.ptr - curpos.ptr + delim.length;
   386                 curpos.ptr += processed;
   387                 curpos.length -= processed;
   388             } else {
   389                 /* limit reached */
   390                 break;
   391             }
   392         } else {
   393             /* no more matches */
   394             break;
   395         }
   396     }
   397     *output = cxCalloc(allocator, n, sizeof(cxstring));
   398     return cx_strsplit(string, delim, n, *output);
   399 }
   401 size_t cx_strsplit_m(
   402         cxmutstr string,
   403         cxstring delim,
   404         size_t limit,
   405         cxmutstr *output
   406 ) {
   407     return cx_strsplit(cx_strcast(string),
   408                        delim, limit, (cxstring *) output);
   409 }
   411 size_t cx_strsplit_ma(
   412         CxAllocator *allocator,
   413         cxmutstr string,
   414         cxstring delim,
   415         size_t limit,
   416         cxmutstr **output
   417 ) {
   418     return cx_strsplit_a(allocator, cx_strcast(string),
   419                          delim, limit, (cxstring **) output);
   420 }
   422 int cx_strcmp(
   423         cxstring s1,
   424         cxstring s2
   425 ) {
   426     if (s1.length == s2.length) {
   427         return memcmp(s1.ptr, s2.ptr, s1.length);
   428     } else if (s1.length > s2.length) {
   429         return 1;
   430     } else {
   431         return -1;
   432     }
   433 }
   435 int cx_strcasecmp(
   436         cxstring s1,
   437         cxstring s2
   438 ) {
   439     if (s1.length == s2.length) {
   440 #ifdef _WIN32
   441         return _strnicmp(s1.ptr, s2.ptr, s1.length);
   442 #else
   443         return strncasecmp(s1.ptr, s2.ptr, s1.length);
   444 #endif
   445     } else if (s1.length > s2.length) {
   446         return 1;
   447     } else {
   448         return -1;
   449     }
   450 }
   452 cxmutstr cx_strdup_a(
   453         CxAllocator *allocator,
   454         cxstring string
   455 ) {
   456     cxmutstr result = {
   457             cxMalloc(allocator, string.length + 1),
   458             string.length
   459     };
   460     if (result.ptr == NULL) {
   461         result.length = 0;
   462         return result;
   463     }
   464     memcpy(result.ptr, string.ptr, string.length);
   465     result.ptr[string.length] = '\0';
   466     return result;
   467 }
   469 cxstring cx_strtrim(cxstring string) {
   470     cxstring result = string;
   471     // TODO: optimize by comparing multiple bytes at once
   472     while (result.length > 0 && isspace(*result.ptr)) {
   473         result.ptr++;
   474         result.length--;
   475     }
   476     while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
   477         result.length--;
   478     }
   479     return result;
   480 }
   482 cxmutstr cx_strtrim_m(cxmutstr string) {
   483     cxstring result = cx_strtrim(cx_strcast(string));
   484     return (cxmutstr) {(char *) result.ptr, result.length};
   485 }
   487 bool cx_strprefix(
   488         cxstring string,
   489         cxstring prefix
   490 ) {
   491     if (string.length < prefix.length) return false;
   492     return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
   493 }
   495 bool cx_strsuffix(
   496         cxstring string,
   497         cxstring suffix
   498 ) {
   499     if (string.length < suffix.length) return false;
   500     return memcmp(string.ptr + string.length - suffix.length,
   501                   suffix.ptr, suffix.length) == 0;
   502 }
   504 bool cx_strcaseprefix(
   505         cxstring string,
   506         cxstring prefix
   507 ) {
   508     if (string.length < prefix.length) return false;
   509 #ifdef _WIN32
   510     return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
   511 #else
   512     return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
   513 #endif
   514 }
   516 bool cx_strcasesuffix(
   517         cxstring string,
   518         cxstring suffix
   519 ) {
   520     if (string.length < suffix.length) return false;
   521 #ifdef _WIN32
   522     return _strnicmp(string.ptr+string.length-suffix.length,
   523                   suffix.ptr, suffix.length) == 0;
   524 #else
   525     return strncasecmp(string.ptr + string.length - suffix.length,
   526                        suffix.ptr, suffix.length) == 0;
   527 #endif
   528 }
   530 void cx_strlower(cxmutstr string) {
   531     cx_for_n(i, string.length) {
   532         string.ptr[i] = tolower(string.ptr[i]);
   533     }
   534 }
   536 void cx_strupper(cxmutstr string) {
   537     cx_for_n(i, string.length) {
   538         string.ptr[i] = toupper(string.ptr[i]);
   539     }
   540 }
   542 #define REPLACE_INDEX_BUFFER_MAX 100
   544 struct cx_strreplace_ibuf {
   545     size_t *buf;
   546     struct cx_strreplace_ibuf *next;
   547     unsigned int len;
   548 };
   550 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
   551     while (buf) {
   552         struct cx_strreplace_ibuf *next = buf->next;
   553         free(buf->buf);
   554         free(buf);
   555         buf = next;
   556     }
   557 }
   559 cxmutstr cx_strreplacen_a(
   560         CxAllocator *allocator,
   561         cxstring str,
   562         cxstring pattern,
   563         cxstring replacement,
   564         size_t replmax
   565 ) {
   567     if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
   568         return cx_strdup_a(allocator, str);
   570     /* Compute expected buffer length */
   571     size_t ibufmax = str.length / pattern.length;
   572     size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
   573     if (ibuflen > REPLACE_INDEX_BUFFER_MAX) {
   574         ibuflen = REPLACE_INDEX_BUFFER_MAX;
   575     }
   577     /* Allocate first index buffer */
   578     struct cx_strreplace_ibuf *firstbuf, *curbuf;
   579     firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
   580     if (!firstbuf) return cx_mutstrn(NULL, 0);
   581     firstbuf->buf = calloc(ibuflen, sizeof(size_t));
   582     if (!firstbuf->buf) {
   583         free(firstbuf);
   584         return cx_mutstrn(NULL, 0);
   585     }
   587     /* Search occurrences */
   588     cxstring searchstr = str;
   589     size_t found = 0;
   590     do {
   591         cxstring match = cx_strstr(searchstr, pattern);
   592         if (match.length > 0) {
   593             /* Allocate next buffer in chain, if required */
   594             if (curbuf->len == ibuflen) {
   595                 struct cx_strreplace_ibuf *nextbuf =
   596                         calloc(1, sizeof(struct cx_strreplace_ibuf));
   597                 if (!nextbuf) {
   598                     cx_strrepl_free_ibuf(firstbuf);
   599                     return cx_mutstrn(NULL, 0);
   600                 }
   601                 nextbuf->buf = calloc(ibuflen, sizeof(size_t));
   602                 if (!nextbuf->buf) {
   603                     free(nextbuf);
   604                     cx_strrepl_free_ibuf(firstbuf);
   605                     return cx_mutstrn(NULL, 0);
   606                 }
   607                 curbuf->next = nextbuf;
   608                 curbuf = nextbuf;
   609             }
   611             /* Record match index */
   612             found++;
   613             size_t idx = match.ptr - str.ptr;
   614             curbuf->buf[curbuf->len++] = idx;
   615             searchstr.ptr = match.ptr + pattern.length;
   616             searchstr.length = str.length - idx - pattern.length;
   617         } else {
   618             break;
   619         }
   620     } while (searchstr.length > 0 && found < replmax);
   622     /* Allocate result string */
   623     cxmutstr result;
   624     {
   625         ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
   626         size_t rcount = 0;
   627         curbuf = firstbuf;
   628         do {
   629             rcount += curbuf->len;
   630             curbuf = curbuf->next;
   631         } while (curbuf);
   632         result.length = str.length + rcount * adjlen;
   633         result.ptr = cxMalloc(allocator, result.length + 1);
   634         if (!result.ptr) {
   635             cx_strrepl_free_ibuf(firstbuf);
   636             return cx_mutstrn(NULL, 0);
   637         }
   638     }
   640     /* Build result string */
   641     curbuf = firstbuf;
   642     size_t srcidx = 0;
   643     char *destptr = result.ptr;
   644     do {
   645         for (size_t i = 0; i < curbuf->len; i++) {
   646             /* Copy source part up to next match*/
   647             size_t idx = curbuf->buf[i];
   648             size_t srclen = idx - srcidx;
   649             if (srclen > 0) {
   650                 memcpy(destptr, str.ptr + srcidx, srclen);
   651                 destptr += srclen;
   652                 srcidx += srclen;
   653             }
   655             /* Copy the replacement and skip the source pattern */
   656             srcidx += pattern.length;
   657             memcpy(destptr, replacement.ptr, replacement.length);
   658             destptr += replacement.length;
   659         }
   660         curbuf = curbuf->next;
   661     } while (curbuf);
   662     memcpy(destptr, str.ptr + srcidx, str.length - srcidx);
   664     /* Result is guaranteed to be zero-terminated */
   665     result.ptr[result.length] = '\0';
   667     /* Free index buffer */
   668     cx_strrepl_free_ibuf(firstbuf);
   670     return result;
   671 }

mercurial