src/string.c

Sun, 20 Nov 2022 21:08:36 +0100

author
Mike Becker <universe@uap-core.de>
date
Sun, 20 Nov 2022 21:08:36 +0100
changeset 628
1e2be40f0cb5
parent 593
ea9b41b5ebbc
child 643
5700ba9154ab
permissions
-rw-r--r--

use //-style single line comments everywhere

     1 /*
     2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
     3  *
     4  * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
     5  *
     6  * Redistribution and use in source and binary forms, with or without
     7  * modification, are permitted provided that the following conditions are met:
     8  *
     9  *   1. Redistributions of source code must retain the above copyright
    10  *      notice, this list of conditions and the following disclaimer.
    11  *
    12  *   2. Redistributions in binary form must reproduce the above copyright
    13  *      notice, this list of conditions and the following disclaimer in the
    14  *      documentation and/or other materials provided with the distribution.
    15  *
    16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
    20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    26  * POSSIBILITY OF SUCH DAMAGE.
    27  */
    29 #include "cx/string.h"
    30 #include "cx/utils.h"
    32 #include <string.h>
    33 #include <stdarg.h>
    34 #include <ctype.h>
    36 #ifndef _WIN32
    38 #include <strings.h> // for strncasecmp()
    40 #endif // _WIN32
    42 cxmutstr cx_mutstr(char *cstring) {
    43     return (cxmutstr) {cstring, strlen(cstring)};
    44 }
    46 cxmutstr cx_mutstrn(
    47         char *cstring,
    48         size_t length
    49 ) {
    50     return (cxmutstr) {cstring, length};
    51 }
    53 cxstring cx_str(const char *cstring) {
    54     return (cxstring) {cstring, strlen(cstring)};
    55 }
    57 cxstring cx_strn(
    58         const char *cstring,
    59         size_t length
    60 ) {
    61     return (cxstring) {cstring, length};
    62 }
    64 cxstring cx_strcast(cxmutstr str) {
    65     return (cxstring) {str.ptr, str.length};
    66 }
    68 void cx_strfree(cxmutstr *str) {
    69     free(str->ptr);
    70     str->ptr = NULL;
    71     str->length = 0;
    72 }
    74 void cx_strfree_a(
    75         CxAllocator *alloc,
    76         cxmutstr *str
    77 ) {
    78     cxFree(alloc, str->ptr);
    79     str->ptr = NULL;
    80     str->length = 0;
    81 }
    83 size_t cx_strlen(
    84         size_t count,
    85         ...
    86 ) {
    87     if (count == 0) return 0;
    89     va_list ap;
    90     va_start(ap, count);
    91     size_t size = 0;
    92     cx_for_n(i, count) {
    93         cxstring str = va_arg(ap, cxstring);
    94         size += str.length;
    95     }
    96     va_end(ap);
    98     return size;
    99 }
   101 cxmutstr cx_strcat_a(
   102         CxAllocator *alloc,
   103         size_t count,
   104         ...
   105 ) {
   106     cxstring *strings = calloc(count, sizeof(cxstring));
   107     if (!strings) abort();
   109     va_list ap;
   110     va_start(ap, count);
   112     // get all args and overall length
   113     size_t slen = 0;
   114     cx_for_n(i, count) {
   115         cxstring s = va_arg (ap, cxstring);
   116         strings[i] = s;
   117         slen += s.length;
   118     }
   120     // create new string
   121     cxmutstr result;
   122     result.ptr = cxMalloc(alloc, slen + 1);
   123     result.length = slen;
   124     if (result.ptr == NULL) abort();
   126     // concatenate strings
   127     size_t pos = 0;
   128     cx_for_n(i, count) {
   129         cxstring s = strings[i];
   130         memcpy(result.ptr + pos, s.ptr, s.length);
   131         pos += s.length;
   132     }
   134     // terminate string
   135     result.ptr[result.length] = '\0';
   137     // free temporary array
   138     free(strings);
   140     return result;
   141 }
   143 cxstring cx_strsubs(
   144         cxstring string,
   145         size_t start
   146 ) {
   147     return cx_strsubsl(string, start, string.length - start);
   148 }
   150 cxmutstr cx_strsubs_m(
   151         cxmutstr string,
   152         size_t start
   153 ) {
   154     return cx_strsubsl_m(string, start, string.length - start);
   155 }
   157 cxstring cx_strsubsl(
   158         cxstring string,
   159         size_t start,
   160         size_t length
   161 ) {
   162     if (start > string.length) {
   163         return (cxstring) {NULL, 0};
   164     }
   166     size_t rem_len = string.length - start;
   167     if (length > rem_len) {
   168         length = rem_len;
   169     }
   171     return (cxstring) {string.ptr + start, length};
   172 }
   174 cxmutstr cx_strsubsl_m(
   175         cxmutstr string,
   176         size_t start,
   177         size_t length
   178 ) {
   179     cxstring result = cx_strsubsl(cx_strcast(string), start, length);
   180     return (cxmutstr) {(char *) result.ptr, result.length};
   181 }
   183 cxstring cx_strchr(
   184         cxstring string,
   185         int chr
   186 ) {
   187     chr = 0xFF & chr;
   188     // TODO: improve by comparing multiple bytes at once
   189     cx_for_n(i, string.length) {
   190         if (string.ptr[i] == chr) {
   191             return cx_strsubs(string, i);
   192         }
   193     }
   194     return (cxstring) {NULL, 0};
   195 }
   197 cxmutstr cx_strchr_m(
   198         cxmutstr string,
   199         int chr
   200 ) {
   201     cxstring result = cx_strchr(cx_strcast(string), chr);
   202     return (cxmutstr) {(char *) result.ptr, result.length};
   203 }
   205 cxstring cx_strrchr(
   206         cxstring string,
   207         int chr
   208 ) {
   209     chr = 0xFF & chr;
   210     size_t i = string.length;
   211     while (i > 0) {
   212         i--;
   213         // TODO: improve by comparing multiple bytes at once
   214         if (string.ptr[i] == chr) {
   215             return cx_strsubs(string, i);
   216         }
   217     }
   218     return (cxstring) {NULL, 0};
   219 }
   221 cxmutstr cx_strrchr_m(
   222         cxmutstr string,
   223         int chr
   224 ) {
   225     cxstring result = cx_strrchr(cx_strcast(string), chr);
   226     return (cxmutstr) {(char *) result.ptr, result.length};
   227 }
   229 #define STRSTR_SBO_BUFLEN 512
   231 cxstring cx_strstr(
   232         cxstring haystack,
   233         cxstring needle
   234 ) {
   235     if (needle.length == 0) {
   236         return haystack;
   237     }
   239     // optimize for single-char needles
   240     if (needle.length == 1) {
   241         return cx_strchr(haystack, *needle.ptr);
   242     }
   244     /*
   245      * IMPORTANT:
   246      * Our prefix table contains the prefix length PLUS ONE
   247      * this is our decision, because we want to use the full range of size_t.
   248      * The original algorithm needs a (-1) at one single place,
   249      * and we want to avoid that.
   250      */
   252     // local prefix table
   253     size_t s_prefix_table[STRSTR_SBO_BUFLEN];
   255     // check needle length and use appropriate prefix table
   256     // if the pattern exceeds static prefix table, allocate on the heap
   257     bool useheap = needle.length >= STRSTR_SBO_BUFLEN;
   258     register size_t *ptable = useheap ? calloc(needle.length + 1,
   259                                                sizeof(size_t)) : s_prefix_table;
   261     // keep counter in registers
   262     register size_t i, j;
   264     // fill prefix table
   265     i = 0;
   266     j = 0;
   267     ptable[i] = j;
   268     while (i < needle.length) {
   269         while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
   270             j = ptable[j - 1];
   271         }
   272         i++;
   273         j++;
   274         ptable[i] = j;
   275     }
   277     // search
   278     cxstring result = {NULL, 0};
   279     i = 0;
   280     j = 1;
   281     while (i < haystack.length) {
   282         while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
   283             j = ptable[j - 1];
   284         }
   285         i++;
   286         j++;
   287         if (j - 1 == needle.length) {
   288             size_t start = i - needle.length;
   289             result.ptr = haystack.ptr + start;
   290             result.length = haystack.length - start;
   291             break;
   292         }
   293     }
   295     // if prefix table was allocated on the heap, free it
   296     if (ptable != s_prefix_table) {
   297         free(ptable);
   298     }
   300     return result;
   301 }
   303 cxmutstr cx_strstr_m(
   304         cxmutstr haystack,
   305         cxstring needle
   306 ) {
   307     cxstring result = cx_strstr(cx_strcast(haystack), needle);
   308     return (cxmutstr) {(char *) result.ptr, result.length};
   309 }
   311 size_t cx_strsplit(
   312         cxstring string,
   313         cxstring delim,
   314         size_t limit,
   315         cxstring *output
   316 ) {
   317     // special case: output limit is zero
   318     if (limit == 0) return 0;
   320     // special case: delimiter is empty
   321     if (delim.length == 0) {
   322         output[0] = string;
   323         return 1;
   324     }
   326     // special cases: delimiter is at least as large as the string
   327     if (delim.length >= string.length) {
   328         // exact match
   329         if (cx_strcmp(string, delim) == 0) {
   330             output[0] = cx_strn(string.ptr, 0);
   331             output[1] = cx_strn(string.ptr + string.length, 0);
   332             return 2;
   333         } else {
   334             // no match possible
   335             output[0] = string;
   336             return 1;
   337         }
   338     }
   340     size_t n = 0;
   341     cxstring curpos = string;
   342     while (1) {
   343         ++n;
   344         cxstring match = cx_strstr(curpos, delim);
   345         if (match.length > 0) {
   346             // is the limit reached?
   347             if (n < limit) {
   348                 // copy the current string to the array
   349                 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
   350                 output[n - 1] = item;
   351                 size_t processed = item.length + delim.length;
   352                 curpos.ptr += processed;
   353                 curpos.length -= processed;
   354             } else {
   355                 // limit reached, copy the _full_ remaining string
   356                 output[n - 1] = curpos;
   357                 break;
   358             }
   359         } else {
   360             // no more matches, copy last string
   361             output[n - 1] = curpos;
   362             break;
   363         }
   364     }
   366     return n;
   367 }
   369 size_t cx_strsplit_a(
   370         CxAllocator *allocator,
   371         cxstring string,
   372         cxstring delim,
   373         size_t limit,
   374         cxstring **output
   375 ) {
   376     // find out how many splits we're going to make and allocate memory
   377     size_t n = 0;
   378     cxstring curpos = string;
   379     while (1) {
   380         ++n;
   381         cxstring match = cx_strstr(curpos, delim);
   382         if (match.length > 0) {
   383             // is the limit reached?
   384             if (n < limit) {
   385                 size_t processed = match.ptr - curpos.ptr + delim.length;
   386                 curpos.ptr += processed;
   387                 curpos.length -= processed;
   388             } else {
   389                 // limit reached
   390                 break;
   391             }
   392         } else {
   393             // no more matches
   394             break;
   395         }
   396     }
   397     *output = cxCalloc(allocator, n, sizeof(cxstring));
   398     return cx_strsplit(string, delim, n, *output);
   399 }
   401 size_t cx_strsplit_m(
   402         cxmutstr string,
   403         cxstring delim,
   404         size_t limit,
   405         cxmutstr *output
   406 ) {
   407     return cx_strsplit(cx_strcast(string),
   408                        delim, limit, (cxstring *) output);
   409 }
   411 size_t cx_strsplit_ma(
   412         CxAllocator *allocator,
   413         cxmutstr string,
   414         cxstring delim,
   415         size_t limit,
   416         cxmutstr **output
   417 ) {
   418     return cx_strsplit_a(allocator, cx_strcast(string),
   419                          delim, limit, (cxstring **) output);
   420 }
   422 int cx_strcmp(
   423         cxstring s1,
   424         cxstring s2
   425 ) {
   426     if (s1.length == s2.length) {
   427         return memcmp(s1.ptr, s2.ptr, s1.length);
   428     } else if (s1.length > s2.length) {
   429         return 1;
   430     } else {
   431         return -1;
   432     }
   433 }
   435 int cx_strcasecmp(
   436         cxstring s1,
   437         cxstring s2
   438 ) {
   439     if (s1.length == s2.length) {
   440 #ifdef _WIN32
   441         return _strnicmp(s1.ptr, s2.ptr, s1.length);
   442 #else
   443         return strncasecmp(s1.ptr, s2.ptr, s1.length);
   444 #endif
   445     } else if (s1.length > s2.length) {
   446         return 1;
   447     } else {
   448         return -1;
   449     }
   450 }
   452 cxmutstr cx_strdup_a(
   453         CxAllocator *allocator,
   454         cxstring string
   455 ) {
   456     cxmutstr result = {
   457             cxMalloc(allocator, string.length + 1),
   458             string.length
   459     };
   460     if (result.ptr == NULL) {
   461         result.length = 0;
   462         return result;
   463     }
   464     memcpy(result.ptr, string.ptr, string.length);
   465     result.ptr[string.length] = '\0';
   466     return result;
   467 }
   469 cxstring cx_strtrim(cxstring string) {
   470     cxstring result = string;
   471     // TODO: optimize by comparing multiple bytes at once
   472     while (result.length > 0 && isspace(*result.ptr)) {
   473         result.ptr++;
   474         result.length--;
   475     }
   476     while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
   477         result.length--;
   478     }
   479     return result;
   480 }
   482 cxmutstr cx_strtrim_m(cxmutstr string) {
   483     cxstring result = cx_strtrim(cx_strcast(string));
   484     return (cxmutstr) {(char *) result.ptr, result.length};
   485 }
   487 bool cx_strprefix(
   488         cxstring string,
   489         cxstring prefix
   490 ) {
   491     if (string.length < prefix.length) return false;
   492     return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
   493 }
   495 bool cx_strsuffix(
   496         cxstring string,
   497         cxstring suffix
   498 ) {
   499     if (string.length < suffix.length) return false;
   500     return memcmp(string.ptr + string.length - suffix.length,
   501                   suffix.ptr, suffix.length) == 0;
   502 }
   504 bool cx_strcaseprefix(
   505         cxstring string,
   506         cxstring prefix
   507 ) {
   508     if (string.length < prefix.length) return false;
   509 #ifdef _WIN32
   510     return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
   511 #else
   512     return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
   513 #endif
   514 }
   516 bool cx_strcasesuffix(
   517         cxstring string,
   518         cxstring suffix
   519 ) {
   520     if (string.length < suffix.length) return false;
   521 #ifdef _WIN32
   522     return _strnicmp(string.ptr+string.length-suffix.length,
   523                   suffix.ptr, suffix.length) == 0;
   524 #else
   525     return strncasecmp(string.ptr + string.length - suffix.length,
   526                        suffix.ptr, suffix.length) == 0;
   527 #endif
   528 }
   530 void cx_strlower(cxmutstr string) {
   531     cx_for_n(i, string.length) {
   532         string.ptr[i] = (char) tolower(string.ptr[i]);
   533     }
   534 }
   536 void cx_strupper(cxmutstr string) {
   537     cx_for_n(i, string.length) {
   538         string.ptr[i] = (char) toupper(string.ptr[i]);
   539     }
   540 }
   542 #define REPLACE_INDEX_BUFFER_MAX 100
   544 struct cx_strreplace_ibuf {
   545     size_t *buf;
   546     struct cx_strreplace_ibuf *next;
   547     unsigned int len;
   548 };
   550 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
   551     while (buf) {
   552         struct cx_strreplace_ibuf *next = buf->next;
   553         free(buf->buf);
   554         free(buf);
   555         buf = next;
   556     }
   557 }
   559 cxmutstr cx_strreplacen_a(
   560         CxAllocator *allocator,
   561         cxstring str,
   562         cxstring pattern,
   563         cxstring replacement,
   564         size_t replmax
   565 ) {
   567     if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
   568         return cx_strdup_a(allocator, str);
   570     // Compute expected buffer length
   571     size_t ibufmax = str.length / pattern.length;
   572     size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
   573     if (ibuflen > REPLACE_INDEX_BUFFER_MAX) {
   574         ibuflen = REPLACE_INDEX_BUFFER_MAX;
   575     }
   577     // Allocate first index buffer
   578     struct cx_strreplace_ibuf *firstbuf, *curbuf;
   579     firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
   580     if (!firstbuf) return cx_mutstrn(NULL, 0);
   581     firstbuf->buf = calloc(ibuflen, sizeof(size_t));
   582     if (!firstbuf->buf) {
   583         free(firstbuf);
   584         return cx_mutstrn(NULL, 0);
   585     }
   587     // Search occurrences
   588     cxstring searchstr = str;
   589     size_t found = 0;
   590     do {
   591         cxstring match = cx_strstr(searchstr, pattern);
   592         if (match.length > 0) {
   593             // Allocate next buffer in chain, if required
   594             if (curbuf->len == ibuflen) {
   595                 struct cx_strreplace_ibuf *nextbuf =
   596                         calloc(1, sizeof(struct cx_strreplace_ibuf));
   597                 if (!nextbuf) {
   598                     cx_strrepl_free_ibuf(firstbuf);
   599                     return cx_mutstrn(NULL, 0);
   600                 }
   601                 nextbuf->buf = calloc(ibuflen, sizeof(size_t));
   602                 if (!nextbuf->buf) {
   603                     free(nextbuf);
   604                     cx_strrepl_free_ibuf(firstbuf);
   605                     return cx_mutstrn(NULL, 0);
   606                 }
   607                 curbuf->next = nextbuf;
   608                 curbuf = nextbuf;
   609             }
   611             // Record match index
   612             found++;
   613             size_t idx = match.ptr - str.ptr;
   614             curbuf->buf[curbuf->len++] = idx;
   615             searchstr.ptr = match.ptr + pattern.length;
   616             searchstr.length = str.length - idx - pattern.length;
   617         } else {
   618             break;
   619         }
   620     } while (searchstr.length > 0 && found < replmax);
   622     // Allocate result string
   623     cxmutstr result;
   624     {
   625         ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
   626         size_t rcount = 0;
   627         curbuf = firstbuf;
   628         do {
   629             rcount += curbuf->len;
   630             curbuf = curbuf->next;
   631         } while (curbuf);
   632         result.length = str.length + rcount * adjlen;
   633         result.ptr = cxMalloc(allocator, result.length + 1);
   634         if (!result.ptr) {
   635             cx_strrepl_free_ibuf(firstbuf);
   636             return cx_mutstrn(NULL, 0);
   637         }
   638     }
   640     // Build result string
   641     curbuf = firstbuf;
   642     size_t srcidx = 0;
   643     char *destptr = result.ptr;
   644     do {
   645         for (size_t i = 0; i < curbuf->len; i++) {
   646             // Copy source part up to next match
   647             size_t idx = curbuf->buf[i];
   648             size_t srclen = idx - srcidx;
   649             if (srclen > 0) {
   650                 memcpy(destptr, str.ptr + srcidx, srclen);
   651                 destptr += srclen;
   652                 srcidx += srclen;
   653             }
   655             // Copy the replacement and skip the source pattern
   656             srcidx += pattern.length;
   657             memcpy(destptr, replacement.ptr, replacement.length);
   658             destptr += replacement.length;
   659         }
   660         curbuf = curbuf->next;
   661     } while (curbuf);
   662     memcpy(destptr, str.ptr + srcidx, str.length - srcidx);
   664     // Result is guaranteed to be zero-terminated
   665     result.ptr[result.length] = '\0';
   667     // Free index buffer
   668     cx_strrepl_free_ibuf(firstbuf);
   670     return result;
   671 }

mercurial