src/string.c

Tue, 04 Oct 2022 18:55:20 +0200

author
Mike Becker <universe@uap-core.de>
date
Tue, 04 Oct 2022 18:55:20 +0200
changeset 590
02a56701a5cb
parent 583
0f3c9662f9b5
child 591
7df0bcaecffa
permissions
-rw-r--r--

fix missing zero-termination in strreplace

     1 /*
     2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
     3  *
     4  * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
     5  *
     6  * Redistribution and use in source and binary forms, with or without
     7  * modification, are permitted provided that the following conditions are met:
     8  *
     9  *   1. Redistributions of source code must retain the above copyright
    10  *      notice, this list of conditions and the following disclaimer.
    11  *
    12  *   2. Redistributions in binary form must reproduce the above copyright
    13  *      notice, this list of conditions and the following disclaimer in the
    14  *      documentation and/or other materials provided with the distribution.
    15  *
    16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
    20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    26  * POSSIBILITY OF SUCH DAMAGE.
    27  */
    29 #include "cx/string.h"
    30 #include "cx/utils.h"
    32 #include <string.h>
    33 #include <stdarg.h>
    34 #include <stdint.h>
    35 #include <ctype.h>
    37 #ifndef _WIN32
    39 #include <strings.h> /* for strncasecmp() */
    41 #endif /* _WIN32 */
    43 cxmutstr cx_mutstr(char *cstring) {
    44     return (cxmutstr) {cstring, strlen(cstring)};
    45 }
    47 cxmutstr cx_mutstrn(
    48         char *cstring,
    49         size_t length
    50 ) {
    51     return (cxmutstr) {cstring, length};
    52 }
    54 cxstring cx_str(const char *cstring) {
    55     return (cxstring) {cstring, strlen(cstring)};
    56 }
    58 cxstring cx_strn(
    59         const char *cstring,
    60         size_t length
    61 ) {
    62     return (cxstring) {cstring, length};
    63 }
    65 cxstring cx_strcast(cxmutstr str) {
    66     return (cxstring) {str.ptr, str.length};
    67 }
    69 void cx_strfree(cxmutstr *str) {
    70     free(str->ptr);
    71     str->ptr = NULL;
    72     str->length = 0;
    73 }
    75 void cx_strfree_a(
    76         CxAllocator *alloc,
    77         cxmutstr *str
    78 ) {
    79     cxFree(alloc, str->ptr);
    80     str->ptr = NULL;
    81     str->length = 0;
    82 }
    84 size_t cx_strlen(
    85         size_t count,
    86         ...
    87 ) {
    88     if (count == 0) return 0;
    90     va_list ap;
    91     va_start(ap, count);
    92     size_t size = 0;
    93     cx_for_n(i, count) {
    94         cxstring str = va_arg(ap, cxstring);
    95         size += str.length;
    96     }
    97     va_end(ap);
    99     return size;
   100 }
   102 cxmutstr cx_strcat_a(
   103         CxAllocator *alloc,
   104         size_t count,
   105         ...
   106 ) {
   107     cxstring *strings = calloc(count, sizeof(cxstring));
   108     if (!strings) abort();
   110     va_list ap;
   111     va_start(ap, count);
   113     // get all args and overall length
   114     size_t slen = 0;
   115     cx_for_n(i, count) {
   116         cxstring s = va_arg (ap, cxstring);
   117         strings[i] = s;
   118         slen += s.length;
   119     }
   121     // create new string
   122     cxmutstr result;
   123     result.ptr = cxMalloc(alloc, slen + 1);
   124     result.length = slen;
   125     if (result.ptr == NULL) abort();
   127     // concatenate strings
   128     size_t pos = 0;
   129     cx_for_n(i, count) {
   130         cxstring s = strings[i];
   131         memcpy(result.ptr + pos, s.ptr, s.length);
   132         pos += s.length;
   133     }
   135     // terminate string
   136     result.ptr[result.length] = '\0';
   138     // free temporary array
   139     free(strings);
   141     return result;
   142 }
   144 cxstring cx_strsubs(
   145         cxstring string,
   146         size_t start
   147 ) {
   148     return cx_strsubsl(string, start, string.length - start);
   149 }
   151 cxmutstr cx_strsubs_m(
   152         cxmutstr string,
   153         size_t start
   154 ) {
   155     return cx_strsubsl_m(string, start, string.length - start);
   156 }
   158 cxstring cx_strsubsl(
   159         cxstring string,
   160         size_t start,
   161         size_t length
   162 ) {
   163     if (start > string.length) {
   164         return (cxstring) {NULL, 0};
   165     }
   167     size_t rem_len = string.length - start;
   168     if (length > rem_len) {
   169         length = rem_len;
   170     }
   172     return (cxstring) {string.ptr + start, length};
   173 }
   175 cxmutstr cx_strsubsl_m(
   176         cxmutstr string,
   177         size_t start,
   178         size_t length
   179 ) {
   180     cxstring result = cx_strsubsl(cx_strcast(string), start, length);
   181     return (cxmutstr) {(char *) result.ptr, result.length};
   182 }
   184 cxstring cx_strchr(
   185         cxstring string,
   186         int chr
   187 ) {
   188     chr = 0xFF & chr;
   189     // TODO: improve by comparing multiple bytes at once
   190     cx_for_n(i, string.length) {
   191         if (string.ptr[i] == chr) {
   192             return cx_strsubs(string, i);
   193         }
   194     }
   195     return (cxstring) {NULL, 0};
   196 }
   198 cxmutstr cx_strchr_m(
   199         cxmutstr string,
   200         int chr
   201 ) {
   202     cxstring result = cx_strchr(cx_strcast(string), chr);
   203     return (cxmutstr) {(char *) result.ptr, result.length};
   204 }
   206 cxstring cx_strrchr(
   207         cxstring string,
   208         int chr
   209 ) {
   210     chr = 0xFF & chr;
   211     size_t i = string.length;
   212     while (i > 0) {
   213         i--;
   214         // TODO: improve by comparing multiple bytes at once
   215         if (string.ptr[i] == chr) {
   216             return cx_strsubs(string, i);
   217         }
   218     }
   219     return (cxstring) {NULL, 0};
   220 }
   222 cxmutstr cx_strrchr_m(
   223         cxmutstr string,
   224         int chr
   225 ) {
   226     cxstring result = cx_strrchr(cx_strcast(string), chr);
   227     return (cxmutstr) {(char *) result.ptr, result.length};
   228 }
   230 #define ptable_r(dest, useheap, ptable, index) (dest = useheap ? \
   231     ((size_t*)ptable)[index] : (size_t) ((uint8_t*)ptable)[index])
   233 #define ptable_w(useheap, ptable, index, src) do {\
   234     if (!useheap) ((uint8_t*)ptable)[index] = (uint8_t) src;\
   235     else ((size_t*)ptable)[index] = src;\
   236     } while (0)
   239 cxstring cx_strstr(
   240         cxstring haystack,
   241         cxstring needle
   242 ) {
   243     if (needle.length == 0) {
   244         return haystack;
   245     }
   247     /* optimize for single-char needles */
   248     if (needle.length == 1) {
   249         return cx_strchr(haystack, *needle.ptr);
   250     }
   252     /*
   253      * IMPORTANT:
   254      * Our prefix table contains the prefix length PLUS ONE
   255      * this is our decision, because we want to use the full range of size_t.
   256      * The original algorithm needs a (-1) at one single place,
   257      * and we want to avoid that.
   258      */
   260     /* static prefix table */
   261     static uint8_t s_prefix_table[512];
   263     /* check pattern length and use appropriate prefix table */
   264     /* if the pattern exceeds static prefix table, allocate on the heap */
   265     register int useheap = needle.length >= 512;
   266     register void *ptable = useheap ? calloc(needle.length + 1,
   267                                              sizeof(size_t)) : s_prefix_table;
   269     /* keep counter in registers */
   270     register size_t i, j;
   272     /* fill prefix table */
   273     i = 0;
   274     j = 0;
   275     ptable_w(useheap, ptable, i, j);
   276     while (i < needle.length) {
   277         while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
   278             ptable_r(j, useheap, ptable, j - 1);
   279         }
   280         i++;
   281         j++;
   282         ptable_w(useheap, ptable, i, j);
   283     }
   285     /* search */
   286     cxstring result = {NULL, 0};
   287     i = 0;
   288     j = 1;
   289     while (i < haystack.length) {
   290         while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
   291             ptable_r(j, useheap, ptable, j - 1);
   292         }
   293         i++;
   294         j++;
   295         if (j - 1 == needle.length) {
   296             size_t start = i - needle.length;
   297             result.ptr = haystack.ptr + start;
   298             result.length = haystack.length - start;
   299             break;
   300         }
   301     }
   303     /* if prefix table was allocated on the heap, free it */
   304     if (ptable != s_prefix_table) {
   305         free(ptable);
   306     }
   308     return result;
   309 }
   311 cxmutstr cx_strstr_m(
   312         cxmutstr haystack,
   313         cxstring needle
   314 ) {
   315     cxstring result = cx_strstr(cx_strcast(haystack), needle);
   316     return (cxmutstr) {(char *) result.ptr, result.length};
   317 }
   319 size_t cx_strsplit(
   320         cxstring string,
   321         cxstring delim,
   322         size_t limit,
   323         cxstring *output
   324 ) {
   325     /* special case: output limit is zero */
   326     if (limit == 0) return 0;
   328     /* special case: delimiter is empty */
   329     if (delim.length == 0) {
   330         output[0] = string;
   331         return 1;
   332     }
   334     /* special cases: delimiter is at least as large as the string */
   335     if (delim.length >= string.length) {
   336         /* exact match */
   337         if (cx_strcmp(string, delim) == 0) {
   338             output[0] = cx_strn(string.ptr, 0);
   339             output[1] = cx_strn(string.ptr + string.length, 0);
   340             return 2;
   341         } else /* no match possible */ {
   342             output[0] = string;
   343             return 1;
   344         }
   345     }
   347     size_t n = 0;
   348     cxstring curpos = string;
   349     while (1) {
   350         ++n;
   351         cxstring match = cx_strstr(curpos, delim);
   352         if (match.length > 0) {
   353             /* is the limit reached? */
   354             if (n < limit) {
   355                 /* copy the current string to the array */
   356                 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
   357                 output[n - 1] = item;
   358                 size_t processed = item.length + delim.length;
   359                 curpos.ptr += processed;
   360                 curpos.length -= processed;
   361             } else {
   362                 /* limit reached, copy the _full_ remaining string */
   363                 output[n - 1] = curpos;
   364                 break;
   365             }
   366         } else {
   367             /* no more matches, copy last string */
   368             output[n - 1] = curpos;
   369             break;
   370         }
   371     }
   373     return n;
   374 }
   376 size_t cx_strsplit_a(
   377         CxAllocator *allocator,
   378         cxstring string,
   379         cxstring delim,
   380         size_t limit,
   381         cxstring **output
   382 ) {
   383     /* find out how many splits we're going to make and allocate memory */
   384     size_t n = 0;
   385     cxstring curpos = string;
   386     while (1) {
   387         ++n;
   388         cxstring match = cx_strstr(curpos, delim);
   389         if (match.length > 0) {
   390             /* is the limit reached? */
   391             if (n < limit) {
   392                 size_t processed = match.ptr - curpos.ptr + delim.length;
   393                 curpos.ptr += processed;
   394                 curpos.length -= processed;
   395             } else {
   396                 /* limit reached */
   397                 break;
   398             }
   399         } else {
   400             /* no more matches */
   401             break;
   402         }
   403     }
   404     *output = cxCalloc(allocator, n, sizeof(cxstring));
   405     return cx_strsplit(string, delim, n, *output);
   406 }
   408 size_t cx_strsplit_m(
   409         cxmutstr string,
   410         cxstring delim,
   411         size_t limit,
   412         cxmutstr *output
   413 ) {
   414     return cx_strsplit(cx_strcast(string),
   415                        delim, limit, (cxstring *) output);
   416 }
   418 size_t cx_strsplit_ma(
   419         CxAllocator *allocator,
   420         cxmutstr string,
   421         cxstring delim,
   422         size_t limit,
   423         cxmutstr **output
   424 ) {
   425     return cx_strsplit_a(allocator, cx_strcast(string),
   426                          delim, limit, (cxstring **) output);
   427 }
   429 int cx_strcmp(
   430         cxstring s1,
   431         cxstring s2
   432 ) {
   433     if (s1.length == s2.length) {
   434         return memcmp(s1.ptr, s2.ptr, s1.length);
   435     } else if (s1.length > s2.length) {
   436         return 1;
   437     } else {
   438         return -1;
   439     }
   440 }
   442 int cx_strcasecmp(
   443         cxstring s1,
   444         cxstring s2
   445 ) {
   446     if (s1.length == s2.length) {
   447 #ifdef _WIN32
   448         return _strnicmp(s1.ptr, s2.ptr, s1.length);
   449 #else
   450         return strncasecmp(s1.ptr, s2.ptr, s1.length);
   451 #endif
   452     } else if (s1.length > s2.length) {
   453         return 1;
   454     } else {
   455         return -1;
   456     }
   457 }
   459 cxmutstr cx_strdup_a(
   460         CxAllocator *allocator,
   461         cxstring string
   462 ) {
   463     cxmutstr result = {
   464             cxMalloc(allocator, string.length + 1),
   465             string.length
   466     };
   467     if (result.ptr == NULL) {
   468         result.length = 0;
   469         return result;
   470     }
   471     memcpy(result.ptr, string.ptr, string.length);
   472     result.ptr[string.length] = '\0';
   473     return result;
   474 }
   476 cxstring cx_strtrim(cxstring string) {
   477     cxstring result = string;
   478     // TODO: optimize by comparing multiple bytes at once
   479     while (result.length > 0 && isspace(*result.ptr)) {
   480         result.ptr++;
   481         result.length--;
   482     }
   483     while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
   484         result.length--;
   485     }
   486     return result;
   487 }
   489 cxmutstr cx_strtrim_m(cxmutstr string) {
   490     cxstring result = cx_strtrim(cx_strcast(string));
   491     return (cxmutstr) {(char *) result.ptr, result.length};
   492 }
   494 bool cx_strprefix(
   495         cxstring string,
   496         cxstring prefix
   497 ) {
   498     if (string.length < prefix.length) return false;
   499     return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
   500 }
   502 bool cx_strsuffix(
   503         cxstring string,
   504         cxstring suffix
   505 ) {
   506     if (string.length < suffix.length) return false;
   507     return memcmp(string.ptr + string.length - suffix.length,
   508                   suffix.ptr, suffix.length) == 0;
   509 }
   511 bool cx_strcaseprefix(
   512         cxstring string,
   513         cxstring prefix
   514 ) {
   515     if (string.length < prefix.length) return false;
   516 #ifdef _WIN32
   517     return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
   518 #else
   519     return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
   520 #endif
   521 }
   523 bool cx_strcasesuffix(
   524         cxstring string,
   525         cxstring suffix
   526 ) {
   527     if (string.length < suffix.length) return false;
   528 #ifdef _WIN32
   529     return _strnicmp(string.ptr+string.length-suffix.length,
   530                   suffix.ptr, suffix.length) == 0;
   531 #else
   532     return strncasecmp(string.ptr + string.length - suffix.length,
   533                        suffix.ptr, suffix.length) == 0;
   534 #endif
   535 }
   537 void cx_strlower(cxmutstr string) {
   538     cx_for_n(i, string.length) {
   539         string.ptr[i] = tolower(string.ptr[i]);
   540     }
   541 }
   543 void cx_strupper(cxmutstr string) {
   544     cx_for_n(i, string.length) {
   545         string.ptr[i] = toupper(string.ptr[i]);
   546     }
   547 }
   549 #define REPLACE_INDEX_BUFFER_MAX 100
   551 struct cx_strreplace_ibuf {
   552     size_t *buf;
   553     struct cx_strreplace_ibuf *next;
   554     unsigned int len;
   555 };
   557 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
   558     while (buf) {
   559         struct cx_strreplace_ibuf *next = buf->next;
   560         free(buf->buf);
   561         free(buf);
   562         buf = next;
   563     }
   564 }
   566 cxmutstr cx_strreplacen_a(
   567         CxAllocator *allocator,
   568         cxstring str,
   569         cxstring pattern,
   570         cxstring replacement,
   571         size_t replmax
   572 ) {
   574     if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
   575         return cx_strdup_a(allocator, str);
   577     /* Compute expected buffer length */
   578     size_t ibufmax = str.length / pattern.length;
   579     size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
   580     if (ibuflen > REPLACE_INDEX_BUFFER_MAX) {
   581         ibuflen = REPLACE_INDEX_BUFFER_MAX;
   582     }
   584     /* Allocate first index buffer */
   585     struct cx_strreplace_ibuf *firstbuf, *curbuf;
   586     firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
   587     if (!firstbuf) return cx_mutstrn(NULL, 0);
   588     firstbuf->buf = calloc(ibuflen, sizeof(size_t));
   589     if (!firstbuf->buf) {
   590         free(firstbuf);
   591         return cx_mutstrn(NULL, 0);
   592     }
   594     /* Search occurrences */
   595     cxstring searchstr = str;
   596     size_t found = 0;
   597     do {
   598         cxstring match = cx_strstr(searchstr, pattern);
   599         if (match.length > 0) {
   600             /* Allocate next buffer in chain, if required */
   601             if (curbuf->len == ibuflen) {
   602                 struct cx_strreplace_ibuf *nextbuf =
   603                         calloc(1, sizeof(struct cx_strreplace_ibuf));
   604                 if (!nextbuf) {
   605                     cx_strrepl_free_ibuf(firstbuf);
   606                     return cx_mutstrn(NULL, 0);
   607                 }
   608                 nextbuf->buf = calloc(ibuflen, sizeof(size_t));
   609                 if (!nextbuf->buf) {
   610                     free(nextbuf);
   611                     cx_strrepl_free_ibuf(firstbuf);
   612                     return cx_mutstrn(NULL, 0);
   613                 }
   614                 curbuf->next = nextbuf;
   615                 curbuf = nextbuf;
   616             }
   618             /* Record match index */
   619             found++;
   620             size_t idx = match.ptr - str.ptr;
   621             curbuf->buf[curbuf->len++] = idx;
   622             searchstr.ptr = match.ptr + pattern.length;
   623             searchstr.length = str.length - idx - pattern.length;
   624         } else {
   625             break;
   626         }
   627     } while (searchstr.length > 0 && found < replmax);
   629     /* Allocate result string */
   630     cxmutstr result;
   631     {
   632         ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
   633         size_t rcount = 0;
   634         curbuf = firstbuf;
   635         do {
   636             rcount += curbuf->len;
   637             curbuf = curbuf->next;
   638         } while (curbuf);
   639         result.length = str.length + rcount * adjlen;
   640         result.ptr = cxMalloc(allocator, result.length + 1);
   641         if (!result.ptr) {
   642             cx_strrepl_free_ibuf(firstbuf);
   643             return cx_mutstrn(NULL, 0);
   644         }
   645     }
   647     /* Build result string */
   648     curbuf = firstbuf;
   649     size_t srcidx = 0;
   650     char *destptr = result.ptr;
   651     do {
   652         for (size_t i = 0; i < curbuf->len; i++) {
   653             /* Copy source part up to next match*/
   654             size_t idx = curbuf->buf[i];
   655             size_t srclen = idx - srcidx;
   656             if (srclen > 0) {
   657                 memcpy(destptr, str.ptr + srcidx, srclen);
   658                 destptr += srclen;
   659                 srcidx += srclen;
   660             }
   662             /* Copy the replacement and skip the source pattern */
   663             srcidx += pattern.length;
   664             memcpy(destptr, replacement.ptr, replacement.length);
   665             destptr += replacement.length;
   666         }
   667         curbuf = curbuf->next;
   668     } while (curbuf);
   669     memcpy(destptr, str.ptr + srcidx, str.length - srcidx);
   671     /* Result is guaranteed to be zero-terminated */
   672     result.ptr[result.length] = '\0';
   674     /* Free index buffer */
   675     cx_strrepl_free_ibuf(firstbuf);
   677     return result;
   678 }

mercurial