src/string.c

Thu, 02 Feb 2023 20:25:34 +0100

author
Mike Becker <universe@uap-core.de>
date
Thu, 02 Feb 2023 20:25:34 +0100
changeset 645
ec50abb285ad
parent 643
5700ba9154ab
child 657
3eeadf666d6b
permissions
-rw-r--r--

add strtok API - fixes #220

     1 /*
     2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
     3  *
     4  * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
     5  *
     6  * Redistribution and use in source and binary forms, with or without
     7  * modification, are permitted provided that the following conditions are met:
     8  *
     9  *   1. Redistributions of source code must retain the above copyright
    10  *      notice, this list of conditions and the following disclaimer.
    11  *
    12  *   2. Redistributions in binary form must reproduce the above copyright
    13  *      notice, this list of conditions and the following disclaimer in the
    14  *      documentation and/or other materials provided with the distribution.
    15  *
    16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
    20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    26  * POSSIBILITY OF SUCH DAMAGE.
    27  */
    29 #include "cx/string.h"
    30 #include "cx/utils.h"
    32 #include <string.h>
    33 #include <stdarg.h>
    34 #include <ctype.h>
    36 #ifndef _WIN32
    38 #include <strings.h> // for strncasecmp()
    40 #endif // _WIN32
    42 cxmutstr cx_mutstr(char *cstring) {
    43     return (cxmutstr) {cstring, strlen(cstring)};
    44 }
    46 cxmutstr cx_mutstrn(
    47         char *cstring,
    48         size_t length
    49 ) {
    50     return (cxmutstr) {cstring, length};
    51 }
    53 cxstring cx_str(const char *cstring) {
    54     return (cxstring) {cstring, strlen(cstring)};
    55 }
    57 cxstring cx_strn(
    58         const char *cstring,
    59         size_t length
    60 ) {
    61     return (cxstring) {cstring, length};
    62 }
    64 cxstring cx_strcast(cxmutstr str) {
    65     return (cxstring) {str.ptr, str.length};
    66 }
    68 void cx_strfree(cxmutstr *str) {
    69     free(str->ptr);
    70     str->ptr = NULL;
    71     str->length = 0;
    72 }
    74 void cx_strfree_a(
    75         CxAllocator *alloc,
    76         cxmutstr *str
    77 ) {
    78     cxFree(alloc, str->ptr);
    79     str->ptr = NULL;
    80     str->length = 0;
    81 }
    83 size_t cx_strlen(
    84         size_t count,
    85         ...
    86 ) {
    87     if (count == 0) return 0;
    89     va_list ap;
    90     va_start(ap, count);
    91     size_t size = 0;
    92     cx_for_n(i, count) {
    93         cxstring str = va_arg(ap, cxstring);
    94         size += str.length;
    95     }
    96     va_end(ap);
    98     return size;
    99 }
   101 cxmutstr cx_strcat_a(
   102         CxAllocator *alloc,
   103         size_t count,
   104         ...
   105 ) {
   106     cxstring *strings = calloc(count, sizeof(cxstring));
   107     if (!strings) abort();
   109     va_list ap;
   110     va_start(ap, count);
   112     // get all args and overall length
   113     size_t slen = 0;
   114     cx_for_n(i, count) {
   115         cxstring s = va_arg (ap, cxstring);
   116         strings[i] = s;
   117         slen += s.length;
   118     }
   120     // create new string
   121     cxmutstr result;
   122     result.ptr = cxMalloc(alloc, slen + 1);
   123     result.length = slen;
   124     if (result.ptr == NULL) abort();
   126     // concatenate strings
   127     size_t pos = 0;
   128     cx_for_n(i, count) {
   129         cxstring s = strings[i];
   130         memcpy(result.ptr + pos, s.ptr, s.length);
   131         pos += s.length;
   132     }
   134     // terminate string
   135     result.ptr[result.length] = '\0';
   137     // free temporary array
   138     free(strings);
   140     return result;
   141 }
   143 cxstring cx_strsubs(
   144         cxstring string,
   145         size_t start
   146 ) {
   147     return cx_strsubsl(string, start, string.length - start);
   148 }
   150 cxmutstr cx_strsubs_m(
   151         cxmutstr string,
   152         size_t start
   153 ) {
   154     return cx_strsubsl_m(string, start, string.length - start);
   155 }
   157 cxstring cx_strsubsl(
   158         cxstring string,
   159         size_t start,
   160         size_t length
   161 ) {
   162     if (start > string.length) {
   163         return (cxstring) {NULL, 0};
   164     }
   166     size_t rem_len = string.length - start;
   167     if (length > rem_len) {
   168         length = rem_len;
   169     }
   171     return (cxstring) {string.ptr + start, length};
   172 }
   174 cxmutstr cx_strsubsl_m(
   175         cxmutstr string,
   176         size_t start,
   177         size_t length
   178 ) {
   179     cxstring result = cx_strsubsl(cx_strcast(string), start, length);
   180     return (cxmutstr) {(char *) result.ptr, result.length};
   181 }
   183 cxstring cx_strchr(
   184         cxstring string,
   185         int chr
   186 ) {
   187     chr = 0xFF & chr;
   188     // TODO: improve by comparing multiple bytes at once
   189     cx_for_n(i, string.length) {
   190         if (string.ptr[i] == chr) {
   191             return cx_strsubs(string, i);
   192         }
   193     }
   194     return (cxstring) {NULL, 0};
   195 }
   197 cxmutstr cx_strchr_m(
   198         cxmutstr string,
   199         int chr
   200 ) {
   201     cxstring result = cx_strchr(cx_strcast(string), chr);
   202     return (cxmutstr) {(char *) result.ptr, result.length};
   203 }
   205 cxstring cx_strrchr(
   206         cxstring string,
   207         int chr
   208 ) {
   209     chr = 0xFF & chr;
   210     size_t i = string.length;
   211     while (i > 0) {
   212         i--;
   213         // TODO: improve by comparing multiple bytes at once
   214         if (string.ptr[i] == chr) {
   215             return cx_strsubs(string, i);
   216         }
   217     }
   218     return (cxstring) {NULL, 0};
   219 }
   221 cxmutstr cx_strrchr_m(
   222         cxmutstr string,
   223         int chr
   224 ) {
   225     cxstring result = cx_strrchr(cx_strcast(string), chr);
   226     return (cxmutstr) {(char *) result.ptr, result.length};
   227 }
   229 #ifndef CX_STRSTR_SBO_SIZE
   230 #define CX_STRSTR_SBO_SIZE 512
   231 #endif
   233 cxstring cx_strstr(
   234         cxstring haystack,
   235         cxstring needle
   236 ) {
   237     if (needle.length == 0) {
   238         return haystack;
   239     }
   241     // optimize for single-char needles
   242     if (needle.length == 1) {
   243         return cx_strchr(haystack, *needle.ptr);
   244     }
   246     /*
   247      * IMPORTANT:
   248      * Our prefix table contains the prefix length PLUS ONE
   249      * this is our decision, because we want to use the full range of size_t.
   250      * The original algorithm needs a (-1) at one single place,
   251      * and we want to avoid that.
   252      */
   254     // local prefix table
   255     size_t s_prefix_table[CX_STRSTR_SBO_SIZE];
   257     // check needle length and use appropriate prefix table
   258     // if the pattern exceeds static prefix table, allocate on the heap
   259     bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;
   260     register size_t *ptable = useheap ? calloc(needle.length + 1,
   261                                                sizeof(size_t)) : s_prefix_table;
   263     // keep counter in registers
   264     register size_t i, j;
   266     // fill prefix table
   267     i = 0;
   268     j = 0;
   269     ptable[i] = j;
   270     while (i < needle.length) {
   271         while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
   272             j = ptable[j - 1];
   273         }
   274         i++;
   275         j++;
   276         ptable[i] = j;
   277     }
   279     // search
   280     cxstring result = {NULL, 0};
   281     i = 0;
   282     j = 1;
   283     while (i < haystack.length) {
   284         while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
   285             j = ptable[j - 1];
   286         }
   287         i++;
   288         j++;
   289         if (j - 1 == needle.length) {
   290             size_t start = i - needle.length;
   291             result.ptr = haystack.ptr + start;
   292             result.length = haystack.length - start;
   293             break;
   294         }
   295     }
   297     // if prefix table was allocated on the heap, free it
   298     if (ptable != s_prefix_table) {
   299         free(ptable);
   300     }
   302     return result;
   303 }
   305 cxmutstr cx_strstr_m(
   306         cxmutstr haystack,
   307         cxstring needle
   308 ) {
   309     cxstring result = cx_strstr(cx_strcast(haystack), needle);
   310     return (cxmutstr) {(char *) result.ptr, result.length};
   311 }
   313 size_t cx_strsplit(
   314         cxstring string,
   315         cxstring delim,
   316         size_t limit,
   317         cxstring *output
   318 ) {
   319     // special case: output limit is zero
   320     if (limit == 0) return 0;
   322     // special case: delimiter is empty
   323     if (delim.length == 0) {
   324         output[0] = string;
   325         return 1;
   326     }
   328     // special cases: delimiter is at least as large as the string
   329     if (delim.length >= string.length) {
   330         // exact match
   331         if (cx_strcmp(string, delim) == 0) {
   332             output[0] = cx_strn(string.ptr, 0);
   333             output[1] = cx_strn(string.ptr + string.length, 0);
   334             return 2;
   335         } else {
   336             // no match possible
   337             output[0] = string;
   338             return 1;
   339         }
   340     }
   342     size_t n = 0;
   343     cxstring curpos = string;
   344     while (1) {
   345         ++n;
   346         cxstring match = cx_strstr(curpos, delim);
   347         if (match.length > 0) {
   348             // is the limit reached?
   349             if (n < limit) {
   350                 // copy the current string to the array
   351                 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
   352                 output[n - 1] = item;
   353                 size_t processed = item.length + delim.length;
   354                 curpos.ptr += processed;
   355                 curpos.length -= processed;
   356             } else {
   357                 // limit reached, copy the _full_ remaining string
   358                 output[n - 1] = curpos;
   359                 break;
   360             }
   361         } else {
   362             // no more matches, copy last string
   363             output[n - 1] = curpos;
   364             break;
   365         }
   366     }
   368     return n;
   369 }
   371 size_t cx_strsplit_a(
   372         CxAllocator *allocator,
   373         cxstring string,
   374         cxstring delim,
   375         size_t limit,
   376         cxstring **output
   377 ) {
   378     // find out how many splits we're going to make and allocate memory
   379     size_t n = 0;
   380     cxstring curpos = string;
   381     while (1) {
   382         ++n;
   383         cxstring match = cx_strstr(curpos, delim);
   384         if (match.length > 0) {
   385             // is the limit reached?
   386             if (n < limit) {
   387                 size_t processed = match.ptr - curpos.ptr + delim.length;
   388                 curpos.ptr += processed;
   389                 curpos.length -= processed;
   390             } else {
   391                 // limit reached
   392                 break;
   393             }
   394         } else {
   395             // no more matches
   396             break;
   397         }
   398     }
   399     *output = cxCalloc(allocator, n, sizeof(cxstring));
   400     return cx_strsplit(string, delim, n, *output);
   401 }
   403 size_t cx_strsplit_m(
   404         cxmutstr string,
   405         cxstring delim,
   406         size_t limit,
   407         cxmutstr *output
   408 ) {
   409     return cx_strsplit(cx_strcast(string),
   410                        delim, limit, (cxstring *) output);
   411 }
   413 size_t cx_strsplit_ma(
   414         CxAllocator *allocator,
   415         cxmutstr string,
   416         cxstring delim,
   417         size_t limit,
   418         cxmutstr **output
   419 ) {
   420     return cx_strsplit_a(allocator, cx_strcast(string),
   421                          delim, limit, (cxstring **) output);
   422 }
   424 int cx_strcmp(
   425         cxstring s1,
   426         cxstring s2
   427 ) {
   428     if (s1.length == s2.length) {
   429         return memcmp(s1.ptr, s2.ptr, s1.length);
   430     } else if (s1.length > s2.length) {
   431         return 1;
   432     } else {
   433         return -1;
   434     }
   435 }
   437 int cx_strcasecmp(
   438         cxstring s1,
   439         cxstring s2
   440 ) {
   441     if (s1.length == s2.length) {
   442 #ifdef _WIN32
   443         return _strnicmp(s1.ptr, s2.ptr, s1.length);
   444 #else
   445         return strncasecmp(s1.ptr, s2.ptr, s1.length);
   446 #endif
   447     } else if (s1.length > s2.length) {
   448         return 1;
   449     } else {
   450         return -1;
   451     }
   452 }
   454 cxmutstr cx_strdup_a(
   455         CxAllocator *allocator,
   456         cxstring string
   457 ) {
   458     cxmutstr result = {
   459             cxMalloc(allocator, string.length + 1),
   460             string.length
   461     };
   462     if (result.ptr == NULL) {
   463         result.length = 0;
   464         return result;
   465     }
   466     memcpy(result.ptr, string.ptr, string.length);
   467     result.ptr[string.length] = '\0';
   468     return result;
   469 }
   471 cxstring cx_strtrim(cxstring string) {
   472     cxstring result = string;
   473     // TODO: optimize by comparing multiple bytes at once
   474     while (result.length > 0 && isspace(*result.ptr)) {
   475         result.ptr++;
   476         result.length--;
   477     }
   478     while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
   479         result.length--;
   480     }
   481     return result;
   482 }
   484 cxmutstr cx_strtrim_m(cxmutstr string) {
   485     cxstring result = cx_strtrim(cx_strcast(string));
   486     return (cxmutstr) {(char *) result.ptr, result.length};
   487 }
   489 bool cx_strprefix(
   490         cxstring string,
   491         cxstring prefix
   492 ) {
   493     if (string.length < prefix.length) return false;
   494     return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
   495 }
   497 bool cx_strsuffix(
   498         cxstring string,
   499         cxstring suffix
   500 ) {
   501     if (string.length < suffix.length) return false;
   502     return memcmp(string.ptr + string.length - suffix.length,
   503                   suffix.ptr, suffix.length) == 0;
   504 }
   506 bool cx_strcaseprefix(
   507         cxstring string,
   508         cxstring prefix
   509 ) {
   510     if (string.length < prefix.length) return false;
   511 #ifdef _WIN32
   512     return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
   513 #else
   514     return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
   515 #endif
   516 }
   518 bool cx_strcasesuffix(
   519         cxstring string,
   520         cxstring suffix
   521 ) {
   522     if (string.length < suffix.length) return false;
   523 #ifdef _WIN32
   524     return _strnicmp(string.ptr+string.length-suffix.length,
   525                   suffix.ptr, suffix.length) == 0;
   526 #else
   527     return strncasecmp(string.ptr + string.length - suffix.length,
   528                        suffix.ptr, suffix.length) == 0;
   529 #endif
   530 }
   532 void cx_strlower(cxmutstr string) {
   533     cx_for_n(i, string.length) {
   534         string.ptr[i] = (char) tolower(string.ptr[i]);
   535     }
   536 }
   538 void cx_strupper(cxmutstr string) {
   539     cx_for_n(i, string.length) {
   540         string.ptr[i] = (char) toupper(string.ptr[i]);
   541     }
   542 }
   544 #ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE
   545 #define CX_STRREPLACE_INDEX_BUFFER_SIZE 64
   546 #endif
   548 struct cx_strreplace_ibuf {
   549     size_t *buf;
   550     struct cx_strreplace_ibuf *next;
   551     unsigned int len;
   552 };
   554 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
   555     while (buf) {
   556         struct cx_strreplace_ibuf *next = buf->next;
   557         free(buf->buf);
   558         free(buf);
   559         buf = next;
   560     }
   561 }
   563 cxmutstr cx_strreplacen_a(
   564         CxAllocator *allocator,
   565         cxstring str,
   566         cxstring pattern,
   567         cxstring replacement,
   568         size_t replmax
   569 ) {
   571     if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
   572         return cx_strdup_a(allocator, str);
   574     // Compute expected buffer length
   575     size_t ibufmax = str.length / pattern.length;
   576     size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
   577     if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) {
   578         ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE;
   579     }
   581     // Allocate first index buffer
   582     struct cx_strreplace_ibuf *firstbuf, *curbuf;
   583     firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
   584     if (!firstbuf) return cx_mutstrn(NULL, 0);
   585     firstbuf->buf = calloc(ibuflen, sizeof(size_t));
   586     if (!firstbuf->buf) {
   587         free(firstbuf);
   588         return cx_mutstrn(NULL, 0);
   589     }
   591     // Search occurrences
   592     cxstring searchstr = str;
   593     size_t found = 0;
   594     do {
   595         cxstring match = cx_strstr(searchstr, pattern);
   596         if (match.length > 0) {
   597             // Allocate next buffer in chain, if required
   598             if (curbuf->len == ibuflen) {
   599                 struct cx_strreplace_ibuf *nextbuf =
   600                         calloc(1, sizeof(struct cx_strreplace_ibuf));
   601                 if (!nextbuf) {
   602                     cx_strrepl_free_ibuf(firstbuf);
   603                     return cx_mutstrn(NULL, 0);
   604                 }
   605                 nextbuf->buf = calloc(ibuflen, sizeof(size_t));
   606                 if (!nextbuf->buf) {
   607                     free(nextbuf);
   608                     cx_strrepl_free_ibuf(firstbuf);
   609                     return cx_mutstrn(NULL, 0);
   610                 }
   611                 curbuf->next = nextbuf;
   612                 curbuf = nextbuf;
   613             }
   615             // Record match index
   616             found++;
   617             size_t idx = match.ptr - str.ptr;
   618             curbuf->buf[curbuf->len++] = idx;
   619             searchstr.ptr = match.ptr + pattern.length;
   620             searchstr.length = str.length - idx - pattern.length;
   621         } else {
   622             break;
   623         }
   624     } while (searchstr.length > 0 && found < replmax);
   626     // Allocate result string
   627     cxmutstr result;
   628     {
   629         ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
   630         size_t rcount = 0;
   631         curbuf = firstbuf;
   632         do {
   633             rcount += curbuf->len;
   634             curbuf = curbuf->next;
   635         } while (curbuf);
   636         result.length = str.length + rcount * adjlen;
   637         result.ptr = cxMalloc(allocator, result.length + 1);
   638         if (!result.ptr) {
   639             cx_strrepl_free_ibuf(firstbuf);
   640             return cx_mutstrn(NULL, 0);
   641         }
   642     }
   644     // Build result string
   645     curbuf = firstbuf;
   646     size_t srcidx = 0;
   647     char *destptr = result.ptr;
   648     do {
   649         for (size_t i = 0; i < curbuf->len; i++) {
   650             // Copy source part up to next match
   651             size_t idx = curbuf->buf[i];
   652             size_t srclen = idx - srcidx;
   653             if (srclen > 0) {
   654                 memcpy(destptr, str.ptr + srcidx, srclen);
   655                 destptr += srclen;
   656                 srcidx += srclen;
   657             }
   659             // Copy the replacement and skip the source pattern
   660             srcidx += pattern.length;
   661             memcpy(destptr, replacement.ptr, replacement.length);
   662             destptr += replacement.length;
   663         }
   664         curbuf = curbuf->next;
   665     } while (curbuf);
   666     memcpy(destptr, str.ptr + srcidx, str.length - srcidx);
   668     // Result is guaranteed to be zero-terminated
   669     result.ptr[result.length] = '\0';
   671     // Free index buffer
   672     cx_strrepl_free_ibuf(firstbuf);
   674     return result;
   675 }
   677 CxStrtokCtx cx_strtok(
   678         cxstring str,
   679         cxstring delim,
   680         size_t limit
   681 ) {
   682     CxStrtokCtx ctx;
   683     ctx.str = str;
   684     ctx.delim = delim;
   685     ctx.limit = limit;
   686     ctx.pos = 0;
   687     ctx.next_pos = 0;
   688     ctx.delim_pos = 0;
   689     ctx.found = 0;
   690     ctx.delim_more = NULL;
   691     ctx.delim_more_count = 0;
   692     return ctx;
   693 }
   695 CxStrtokCtx cx_strtok_m(
   696         cxmutstr str,
   697         cxstring delim,
   698         size_t limit
   699 ) {
   700     return cx_strtok(cx_strcast(str), delim, limit);
   701 }
   703 bool cx_strtok_next(
   704         CxStrtokCtx *ctx,
   705         cxstring *token
   706 ) {
   707     // abortion criteria
   708     if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) {
   709         return false;
   710     }
   712     // determine the search start
   713     cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos);
   715     // search the next delimiter
   716     cxstring delim = cx_strstr(haystack, ctx->delim);
   718     // if found, make delim capture exactly the delimiter
   719     if (delim.length > 0) {
   720         delim.length = ctx->delim.length;
   721     }
   723     // if more delimiters are specified, check them now
   724     if (ctx->delim_more_count > 0) {
   725         cx_for_n(i, ctx->delim_more_count) {
   726             cxstring d = cx_strstr(haystack, ctx->delim_more[i]);
   727             if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) {
   728                 delim.ptr = d.ptr;
   729                 delim.length = ctx->delim_more[i].length;
   730             }
   731         }
   732     }
   734     // store the token information and adjust the context
   735     ctx->found++;
   736     ctx->pos = ctx->next_pos;
   737     token->ptr = &ctx->str.ptr[ctx->pos];
   738     ctx->delim_pos = delim.length == 0 ?
   739                      ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr);
   740     token->length = ctx->delim_pos - ctx->pos;
   741     ctx->next_pos = ctx->delim_pos + delim.length;
   743     return true;
   744 }
   746 bool cx_strtok_next_m(
   747         CxStrtokCtx *ctx,
   748         cxmutstr *token
   749 ) {
   750     return cx_strtok_next(ctx, (cxstring *) token);
   751 }
   753 void cx_strtok_delim(
   754         CxStrtokCtx *ctx,
   755         cxstring const *delim,
   756         size_t count
   757 ) {
   758     ctx->delim_more = delim;
   759     ctx->delim_more_count = count;
   760 }

mercurial