ucx: src/string.c@ec50abb285ad

     1 /*

     2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.

     3  *

     4  * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.

     5  *

     6  * Redistribution and use in source and binary forms, with or without

     7  * modification, are permitted provided that the following conditions are met:

     8  *

     9  *   1. Redistributions of source code must retain the above copyright

    10  *      notice, this list of conditions and the following disclaimer.

    11  *

    12  *   2. Redistributions in binary form must reproduce the above copyright

    13  *      notice, this list of conditions and the following disclaimer in the

    14  *      documentation and/or other materials provided with the distribution.

    15  *

    16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

    17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

    18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

    19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

    20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

    21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

    22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

    23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

    24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

    25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

    26  * POSSIBILITY OF SUCH DAMAGE.

    27  */

    29 #include "cx/string.h"

    30 #include "cx/utils.h"

    32 #include <string.h>

    33 #include <stdarg.h>

    34 #include <ctype.h>

    36 #ifndef _WIN32

    38 #include <strings.h> // for strncasecmp()

    40 #endif // _WIN32

    42 cxmutstr cx_mutstr(char *cstring) {

    43     return (cxmutstr) {cstring, strlen(cstring)};

    44 }

    46 cxmutstr cx_mutstrn(

    47         char *cstring,

    48         size_t length

    49 ) {

    50     return (cxmutstr) {cstring, length};

    51 }

    53 cxstring cx_str(const char *cstring) {

    54     return (cxstring) {cstring, strlen(cstring)};

    55 }

    57 cxstring cx_strn(

    58         const char *cstring,

    59         size_t length

    60 ) {

    61     return (cxstring) {cstring, length};

    62 }

    64 cxstring cx_strcast(cxmutstr str) {

    65     return (cxstring) {str.ptr, str.length};

    66 }

    68 void cx_strfree(cxmutstr *str) {

    69     free(str->ptr);

    70     str->ptr = NULL;

    71     str->length = 0;

    72 }

    74 void cx_strfree_a(

    75         CxAllocator *alloc,

    76         cxmutstr *str

    77 ) {

    78     cxFree(alloc, str->ptr);

    79     str->ptr = NULL;

    80     str->length = 0;

    81 }

    83 size_t cx_strlen(

    84         size_t count,

    85         ...

    86 ) {

    87     if (count == 0) return 0;

    89     va_list ap;

    90     va_start(ap, count);

    91     size_t size = 0;

    92     cx_for_n(i, count) {

    93         cxstring str = va_arg(ap, cxstring);

    94         size += str.length;

    95     }

    96     va_end(ap);

    98     return size;

    99 }

   101 cxmutstr cx_strcat_a(

   102         CxAllocator *alloc,

   103         size_t count,

   104         ...

   105 ) {

   106     cxstring *strings = calloc(count, sizeof(cxstring));

   107     if (!strings) abort();

   109     va_list ap;

   110     va_start(ap, count);

   112     // get all args and overall length

   113     size_t slen = 0;

   114     cx_for_n(i, count) {

   115         cxstring s = va_arg (ap, cxstring);

   116         strings[i] = s;

   117         slen += s.length;

   118     }

   120     // create new string

   121     cxmutstr result;

   122     result.ptr = cxMalloc(alloc, slen + 1);

   123     result.length = slen;

   124     if (result.ptr == NULL) abort();

   126     // concatenate strings

   127     size_t pos = 0;

   128     cx_for_n(i, count) {

   129         cxstring s = strings[i];

   130         memcpy(result.ptr + pos, s.ptr, s.length);

   131         pos += s.length;

   132     }

   134     // terminate string

   135     result.ptr[result.length] = '\0';

   137     // free temporary array

   138     free(strings);

   140     return result;

   141 }

   143 cxstring cx_strsubs(

   144         cxstring string,

   145         size_t start

   146 ) {

   147     return cx_strsubsl(string, start, string.length - start);

   148 }

   150 cxmutstr cx_strsubs_m(

   151         cxmutstr string,

   152         size_t start

   153 ) {

   154     return cx_strsubsl_m(string, start, string.length - start);

   155 }

   157 cxstring cx_strsubsl(

   158         cxstring string,

   159         size_t start,

   160         size_t length

   161 ) {

   162     if (start > string.length) {

   163         return (cxstring) {NULL, 0};

   164     }

   166     size_t rem_len = string.length - start;

   167     if (length > rem_len) {

   168         length = rem_len;

   169     }

   171     return (cxstring) {string.ptr + start, length};

   172 }

   174 cxmutstr cx_strsubsl_m(

   175         cxmutstr string,

   176         size_t start,

   177         size_t length

   178 ) {

   179     cxstring result = cx_strsubsl(cx_strcast(string), start, length);

   180     return (cxmutstr) {(char *) result.ptr, result.length};

   181 }

   183 cxstring cx_strchr(

   184         cxstring string,

   185         int chr

   186 ) {

   187     chr = 0xFF & chr;

   188     // TODO: improve by comparing multiple bytes at once

   189     cx_for_n(i, string.length) {

   190         if (string.ptr[i] == chr) {

   191             return cx_strsubs(string, i);

   192         }

   193     }

   194     return (cxstring) {NULL, 0};

   195 }

   197 cxmutstr cx_strchr_m(

   198         cxmutstr string,

   199         int chr

   200 ) {

   201     cxstring result = cx_strchr(cx_strcast(string), chr);

   202     return (cxmutstr) {(char *) result.ptr, result.length};

   203 }

   205 cxstring cx_strrchr(

   206         cxstring string,

   207         int chr

   208 ) {

   209     chr = 0xFF & chr;

   210     size_t i = string.length;

   211     while (i > 0) {

   212         i--;

   213         // TODO: improve by comparing multiple bytes at once

   214         if (string.ptr[i] == chr) {

   215             return cx_strsubs(string, i);

   216         }

   217     }

   218     return (cxstring) {NULL, 0};

   219 }

   221 cxmutstr cx_strrchr_m(

   222         cxmutstr string,

   223         int chr

   224 ) {

   225     cxstring result = cx_strrchr(cx_strcast(string), chr);

   226     return (cxmutstr) {(char *) result.ptr, result.length};

   227 }

   229 #ifndef CX_STRSTR_SBO_SIZE

   230 #define CX_STRSTR_SBO_SIZE 512

   231 #endif

   233 cxstring cx_strstr(

   234         cxstring haystack,

   235         cxstring needle

   236 ) {

   237     if (needle.length == 0) {

   238         return haystack;

   239     }

   241     // optimize for single-char needles

   242     if (needle.length == 1) {

   243         return cx_strchr(haystack, *needle.ptr);

   244     }

   246     /*

   247      * IMPORTANT:

   248      * Our prefix table contains the prefix length PLUS ONE

   249      * this is our decision, because we want to use the full range of size_t.

   250      * The original algorithm needs a (-1) at one single place,

   251      * and we want to avoid that.

   252      */

   254     // local prefix table

   255     size_t s_prefix_table[CX_STRSTR_SBO_SIZE];

   257     // check needle length and use appropriate prefix table

   258     // if the pattern exceeds static prefix table, allocate on the heap

   259     bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;

   260     register size_t *ptable = useheap ? calloc(needle.length + 1,

   261                                                sizeof(size_t)) : s_prefix_table;

   263     // keep counter in registers

   264     register size_t i, j;

   266     // fill prefix table

   267     i = 0;

   268     j = 0;

   269     ptable[i] = j;

   270     while (i < needle.length) {

   271         while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {

   272             j = ptable[j - 1];

   273         }

   274         i++;

   275         j++;

   276         ptable[i] = j;

   277     }

   279     // search

   280     cxstring result = {NULL, 0};

   281     i = 0;

   282     j = 1;

   283     while (i < haystack.length) {

   284         while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {

   285             j = ptable[j - 1];

   286         }

   287         i++;

   288         j++;

   289         if (j - 1 == needle.length) {

   290             size_t start = i - needle.length;

   291             result.ptr = haystack.ptr + start;

   292             result.length = haystack.length - start;

   293             break;

   294         }

   295     }

   297     // if prefix table was allocated on the heap, free it

   298     if (ptable != s_prefix_table) {

   299         free(ptable);

   300     }

   302     return result;

   303 }

   305 cxmutstr cx_strstr_m(

   306         cxmutstr haystack,

   307         cxstring needle

   308 ) {

   309     cxstring result = cx_strstr(cx_strcast(haystack), needle);

   310     return (cxmutstr) {(char *) result.ptr, result.length};

   311 }

   313 size_t cx_strsplit(

   314         cxstring string,

   315         cxstring delim,

   316         size_t limit,

   317         cxstring *output

   318 ) {

   319     // special case: output limit is zero

   320     if (limit == 0) return 0;

   322     // special case: delimiter is empty

   323     if (delim.length == 0) {

   324         output[0] = string;

   325         return 1;

   326     }

   328     // special cases: delimiter is at least as large as the string

   329     if (delim.length >= string.length) {

   330         // exact match

   331         if (cx_strcmp(string, delim) == 0) {

   332             output[0] = cx_strn(string.ptr, 0);

   333             output[1] = cx_strn(string.ptr + string.length, 0);

   334             return 2;

   335         } else {

   336             // no match possible

   337             output[0] = string;

   338             return 1;

   339         }

   340     }

   342     size_t n = 0;

   343     cxstring curpos = string;

   344     while (1) {

   345         ++n;

   346         cxstring match = cx_strstr(curpos, delim);

   347         if (match.length > 0) {

   348             // is the limit reached?

   349             if (n < limit) {

   350                 // copy the current string to the array

   351                 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);

   352                 output[n - 1] = item;

   353                 size_t processed = item.length + delim.length;

   354                 curpos.ptr += processed;

   355                 curpos.length -= processed;

   356             } else {

   357                 // limit reached, copy the _full_ remaining string

   358                 output[n - 1] = curpos;

   359                 break;

   360             }

   361         } else {

   362             // no more matches, copy last string

   363             output[n - 1] = curpos;

   364             break;

   365         }

   366     }

   368     return n;

   369 }

   371 size_t cx_strsplit_a(

   372         CxAllocator *allocator,

   373         cxstring string,

   374         cxstring delim,

   375         size_t limit,

   376         cxstring **output

   377 ) {

   378     // find out how many splits we're going to make and allocate memory

   379     size_t n = 0;

   380     cxstring curpos = string;

   381     while (1) {

   382         ++n;

   383         cxstring match = cx_strstr(curpos, delim);

   384         if (match.length > 0) {

   385             // is the limit reached?

   386             if (n < limit) {

   387                 size_t processed = match.ptr - curpos.ptr + delim.length;

   388                 curpos.ptr += processed;

   389                 curpos.length -= processed;

   390             } else {

   391                 // limit reached

   392                 break;

   393             }

   394         } else {

   395             // no more matches

   396             break;

   397         }

   398     }

   399     *output = cxCalloc(allocator, n, sizeof(cxstring));

   400     return cx_strsplit(string, delim, n, *output);

   401 }

   403 size_t cx_strsplit_m(

   404         cxmutstr string,

   405         cxstring delim,

   406         size_t limit,

   407         cxmutstr *output

   408 ) {

   409     return cx_strsplit(cx_strcast(string),

   410                        delim, limit, (cxstring *) output);

   411 }

   413 size_t cx_strsplit_ma(

   414         CxAllocator *allocator,

   415         cxmutstr string,

   416         cxstring delim,

   417         size_t limit,

   418         cxmutstr **output

   419 ) {

   420     return cx_strsplit_a(allocator, cx_strcast(string),

   421                          delim, limit, (cxstring **) output);

   422 }

   424 int cx_strcmp(

   425         cxstring s1,

   426         cxstring s2

   427 ) {

   428     if (s1.length == s2.length) {

   429         return memcmp(s1.ptr, s2.ptr, s1.length);

   430     } else if (s1.length > s2.length) {

   431         return 1;

   432     } else {

   433         return -1;

   434     }

   435 }

   437 int cx_strcasecmp(

   438         cxstring s1,

   439         cxstring s2

   440 ) {

   441     if (s1.length == s2.length) {

   442 #ifdef _WIN32

   443         return _strnicmp(s1.ptr, s2.ptr, s1.length);

   444 #else

   445         return strncasecmp(s1.ptr, s2.ptr, s1.length);

   446 #endif

   447     } else if (s1.length > s2.length) {

   448         return 1;

   449     } else {

   450         return -1;

   451     }

   452 }

   454 cxmutstr cx_strdup_a(

   455         CxAllocator *allocator,

   456         cxstring string

   457 ) {

   458     cxmutstr result = {

   459             cxMalloc(allocator, string.length + 1),

   460             string.length

   461     };

   462     if (result.ptr == NULL) {

   463         result.length = 0;

   464         return result;

   465     }

   466     memcpy(result.ptr, string.ptr, string.length);

   467     result.ptr[string.length] = '\0';

   468     return result;

   469 }

   471 cxstring cx_strtrim(cxstring string) {

   472     cxstring result = string;

   473     // TODO: optimize by comparing multiple bytes at once

   474     while (result.length > 0 && isspace(*result.ptr)) {

   475         result.ptr++;

   476         result.length--;

   477     }

   478     while (result.length > 0 && isspace(result.ptr[result.length - 1])) {

   479         result.length--;

   480     }

   481     return result;

   482 }

   484 cxmutstr cx_strtrim_m(cxmutstr string) {

   485     cxstring result = cx_strtrim(cx_strcast(string));

   486     return (cxmutstr) {(char *) result.ptr, result.length};

   487 }

   489 bool cx_strprefix(

   490         cxstring string,

   491         cxstring prefix

   492 ) {

   493     if (string.length < prefix.length) return false;

   494     return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;

   495 }

   497 bool cx_strsuffix(

   498         cxstring string,

   499         cxstring suffix

   500 ) {

   501     if (string.length < suffix.length) return false;

   502     return memcmp(string.ptr + string.length - suffix.length,

   503                   suffix.ptr, suffix.length) == 0;

   504 }

   506 bool cx_strcaseprefix(

   507         cxstring string,

   508         cxstring prefix

   509 ) {

   510     if (string.length < prefix.length) return false;

   511 #ifdef _WIN32

   512     return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;

   513 #else

   514     return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;

   515 #endif

   516 }

   518 bool cx_strcasesuffix(

   519         cxstring string,

   520         cxstring suffix

   521 ) {

   522     if (string.length < suffix.length) return false;

   523 #ifdef _WIN32

   524     return _strnicmp(string.ptr+string.length-suffix.length,

   525                   suffix.ptr, suffix.length) == 0;

   526 #else

   527     return strncasecmp(string.ptr + string.length - suffix.length,

   528                        suffix.ptr, suffix.length) == 0;

   529 #endif

   530 }

   532 void cx_strlower(cxmutstr string) {

   533     cx_for_n(i, string.length) {

   534         string.ptr[i] = (char) tolower(string.ptr[i]);

   535     }

   536 }

   538 void cx_strupper(cxmutstr string) {

   539     cx_for_n(i, string.length) {

   540         string.ptr[i] = (char) toupper(string.ptr[i]);

   541     }

   542 }

   544 #ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE

   545 #define CX_STRREPLACE_INDEX_BUFFER_SIZE 64

   546 #endif

   548 struct cx_strreplace_ibuf {

   549     size_t *buf;

   550     struct cx_strreplace_ibuf *next;

   551     unsigned int len;

   552 };

   554 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {

   555     while (buf) {

   556         struct cx_strreplace_ibuf *next = buf->next;

   557         free(buf->buf);

   558         free(buf);

   559         buf = next;

   560     }

   561 }

   563 cxmutstr cx_strreplacen_a(

   564         CxAllocator *allocator,

   565         cxstring str,

   566         cxstring pattern,

   567         cxstring replacement,

   568         size_t replmax

   569 ) {

   571     if (pattern.length == 0 || pattern.length > str.length || replmax == 0)

   572         return cx_strdup_a(allocator, str);

   574     // Compute expected buffer length

   575     size_t ibufmax = str.length / pattern.length;

   576     size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;

   577     if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) {

   578         ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE;

   579     }

   581     // Allocate first index buffer

   582     struct cx_strreplace_ibuf *firstbuf, *curbuf;

   583     firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));

   584     if (!firstbuf) return cx_mutstrn(NULL, 0);

   585     firstbuf->buf = calloc(ibuflen, sizeof(size_t));

   586     if (!firstbuf->buf) {

   587         free(firstbuf);

   588         return cx_mutstrn(NULL, 0);

   589     }

   591     // Search occurrences

   592     cxstring searchstr = str;

   593     size_t found = 0;

   594     do {

   595         cxstring match = cx_strstr(searchstr, pattern);

   596         if (match.length > 0) {

   597             // Allocate next buffer in chain, if required

   598             if (curbuf->len == ibuflen) {

   599                 struct cx_strreplace_ibuf *nextbuf =

   600                         calloc(1, sizeof(struct cx_strreplace_ibuf));

   601                 if (!nextbuf) {

   602                     cx_strrepl_free_ibuf(firstbuf);

   603                     return cx_mutstrn(NULL, 0);

   604                 }

   605                 nextbuf->buf = calloc(ibuflen, sizeof(size_t));

   606                 if (!nextbuf->buf) {

   607                     free(nextbuf);

   608                     cx_strrepl_free_ibuf(firstbuf);

   609                     return cx_mutstrn(NULL, 0);

   610                 }

   611                 curbuf->next = nextbuf;

   612                 curbuf = nextbuf;

   613             }

   615             // Record match index

   616             found++;

   617             size_t idx = match.ptr - str.ptr;

   618             curbuf->buf[curbuf->len++] = idx;

   619             searchstr.ptr = match.ptr + pattern.length;

   620             searchstr.length = str.length - idx - pattern.length;

   621         } else {

   622             break;

   623         }

   624     } while (searchstr.length > 0 && found < replmax);

   626     // Allocate result string

   627     cxmutstr result;

   628     {

   629         ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;

   630         size_t rcount = 0;

   631         curbuf = firstbuf;

   632         do {

   633             rcount += curbuf->len;

   634             curbuf = curbuf->next;

   635         } while (curbuf);

   636         result.length = str.length + rcount * adjlen;

   637         result.ptr = cxMalloc(allocator, result.length + 1);

   638         if (!result.ptr) {

   639             cx_strrepl_free_ibuf(firstbuf);

   640             return cx_mutstrn(NULL, 0);

   641         }

   642     }

   644     // Build result string

   645     curbuf = firstbuf;

   646     size_t srcidx = 0;

   647     char *destptr = result.ptr;

   648     do {

   649         for (size_t i = 0; i < curbuf->len; i++) {

   650             // Copy source part up to next match

   651             size_t idx = curbuf->buf[i];

   652             size_t srclen = idx - srcidx;

   653             if (srclen > 0) {

   654                 memcpy(destptr, str.ptr + srcidx, srclen);

   655                 destptr += srclen;

   656                 srcidx += srclen;

   657             }

   659             // Copy the replacement and skip the source pattern

   660             srcidx += pattern.length;

   661             memcpy(destptr, replacement.ptr, replacement.length);

   662             destptr += replacement.length;

   663         }

   664         curbuf = curbuf->next;

   665     } while (curbuf);

   666     memcpy(destptr, str.ptr + srcidx, str.length - srcidx);

   668     // Result is guaranteed to be zero-terminated

   669     result.ptr[result.length] = '\0';

   671     // Free index buffer

   672     cx_strrepl_free_ibuf(firstbuf);

   674     return result;

   675 }

   677 CxStrtokCtx cx_strtok(

   678         cxstring str,

   679         cxstring delim,

   680         size_t limit

   681 ) {

   682     CxStrtokCtx ctx;

   683     ctx.str = str;

   684     ctx.delim = delim;

   685     ctx.limit = limit;

   686     ctx.pos = 0;

   687     ctx.next_pos = 0;

   688     ctx.delim_pos = 0;

   689     ctx.found = 0;

   690     ctx.delim_more = NULL;

   691     ctx.delim_more_count = 0;

   692     return ctx;

   693 }

   695 CxStrtokCtx cx_strtok_m(

   696         cxmutstr str,

   697         cxstring delim,

   698         size_t limit

   699 ) {

   700     return cx_strtok(cx_strcast(str), delim, limit);

   701 }

   703 bool cx_strtok_next(

   704         CxStrtokCtx *ctx,

   705         cxstring *token

   706 ) {

   707     // abortion criteria

   708     if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) {

   709         return false;

   710     }

   712     // determine the search start

   713     cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos);

   715     // search the next delimiter

   716     cxstring delim = cx_strstr(haystack, ctx->delim);

   718     // if found, make delim capture exactly the delimiter

   719     if (delim.length > 0) {

   720         delim.length = ctx->delim.length;

   721     }

   723     // if more delimiters are specified, check them now

   724     if (ctx->delim_more_count > 0) {

   725         cx_for_n(i, ctx->delim_more_count) {

   726             cxstring d = cx_strstr(haystack, ctx->delim_more[i]);

   727             if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) {

   728                 delim.ptr = d.ptr;

   729                 delim.length = ctx->delim_more[i].length;

   730             }

   731         }

   732     }

   734     // store the token information and adjust the context

   735     ctx->found++;

   736     ctx->pos = ctx->next_pos;

   737     token->ptr = &ctx->str.ptr[ctx->pos];

   738     ctx->delim_pos = delim.length == 0 ?

   739                      ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr);

   740     token->length = ctx->delim_pos - ctx->pos;

   741     ctx->next_pos = ctx->delim_pos + delim.length;

   743     return true;

   744 }

   746 bool cx_strtok_next_m(

   747         CxStrtokCtx *ctx,

   748         cxmutstr *token

   749 ) {

   750     return cx_strtok_next(ctx, (cxstring *) token);

   751 }

   753 void cx_strtok_delim(

   754         CxStrtokCtx *ctx,

   755         cxstring const *delim,

   756         size_t count

   757 ) {

   758     ctx->delim_more = delim;

   759     ctx->delim_more_count = count;

   760 }

Mercurial > hg > ucx / file revision

src/string.c@ec50abb285ad

src/string.c