ucx/string.c

Thu, 23 Feb 2017 14:30:12 +0100

author
Mike Becker <universe@uap-core.de>
date
Thu, 23 Feb 2017 14:30:12 +0100
changeset 236
ffc6d0910342
parent 235
7cf1e41833a2
child 237
5ba9de6361ff
permissions
-rw-r--r--

improves sstrstr function by using KMP string search algorithm

     1 /*
     2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
     3  *
     4  * Copyright 2016 Olaf Wintermann. All rights reserved.
     5  *
     6  * Redistribution and use in source and binary forms, with or without
     7  * modification, are permitted provided that the following conditions are met:
     8  *
     9  *   1. Redistributions of source code must retain the above copyright
    10  *      notice, this list of conditions and the following disclaimer.
    11  *
    12  *   2. Redistributions in binary form must reproduce the above copyright
    13  *      notice, this list of conditions and the following disclaimer in the
    14  *      documentation and/or other materials provided with the distribution.
    15  *
    16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
    20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    26  * POSSIBILITY OF SUCH DAMAGE.
    27  */
    29 #include <stdlib.h>
    30 #include <string.h>
    31 #include <stdarg.h>
    32 #include <stdint.h>
    33 #include <ctype.h>
    35 #include "string.h"
    36 #include "allocator.h"
    38 sstr_t sstr(char *cstring) {
    39     sstr_t string;
    40     string.ptr = cstring;
    41     string.length = strlen(cstring);
    42     return string;
    43 }
    45 sstr_t sstrn(char *cstring, size_t length) {
    46     sstr_t string;
    47     string.ptr = cstring;
    48     string.length = length;
    49     return string;
    50 }
    52 size_t sstrnlen(size_t n, sstr_t s, ...) {
    53     va_list ap;
    54     size_t size = s.length;
    55     va_start(ap, s);
    57     for (size_t i = 1 ; i < n ; i++) {
    58         sstr_t str = va_arg(ap, sstr_t);
    59         size += str.length;
    60     }
    61     va_end(ap);
    63     return size;
    64 }
    66 static sstr_t sstrvcat_a(
    67         UcxAllocator *a,
    68         size_t count,
    69         sstr_t s1,
    70         sstr_t s2,
    71         va_list ap) {
    72     sstr_t str;
    73     str.ptr = NULL;
    74     str.length = 0;
    75     if(count < 2) {
    76         return str;
    77     }
    79     sstr_t *strings = (sstr_t*) calloc(count, sizeof(sstr_t));
    80     if(!strings) {
    81         return str;
    82     }
    84     // get all args and overall length
    85     strings[0] = s1;
    86     strings[1] = s2;
    87     size_t strlen = s1.length + s2.length;
    88     for (size_t i=2;i<count;i++) {
    89         sstr_t s = va_arg (ap, sstr_t);
    90         strings[i] = s;
    91         strlen += s.length;
    92     }
    94     // create new string
    95     str.ptr = (char*) almalloc(a, strlen + 1);
    96     str.length = strlen;
    97     if(!str.ptr) {
    98         free(strings);
    99         str.length = 0;
   100         return str;
   101     }
   103     // concatenate strings
   104     size_t pos = 0;
   105     for (size_t i=0;i<count;i++) {
   106         sstr_t s = strings[i];
   107         memcpy(str.ptr + pos, s.ptr, s.length);
   108         pos += s.length;
   109     }
   111     str.ptr[str.length] = '\0';
   113     free(strings);
   115     return str;
   116 }
   118 sstr_t sstrcat(size_t count, sstr_t s1, sstr_t s2, ...) {
   119     va_list ap;
   120     va_start(ap, s2);
   121     sstr_t s = sstrvcat_a(ucx_default_allocator(), count, s1, s2, ap);
   122     va_end(ap);
   123     return s;
   124 }
   126 sstr_t sstrcat_a(UcxAllocator *a, size_t count, sstr_t s1, sstr_t s2, ...) {
   127     va_list ap;
   128     va_start(ap, s2);
   129     sstr_t s = sstrvcat_a(a, count, s1, s2, ap);
   130     va_end(ap);
   131     return s;
   132 }
   134 sstr_t sstrsubs(sstr_t s, size_t start) {
   135     return sstrsubsl (s, start, s.length-start);
   136 }
   138 sstr_t sstrsubsl(sstr_t s, size_t start, size_t length) {
   139     sstr_t new_sstr;
   140     if (start >= s.length) {
   141         new_sstr.ptr = NULL;
   142         new_sstr.length = 0;
   143     } else {
   144         if (length > s.length-start) {
   145             length = s.length-start;
   146         }
   147         new_sstr.ptr = &s.ptr[start];
   148         new_sstr.length = length;
   149     }
   150     return new_sstr;
   151 }
   153 sstr_t sstrchr(sstr_t s, int c) {
   154     for(size_t i=0;i<s.length;i++) {
   155         if(s.ptr[i] == c) {
   156             return sstrsubs(s, i);
   157         }
   158     }
   159     sstr_t n;
   160     n.ptr = NULL;
   161     n.length = 0;
   162     return n;
   163 }
   165 sstr_t sstrrchr(sstr_t s, int c) {
   166     if (s.length > 0) {
   167         for(size_t i=s.length;i>0;i--) {
   168             if(s.ptr[i-1] == c) {
   169                 return sstrsubs(s, i-1);
   170             }
   171         }
   172     }
   173     sstr_t n;
   174     n.ptr = NULL;
   175     n.length = 0;
   176     return n;
   177 }
   179 typedef size_t(*prefix_table_rfun)(void*,size_t);
   180 typedef void(*prefix_table_wfun)(void*,size_t,size_t);
   182 static size_t s_prefix_table_r(void* table, size_t index) {
   183     return (size_t) ((uint8_t*)table)[index];
   184 }
   185 static void s_prefix_table_w(void* table, size_t index, size_t value) {
   186     ((uint8_t*)table)[index] = (uint8_t) value;
   187 }
   188 static size_t h_prefix_table_r(void* table, size_t index) {
   189     return ((size_t*)table)[index];
   190 }
   191 static void h_prefix_table_w(void* table, size_t index, size_t value) {
   192     ((size_t*)table)[index] = value;
   193 }
   195 sstr_t sstrstr(sstr_t string, sstr_t match) {
   196     if (match.length == 0) {
   197         return string;
   198     }
   200     /* prepare default return value in case of no match */
   201     sstr_t result = sstrn(NULL, 0);
   203     /*
   204      * IMPORTANT:
   205      * our prefix table contains the prefix length PLUS ONE
   206      * this is our decision, because we want to use the full range of size_t
   207      * the original algorithm needs a (-1) at one single place
   208      * and we want to avoid that
   209      */
   211     /* static prefix table */
   212     static uint8_t s_prefix_table[256];
   214     /* check pattern length and use appropriate prefix table */
   215     register void* ptable;
   216     prefix_table_rfun ptable_r;
   217     prefix_table_wfun ptable_w;
   218     if (match.length > 255) {
   219         /* pattern exceeds static prefix table, allocate on the heap */
   220         ptable = calloc(match.length+1, sizeof(size_t));
   221         ptable_r = h_prefix_table_r;
   222         ptable_w = h_prefix_table_w;
   223     } else {
   224         ptable = s_prefix_table;
   225         ptable_r = s_prefix_table_r;
   226         ptable_w = s_prefix_table_w;
   227     }
   229     /* keep counter in registers */
   230     register size_t i, j;
   232     /* fill prefix table */
   233     i = 0; j = 0;
   234     ptable_w(ptable, i, j);
   235     while (i < match.length) {
   236         while (j >= 1 && match.ptr[j-1] != match.ptr[i]) {
   237             j = ptable_r(ptable, j-i);
   238         }
   239         i++; j++;
   240         ptable_w(ptable, i, j);
   241     }
   243     /* search */
   244     i = 0; j = 1;
   245     while (i < string.length) {
   246         while (j >= 1 && string.ptr[i] != match.ptr[j-1]) {
   247             j = ptable_r(ptable, j-1);
   248         }
   249         i++; j++;
   250         if (j-1 == match.length) {
   251             size_t start = i - match.length;
   252             result.ptr = string.ptr + start;
   253             result.length = string.length - start;
   254             break;
   255         }
   256     }
   258     /* if prefix table was allocated on the heap, free it */
   259     if (ptable != s_prefix_table) {
   260         free(ptable);
   261     }
   263     return result;
   264 }
   266 sstr_t* sstrsplit(sstr_t s, sstr_t d, ssize_t *n) {
   267     return sstrsplit_a(ucx_default_allocator(), s, d, n);
   268 }
   270 sstr_t* sstrsplit_a(UcxAllocator *allocator, sstr_t s, sstr_t d, ssize_t *n) {
   271     if (s.length == 0 || d.length == 0) {
   272         *n = -1;
   273         return NULL;
   274     }
   276     /* special cases: delimiter is at least as large as the string */
   277     if (d.length >= s.length) {
   278         /* exact match */
   279         if (sstrcmp(s, d) == 0) {
   280             *n = 0;
   281             return NULL;
   282         } else /* no match possible */ {
   283             *n = 1;
   284             sstr_t *result = (sstr_t*) almalloc(allocator, sizeof(sstr_t));
   285             *result = sstrdup_a(allocator, s);
   286             return result;
   287         }
   288     }
   290     ssize_t nmax = *n;
   291     size_t arrlen = 16;
   292     sstr_t* result = (sstr_t*) almalloc(allocator, arrlen*sizeof(sstr_t));
   294     if (result) {
   295         sstr_t curpos = s;
   296         ssize_t j = 1;
   297         while (1) {
   298             sstr_t match;
   299             /* optimize for one byte delimiters */
   300             if (d.length == 1) {
   301                 match = curpos;
   302                 for (size_t i = 0 ; i < curpos.length ; i++) {
   303                     if (curpos.ptr[i] == *(d.ptr)) {
   304                         match.ptr = curpos.ptr + i;
   305                         break;
   306                     }
   307                     match.length--;
   308                 }
   309             } else {
   310                 match = sstrstr(curpos, d);
   311             }
   312             if (match.length > 0) {
   313                 /* is this our last try? */
   314                 if (nmax == 0 || j < nmax) {
   315                     /* copy the current string to the array */
   316                     sstr_t item = sstrn(curpos.ptr, match.ptr - curpos.ptr);
   317                     result[j-1] = sstrdup_a(allocator, item);
   318                     size_t processed = item.length + d.length;
   319                     curpos.ptr += processed;
   320                     curpos.length -= processed;
   322                     /* allocate memory for the next string */
   323                     j++;
   324                     if (j > arrlen) {
   325                         arrlen *= 2;
   326                         sstr_t* reallocated = (sstr_t*) alrealloc(
   327                                 allocator, result, arrlen*sizeof(sstr_t));
   328                         if (reallocated) {
   329                             result = reallocated;
   330                         } else {
   331                             for (ssize_t i = 0 ; i < j-1 ; i++) {
   332                                 alfree(allocator, result[i].ptr);
   333                             }
   334                             alfree(allocator, result);
   335                             *n = -2;
   336                             return NULL;
   337                         }
   338                     }
   339                 } else {
   340                     /* nmax reached, copy the _full_ remaining string */
   341                     result[j-1] = sstrdup_a(allocator, curpos);
   342                     break;
   343                 }
   344             } else {
   345                 /* no more matches, copy last string */
   346                 result[j-1] = sstrdup_a(allocator, curpos);
   347                 break;
   348             }
   349         }
   350         *n = j;
   351     } else {
   352         *n = -2;
   353     }
   355     return result;
   356 }
   358 int sstrcmp(sstr_t s1, sstr_t s2) {
   359     if (s1.length == s2.length) {
   360         return memcmp(s1.ptr, s2.ptr, s1.length);
   361     } else if (s1.length > s2.length) {
   362         return 1;
   363     } else {
   364         return -1;
   365     }
   366 }
   368 int sstrcasecmp(sstr_t s1, sstr_t s2) {
   369     if (s1.length == s2.length) {
   370 #ifdef _WIN32
   371         return _strnicmp(s1.ptr, s2.ptr, s1.length);
   372 #else
   373         return strncasecmp(s1.ptr, s2.ptr, s1.length);
   374 #endif
   375     } else if (s1.length > s2.length) {
   376         return 1;
   377     } else {
   378         return -1;
   379     }
   380 }
   382 sstr_t sstrdup(sstr_t s) {
   383     return sstrdup_a(ucx_default_allocator(), s);
   384 }
   386 sstr_t sstrdup_a(UcxAllocator *allocator, sstr_t s) {
   387     sstr_t newstring;
   388     newstring.ptr = (char*)almalloc(allocator, s.length + 1);
   389     if (newstring.ptr) {
   390         newstring.length = s.length;
   391         newstring.ptr[newstring.length] = 0;
   393         memcpy(newstring.ptr, s.ptr, s.length);
   394     } else {
   395         newstring.length = 0;
   396     }
   398     return newstring;
   399 }
   401 sstr_t sstrtrim(sstr_t string) {
   402     sstr_t newstr = string;
   404     while (newstr.length > 0 && isspace(*newstr.ptr)) {
   405         newstr.ptr++;
   406         newstr.length--;
   407     }
   408     while (newstr.length > 0 && isspace(newstr.ptr[newstr.length-1])) {
   409         newstr.length--;
   410     }
   412     return newstr;
   413 }
   415 int sstrprefix(sstr_t string, sstr_t prefix) {
   416     if (string.length == 0) {
   417         return prefix.length == 0;
   418     }
   419     if (prefix.length == 0) {
   420         return 1;
   421     }
   423     if (prefix.length > string.length) {
   424         return 0;
   425     } else {
   426         return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
   427     }
   428 }
   430 int sstrsuffix(sstr_t string, sstr_t suffix) {
   431     if (string.length == 0) {
   432         return suffix.length == 0;
   433     }
   434     if (suffix.length == 0) {
   435         return 1;
   436     }
   438     if (suffix.length > string.length) {
   439         return 0;
   440     } else {
   441         return memcmp(string.ptr+string.length-suffix.length,
   442             suffix.ptr, suffix.length) == 0;
   443     }
   444 }
   446 sstr_t sstrlower(sstr_t string) {
   447     sstr_t ret = sstrdup(string);
   448     for (size_t i = 0; i < ret.length ; i++) {
   449         ret.ptr[i] = tolower(ret.ptr[i]);
   450     }
   451     return ret;
   452 }
   454 sstr_t sstrlower_a(UcxAllocator *allocator, sstr_t string) {
   455     sstr_t ret = sstrdup_a(allocator, string);
   456     for (size_t i = 0; i < ret.length ; i++) {
   457         ret.ptr[i] = tolower(ret.ptr[i]);
   458     }
   459     return ret;
   460 }
   462 sstr_t sstrupper(sstr_t string) {
   463     sstr_t ret = sstrdup(string);
   464     for (size_t i = 0; i < ret.length ; i++) {
   465         ret.ptr[i] = toupper(ret.ptr[i]);
   466     }
   467     return ret;
   468 }
   470 sstr_t sstrupper_a(UcxAllocator *allocator, sstr_t string) {
   471     sstr_t ret = sstrdup_a(allocator, string);
   472     for (size_t i = 0; i < ret.length ; i++) {
   473         ret.ptr[i] = toupper(ret.ptr[i]);
   474     }
   475     return ret;
   476 }

mercurial