improves sstrstr function by using KMP string search algorithm

Thu, 23 Feb 2017 14:30:12 +0100

author
Mike Becker <universe@uap-core.de>
date
Thu, 23 Feb 2017 14:30:12 +0100
changeset 236
ffc6d0910342
parent 235
7cf1e41833a2
child 237
5ba9de6361ff

improves sstrstr function by using KMP string search algorithm

test/string_tests.c file | annotate | diff | comparison | revisions
ucx/string.c file | annotate | diff | comparison | revisions
     1.1 --- a/test/string_tests.c	Mon Feb 20 17:28:58 2017 +0100
     1.2 +++ b/test/string_tests.c	Thu Feb 23 14:30:12 2017 +0100
     1.3 @@ -81,6 +81,34 @@
     1.4  
     1.5  UCX_TEST(test_sstrstr) {
     1.6      sstr_t str = ST("find the match in this string");
     1.7 +    sstr_t longstr = ST(
     1.8 +            "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkl"
     1.9 +            "mnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwx"
    1.10 +            "yzabcdeababababnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghij"
    1.11 +            "klmnopqrstuvwxyzaababababababababrstuvwxyzabcdefghijklmnopqrstuv"
    1.12 +            "abababababababababababababababababababababababababababababababab"
    1.13 +            "abababababababababababababababababababababababababababababababab"
    1.14 +            "abababababababababababababababababababababababababababababababab"
    1.15 +            "abababababababababababababababababababababababababababababababab"
    1.16 +            "abababababababababababababababababababababababababababababababab"
    1.17 +            "abababababababababababababababababababababababababababababababab"
    1.18 +            "wxyz1234567890");
    1.19 +    sstr_t longstrpattern = ST(
    1.20 +            "abababababababababababababababababababababababababababababababab"
    1.21 +            "abababababababababababababababababababababababababababababababab"
    1.22 +            "abababababababababababababababababababababababababababababababab"
    1.23 +            "abababababababababababababababababababababababababababababababab"
    1.24 +            "abababababababababababababababababababababababababababababababab"
    1.25 +            );
    1.26 +    sstr_t longstrresult = ST(
    1.27 +            "abababababababababababababababababababababababababababababababab"
    1.28 +            "abababababababababababababababababababababababababababababababab"
    1.29 +            "abababababababababababababababababababababababababababababababab"
    1.30 +            "abababababababababababababababababababababababababababababababab"
    1.31 +            "abababababababababababababababababababababababababababababababab"
    1.32 +            "abababababababababababababababababababababababababababababababab"
    1.33 +            "wxyz1234567890"
    1.34 +            );
    1.35      UCX_TEST_BEGIN
    1.36      
    1.37      sstr_t notfound = sstrstr(str, S("no match"));
    1.38 @@ -97,6 +125,12 @@
    1.39      UCX_TEST_ASSERT(!strcmp(str.ptr, result.ptr),
    1.40          "sstrstr with empty match string did not return the original string");
    1.41      
    1.42 +    result = sstrstr(longstr, longstrpattern);
    1.43 +    UCX_TEST_ASSERT(result.length == longstrresult.length,
    1.44 +            "long string result length incorrect");
    1.45 +    UCX_TEST_ASSERT(!strcmp(result.ptr, longstrresult.ptr),
    1.46 +            "long string result content incorrect");
    1.47 +    
    1.48      UCX_TEST_END
    1.49  }
    1.50  
     2.1 --- a/ucx/string.c	Mon Feb 20 17:28:58 2017 +0100
     2.2 +++ b/ucx/string.c	Thu Feb 23 14:30:12 2017 +0100
     2.3 @@ -29,6 +29,7 @@
     2.4  #include <stdlib.h>
     2.5  #include <string.h>
     2.6  #include <stdarg.h>
     2.7 +#include <stdint.h>
     2.8  #include <ctype.h>
     2.9  
    2.10  #include "string.h"
    2.11 @@ -175,22 +176,91 @@
    2.12      return n;
    2.13  }
    2.14  
    2.15 +typedef size_t(*prefix_table_rfun)(void*,size_t);
    2.16 +typedef void(*prefix_table_wfun)(void*,size_t,size_t);
    2.17 +
    2.18 +static size_t s_prefix_table_r(void* table, size_t index) {
    2.19 +    return (size_t) ((uint8_t*)table)[index];
    2.20 +}
    2.21 +static void s_prefix_table_w(void* table, size_t index, size_t value) {
    2.22 +    ((uint8_t*)table)[index] = (uint8_t) value;
    2.23 +}
    2.24 +static size_t h_prefix_table_r(void* table, size_t index) {
    2.25 +    return ((size_t*)table)[index];
    2.26 +}
    2.27 +static void h_prefix_table_w(void* table, size_t index, size_t value) {
    2.28 +    ((size_t*)table)[index] = value;
    2.29 +}
    2.30 +
    2.31  sstr_t sstrstr(sstr_t string, sstr_t match) {
    2.32      if (match.length == 0) {
    2.33          return string;
    2.34      }
    2.35      
    2.36 -    for (size_t i = 0 ; i < string.length ; i++) {
    2.37 -        sstr_t substr = sstrsubs(string, i);
    2.38 -        if (sstrprefix(substr, match)) {
    2.39 -            return substr;
    2.40 +    /* prepare default return value in case of no match */
    2.41 +    sstr_t result = sstrn(NULL, 0);
    2.42 +    
    2.43 +    /*
    2.44 +     * IMPORTANT:
    2.45 +     * our prefix table contains the prefix length PLUS ONE
    2.46 +     * this is our decision, because we want to use the full range of size_t
    2.47 +     * the original algorithm needs a (-1) at one single place
    2.48 +     * and we want to avoid that
    2.49 +     */
    2.50 +    
    2.51 +    /* static prefix table */
    2.52 +    static uint8_t s_prefix_table[256];
    2.53 +    
    2.54 +    /* check pattern length and use appropriate prefix table */
    2.55 +    register void* ptable;
    2.56 +    prefix_table_rfun ptable_r;
    2.57 +    prefix_table_wfun ptable_w;
    2.58 +    if (match.length > 255) {
    2.59 +        /* pattern exceeds static prefix table, allocate on the heap */
    2.60 +        ptable = calloc(match.length+1, sizeof(size_t));
    2.61 +        ptable_r = h_prefix_table_r;
    2.62 +        ptable_w = h_prefix_table_w;
    2.63 +    } else {
    2.64 +        ptable = s_prefix_table;
    2.65 +        ptable_r = s_prefix_table_r;
    2.66 +        ptable_w = s_prefix_table_w;
    2.67 +    }
    2.68 +    
    2.69 +    /* keep counter in registers */
    2.70 +    register size_t i, j;
    2.71 +    
    2.72 +    /* fill prefix table */
    2.73 +    i = 0; j = 0;
    2.74 +    ptable_w(ptable, i, j);
    2.75 +    while (i < match.length) {
    2.76 +        while (j >= 1 && match.ptr[j-1] != match.ptr[i]) {
    2.77 +            j = ptable_r(ptable, j-i);
    2.78 +        }
    2.79 +        i++; j++;
    2.80 +        ptable_w(ptable, i, j);
    2.81 +    }
    2.82 +
    2.83 +    /* search */
    2.84 +    i = 0; j = 1;
    2.85 +    while (i < string.length) {
    2.86 +        while (j >= 1 && string.ptr[i] != match.ptr[j-1]) {
    2.87 +            j = ptable_r(ptable, j-1);
    2.88 +        }
    2.89 +        i++; j++;
    2.90 +        if (j-1 == match.length) {
    2.91 +            size_t start = i - match.length;
    2.92 +            result.ptr = string.ptr + start;
    2.93 +            result.length = string.length - start;
    2.94 +            break;
    2.95          }
    2.96      }
    2.97 +
    2.98 +    /* if prefix table was allocated on the heap, free it */
    2.99 +    if (ptable != s_prefix_table) {
   2.100 +        free(ptable);
   2.101 +    }
   2.102      
   2.103 -    sstr_t emptystr;
   2.104 -    emptystr.length = 0;
   2.105 -    emptystr.ptr = NULL;
   2.106 -    return emptystr;
   2.107 +    return result;
   2.108  }
   2.109  
   2.110  sstr_t* sstrsplit(sstr_t s, sstr_t d, ssize_t *n) {

mercurial