Thu, 23 Feb 2017 14:30:12 +0100
improves sstrstr function by using KMP string search algorithm
test/string_tests.c | file | annotate | diff | comparison | revisions | |
ucx/string.c | file | annotate | diff | comparison | revisions |
1.1 --- a/test/string_tests.c Mon Feb 20 17:28:58 2017 +0100 1.2 +++ b/test/string_tests.c Thu Feb 23 14:30:12 2017 +0100 1.3 @@ -81,6 +81,34 @@ 1.4 1.5 UCX_TEST(test_sstrstr) { 1.6 sstr_t str = ST("find the match in this string"); 1.7 + sstr_t longstr = ST( 1.8 + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkl" 1.9 + "mnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwx" 1.10 + "yzabcdeababababnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghij" 1.11 + "klmnopqrstuvwxyzaababababababababrstuvwxyzabcdefghijklmnopqrstuv" 1.12 + "abababababababababababababababababababababababababababababababab" 1.13 + "abababababababababababababababababababababababababababababababab" 1.14 + "abababababababababababababababababababababababababababababababab" 1.15 + "abababababababababababababababababababababababababababababababab" 1.16 + "abababababababababababababababababababababababababababababababab" 1.17 + "abababababababababababababababababababababababababababababababab" 1.18 + "wxyz1234567890"); 1.19 + sstr_t longstrpattern = ST( 1.20 + "abababababababababababababababababababababababababababababababab" 1.21 + "abababababababababababababababababababababababababababababababab" 1.22 + "abababababababababababababababababababababababababababababababab" 1.23 + "abababababababababababababababababababababababababababababababab" 1.24 + "abababababababababababababababababababababababababababababababab" 1.25 + ); 1.26 + sstr_t longstrresult = ST( 1.27 + "abababababababababababababababababababababababababababababababab" 1.28 + "abababababababababababababababababababababababababababababababab" 1.29 + "abababababababababababababababababababababababababababababababab" 1.30 + "abababababababababababababababababababababababababababababababab" 1.31 + "abababababababababababababababababababababababababababababababab" 1.32 + "abababababababababababababababababababababababababababababababab" 1.33 + "wxyz1234567890" 1.34 + ); 1.35 UCX_TEST_BEGIN 1.36 1.37 sstr_t notfound = sstrstr(str, S("no match")); 1.38 @@ -97,6 +125,12 @@ 1.39 UCX_TEST_ASSERT(!strcmp(str.ptr, result.ptr), 1.40 "sstrstr with empty match string did not return the original string"); 1.41 1.42 + result = sstrstr(longstr, longstrpattern); 1.43 + UCX_TEST_ASSERT(result.length == longstrresult.length, 1.44 + "long string result length incorrect"); 1.45 + UCX_TEST_ASSERT(!strcmp(result.ptr, longstrresult.ptr), 1.46 + "long string result content incorrect"); 1.47 + 1.48 UCX_TEST_END 1.49 } 1.50
2.1 --- a/ucx/string.c Mon Feb 20 17:28:58 2017 +0100 2.2 +++ b/ucx/string.c Thu Feb 23 14:30:12 2017 +0100 2.3 @@ -29,6 +29,7 @@ 2.4 #include <stdlib.h> 2.5 #include <string.h> 2.6 #include <stdarg.h> 2.7 +#include <stdint.h> 2.8 #include <ctype.h> 2.9 2.10 #include "string.h" 2.11 @@ -175,22 +176,91 @@ 2.12 return n; 2.13 } 2.14 2.15 +typedef size_t(*prefix_table_rfun)(void*,size_t); 2.16 +typedef void(*prefix_table_wfun)(void*,size_t,size_t); 2.17 + 2.18 +static size_t s_prefix_table_r(void* table, size_t index) { 2.19 + return (size_t) ((uint8_t*)table)[index]; 2.20 +} 2.21 +static void s_prefix_table_w(void* table, size_t index, size_t value) { 2.22 + ((uint8_t*)table)[index] = (uint8_t) value; 2.23 +} 2.24 +static size_t h_prefix_table_r(void* table, size_t index) { 2.25 + return ((size_t*)table)[index]; 2.26 +} 2.27 +static void h_prefix_table_w(void* table, size_t index, size_t value) { 2.28 + ((size_t*)table)[index] = value; 2.29 +} 2.30 + 2.31 sstr_t sstrstr(sstr_t string, sstr_t match) { 2.32 if (match.length == 0) { 2.33 return string; 2.34 } 2.35 2.36 - for (size_t i = 0 ; i < string.length ; i++) { 2.37 - sstr_t substr = sstrsubs(string, i); 2.38 - if (sstrprefix(substr, match)) { 2.39 - return substr; 2.40 + /* prepare default return value in case of no match */ 2.41 + sstr_t result = sstrn(NULL, 0); 2.42 + 2.43 + /* 2.44 + * IMPORTANT: 2.45 + * our prefix table contains the prefix length PLUS ONE 2.46 + * this is our decision, because we want to use the full range of size_t 2.47 + * the original algorithm needs a (-1) at one single place 2.48 + * and we want to avoid that 2.49 + */ 2.50 + 2.51 + /* static prefix table */ 2.52 + static uint8_t s_prefix_table[256]; 2.53 + 2.54 + /* check pattern length and use appropriate prefix table */ 2.55 + register void* ptable; 2.56 + prefix_table_rfun ptable_r; 2.57 + prefix_table_wfun ptable_w; 2.58 + if (match.length > 255) { 2.59 + /* pattern exceeds static prefix table, allocate on the heap */ 2.60 + ptable = calloc(match.length+1, sizeof(size_t)); 2.61 + ptable_r = h_prefix_table_r; 2.62 + ptable_w = h_prefix_table_w; 2.63 + } else { 2.64 + ptable = s_prefix_table; 2.65 + ptable_r = s_prefix_table_r; 2.66 + ptable_w = s_prefix_table_w; 2.67 + } 2.68 + 2.69 + /* keep counter in registers */ 2.70 + register size_t i, j; 2.71 + 2.72 + /* fill prefix table */ 2.73 + i = 0; j = 0; 2.74 + ptable_w(ptable, i, j); 2.75 + while (i < match.length) { 2.76 + while (j >= 1 && match.ptr[j-1] != match.ptr[i]) { 2.77 + j = ptable_r(ptable, j-i); 2.78 + } 2.79 + i++; j++; 2.80 + ptable_w(ptable, i, j); 2.81 + } 2.82 + 2.83 + /* search */ 2.84 + i = 0; j = 1; 2.85 + while (i < string.length) { 2.86 + while (j >= 1 && string.ptr[i] != match.ptr[j-1]) { 2.87 + j = ptable_r(ptable, j-1); 2.88 + } 2.89 + i++; j++; 2.90 + if (j-1 == match.length) { 2.91 + size_t start = i - match.length; 2.92 + result.ptr = string.ptr + start; 2.93 + result.length = string.length - start; 2.94 + break; 2.95 } 2.96 } 2.97 + 2.98 + /* if prefix table was allocated on the heap, free it */ 2.99 + if (ptable != s_prefix_table) { 2.100 + free(ptable); 2.101 + } 2.102 2.103 - sstr_t emptystr; 2.104 - emptystr.length = 0; 2.105 - emptystr.ptr = NULL; 2.106 - return emptystr; 2.107 + return result; 2.108 } 2.109 2.110 sstr_t* sstrsplit(sstr_t s, sstr_t d, ssize_t *n) {