ucx/string.c

Thu, 23 Feb 2017 14:30:12 +0100

author
Mike Becker <universe@uap-core.de>
date
Thu, 23 Feb 2017 14:30:12 +0100
changeset 236
ffc6d0910342
parent 235
7cf1e41833a2
child 237
5ba9de6361ff
permissions
-rw-r--r--

improves sstrstr function by using KMP string search algorithm

olaf@20 1 /*
universe@103 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
olaf@20 3 *
universe@225 4 * Copyright 2016 Olaf Wintermann. All rights reserved.
universe@103 5 *
universe@103 6 * Redistribution and use in source and binary forms, with or without
universe@103 7 * modification, are permitted provided that the following conditions are met:
universe@103 8 *
universe@103 9 * 1. Redistributions of source code must retain the above copyright
universe@103 10 * notice, this list of conditions and the following disclaimer.
universe@103 11 *
universe@103 12 * 2. Redistributions in binary form must reproduce the above copyright
universe@103 13 * notice, this list of conditions and the following disclaimer in the
universe@103 14 * documentation and/or other materials provided with the distribution.
universe@103 15 *
universe@103 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
universe@103 17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
universe@103 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
universe@103 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
universe@103 20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
universe@103 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
universe@103 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
universe@103 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
universe@103 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
universe@103 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
universe@103 26 * POSSIBILITY OF SUCH DAMAGE.
olaf@20 27 */
olaf@20 28
olaf@20 29 #include <stdlib.h>
universe@69 30 #include <string.h>
olaf@20 31 #include <stdarg.h>
universe@236 32 #include <stdint.h>
universe@189 33 #include <ctype.h>
olaf@20 34
olaf@20 35 #include "string.h"
olaf@109 36 #include "allocator.h"
olaf@20 37
universe@116 38 sstr_t sstr(char *cstring) {
olaf@20 39 sstr_t string;
universe@116 40 string.ptr = cstring;
universe@116 41 string.length = strlen(cstring);
olaf@20 42 return string;
olaf@20 43 }
olaf@20 44
universe@116 45 sstr_t sstrn(char *cstring, size_t length) {
olaf@20 46 sstr_t string;
universe@116 47 string.ptr = cstring;
universe@116 48 string.length = length;
olaf@20 49 return string;
olaf@20 50 }
olaf@20 51
olaf@68 52 size_t sstrnlen(size_t n, sstr_t s, ...) {
olaf@20 53 va_list ap;
olaf@20 54 size_t size = s.length;
olaf@20 55 va_start(ap, s);
olaf@20 56
universe@116 57 for (size_t i = 1 ; i < n ; i++) {
olaf@20 58 sstr_t str = va_arg(ap, sstr_t);
olaf@20 59 size += str.length;
olaf@20 60 }
universe@24 61 va_end(ap);
olaf@20 62
olaf@20 63 return size;
olaf@20 64 }
olaf@20 65
olaf@180 66 static sstr_t sstrvcat_a(
olaf@180 67 UcxAllocator *a,
olaf@180 68 size_t count,
olaf@180 69 sstr_t s1,
olaf@180 70 sstr_t s2,
olaf@180 71 va_list ap) {
olaf@180 72 sstr_t str;
olaf@180 73 str.ptr = NULL;
olaf@180 74 str.length = 0;
olaf@180 75 if(count < 2) {
olaf@180 76 return str;
olaf@180 77 }
olaf@180 78
universe@185 79 sstr_t *strings = (sstr_t*) calloc(count, sizeof(sstr_t));
olaf@180 80 if(!strings) {
olaf@180 81 return str;
olaf@180 82 }
olaf@180 83
olaf@180 84 // get all args and overall length
olaf@180 85 strings[0] = s1;
olaf@180 86 strings[1] = s2;
olaf@180 87 size_t strlen = s1.length + s2.length;
olaf@180 88 for (size_t i=2;i<count;i++) {
olaf@180 89 sstr_t s = va_arg (ap, sstr_t);
olaf@180 90 strings[i] = s;
olaf@180 91 strlen += s.length;
olaf@180 92 }
olaf@180 93
olaf@180 94 // create new string
universe@185 95 str.ptr = (char*) almalloc(a, strlen + 1);
olaf@180 96 str.length = strlen;
olaf@180 97 if(!str.ptr) {
olaf@180 98 free(strings);
olaf@180 99 str.length = 0;
olaf@180 100 return str;
olaf@180 101 }
olaf@180 102
olaf@180 103 // concatenate strings
olaf@180 104 size_t pos = 0;
olaf@180 105 for (size_t i=0;i<count;i++) {
olaf@180 106 sstr_t s = strings[i];
olaf@180 107 memcpy(str.ptr + pos, s.ptr, s.length);
olaf@180 108 pos += s.length;
olaf@180 109 }
olaf@180 110
olaf@180 111 str.ptr[str.length] = '\0';
olaf@180 112
olaf@180 113 free(strings);
olaf@180 114
olaf@180 115 return str;
olaf@180 116 }
olaf@180 117
olaf@180 118 sstr_t sstrcat(size_t count, sstr_t s1, sstr_t s2, ...) {
olaf@180 119 va_list ap;
olaf@180 120 va_start(ap, s2);
olaf@180 121 sstr_t s = sstrvcat_a(ucx_default_allocator(), count, s1, s2, ap);
olaf@180 122 va_end(ap);
olaf@180 123 return s;
olaf@180 124 }
olaf@180 125
olaf@180 126 sstr_t sstrcat_a(UcxAllocator *a, size_t count, sstr_t s1, sstr_t s2, ...) {
olaf@180 127 va_list ap;
olaf@180 128 va_start(ap, s2);
olaf@180 129 sstr_t s = sstrvcat_a(a, count, s1, s2, ap);
olaf@180 130 va_end(ap);
olaf@180 131 return s;
olaf@180 132 }
olaf@180 133
olaf@68 134 sstr_t sstrsubs(sstr_t s, size_t start) {
olaf@20 135 return sstrsubsl (s, start, s.length-start);
olaf@20 136 }
olaf@20 137
olaf@68 138 sstr_t sstrsubsl(sstr_t s, size_t start, size_t length) {
olaf@20 139 sstr_t new_sstr;
olaf@104 140 if (start >= s.length) {
universe@173 141 new_sstr.ptr = NULL;
universe@173 142 new_sstr.length = 0;
universe@173 143 } else {
universe@173 144 if (length > s.length-start) {
universe@173 145 length = s.length-start;
universe@173 146 }
universe@173 147 new_sstr.ptr = &s.ptr[start];
universe@173 148 new_sstr.length = length;
olaf@20 149 }
olaf@20 150 return new_sstr;
olaf@20 151 }
olaf@20 152
olaf@108 153 sstr_t sstrchr(sstr_t s, int c) {
olaf@108 154 for(size_t i=0;i<s.length;i++) {
olaf@108 155 if(s.ptr[i] == c) {
olaf@108 156 return sstrsubs(s, i);
olaf@108 157 }
olaf@108 158 }
olaf@108 159 sstr_t n;
olaf@108 160 n.ptr = NULL;
olaf@108 161 n.length = 0;
olaf@108 162 return n;
olaf@108 163 }
olaf@108 164
universe@148 165 sstr_t sstrrchr(sstr_t s, int c) {
universe@148 166 if (s.length > 0) {
universe@152 167 for(size_t i=s.length;i>0;i--) {
universe@152 168 if(s.ptr[i-1] == c) {
universe@152 169 return sstrsubs(s, i-1);
universe@148 170 }
universe@148 171 }
universe@148 172 }
universe@148 173 sstr_t n;
universe@148 174 n.ptr = NULL;
universe@148 175 n.length = 0;
universe@148 176 return n;
universe@148 177 }
universe@148 178
universe@236 179 typedef size_t(*prefix_table_rfun)(void*,size_t);
universe@236 180 typedef void(*prefix_table_wfun)(void*,size_t,size_t);
universe@236 181
universe@236 182 static size_t s_prefix_table_r(void* table, size_t index) {
universe@236 183 return (size_t) ((uint8_t*)table)[index];
universe@236 184 }
universe@236 185 static void s_prefix_table_w(void* table, size_t index, size_t value) {
universe@236 186 ((uint8_t*)table)[index] = (uint8_t) value;
universe@236 187 }
universe@236 188 static size_t h_prefix_table_r(void* table, size_t index) {
universe@236 189 return ((size_t*)table)[index];
universe@236 190 }
universe@236 191 static void h_prefix_table_w(void* table, size_t index, size_t value) {
universe@236 192 ((size_t*)table)[index] = value;
universe@236 193 }
universe@236 194
universe@214 195 sstr_t sstrstr(sstr_t string, sstr_t match) {
universe@214 196 if (match.length == 0) {
universe@214 197 return string;
universe@214 198 }
universe@214 199
universe@236 200 /* prepare default return value in case of no match */
universe@236 201 sstr_t result = sstrn(NULL, 0);
universe@236 202
universe@236 203 /*
universe@236 204 * IMPORTANT:
universe@236 205 * our prefix table contains the prefix length PLUS ONE
universe@236 206 * this is our decision, because we want to use the full range of size_t
universe@236 207 * the original algorithm needs a (-1) at one single place
universe@236 208 * and we want to avoid that
universe@236 209 */
universe@236 210
universe@236 211 /* static prefix table */
universe@236 212 static uint8_t s_prefix_table[256];
universe@236 213
universe@236 214 /* check pattern length and use appropriate prefix table */
universe@236 215 register void* ptable;
universe@236 216 prefix_table_rfun ptable_r;
universe@236 217 prefix_table_wfun ptable_w;
universe@236 218 if (match.length > 255) {
universe@236 219 /* pattern exceeds static prefix table, allocate on the heap */
universe@236 220 ptable = calloc(match.length+1, sizeof(size_t));
universe@236 221 ptable_r = h_prefix_table_r;
universe@236 222 ptable_w = h_prefix_table_w;
universe@236 223 } else {
universe@236 224 ptable = s_prefix_table;
universe@236 225 ptable_r = s_prefix_table_r;
universe@236 226 ptable_w = s_prefix_table_w;
universe@236 227 }
universe@236 228
universe@236 229 /* keep counter in registers */
universe@236 230 register size_t i, j;
universe@236 231
universe@236 232 /* fill prefix table */
universe@236 233 i = 0; j = 0;
universe@236 234 ptable_w(ptable, i, j);
universe@236 235 while (i < match.length) {
universe@236 236 while (j >= 1 && match.ptr[j-1] != match.ptr[i]) {
universe@236 237 j = ptable_r(ptable, j-i);
universe@236 238 }
universe@236 239 i++; j++;
universe@236 240 ptable_w(ptable, i, j);
universe@236 241 }
universe@236 242
universe@236 243 /* search */
universe@236 244 i = 0; j = 1;
universe@236 245 while (i < string.length) {
universe@236 246 while (j >= 1 && string.ptr[i] != match.ptr[j-1]) {
universe@236 247 j = ptable_r(ptable, j-1);
universe@236 248 }
universe@236 249 i++; j++;
universe@236 250 if (j-1 == match.length) {
universe@236 251 size_t start = i - match.length;
universe@236 252 result.ptr = string.ptr + start;
universe@236 253 result.length = string.length - start;
universe@236 254 break;
universe@214 255 }
universe@214 256 }
universe@236 257
universe@236 258 /* if prefix table was allocated on the heap, free it */
universe@236 259 if (ptable != s_prefix_table) {
universe@236 260 free(ptable);
universe@236 261 }
universe@214 262
universe@236 263 return result;
universe@214 264 }
universe@214 265
universe@173 266 sstr_t* sstrsplit(sstr_t s, sstr_t d, ssize_t *n) {
universe@125 267 return sstrsplit_a(ucx_default_allocator(), s, d, n);
universe@119 268 }
universe@119 269
universe@173 270 sstr_t* sstrsplit_a(UcxAllocator *allocator, sstr_t s, sstr_t d, ssize_t *n) {
universe@119 271 if (s.length == 0 || d.length == 0) {
universe@119 272 *n = -1;
universe@39 273 return NULL;
universe@39 274 }
universe@231 275
universe@231 276 /* special cases: delimiter is at least as large as the string */
universe@231 277 if (d.length >= s.length) {
universe@231 278 /* exact match */
universe@231 279 if (sstrcmp(s, d) == 0) {
universe@231 280 *n = 0;
universe@231 281 return NULL;
universe@231 282 } else /* no match possible */ {
universe@231 283 *n = 1;
universe@231 284 sstr_t *result = (sstr_t*) almalloc(allocator, sizeof(sstr_t));
universe@233 285 *result = sstrdup_a(allocator, s);
universe@231 286 return result;
universe@231 287 }
universe@231 288 }
universe@231 289
universe@173 290 ssize_t nmax = *n;
universe@235 291 size_t arrlen = 16;
universe@235 292 sstr_t* result = (sstr_t*) almalloc(allocator, arrlen*sizeof(sstr_t));
universe@39 293
universe@119 294 if (result) {
universe@233 295 sstr_t curpos = s;
universe@233 296 ssize_t j = 1;
universe@233 297 while (1) {
universe@234 298 sstr_t match;
universe@234 299 /* optimize for one byte delimiters */
universe@234 300 if (d.length == 1) {
universe@234 301 match = curpos;
universe@234 302 for (size_t i = 0 ; i < curpos.length ; i++) {
universe@234 303 if (curpos.ptr[i] == *(d.ptr)) {
universe@234 304 match.ptr = curpos.ptr + i;
universe@234 305 break;
universe@234 306 }
universe@234 307 match.length--;
universe@234 308 }
universe@234 309 } else {
universe@234 310 match = sstrstr(curpos, d);
universe@234 311 }
universe@233 312 if (match.length > 0) {
universe@233 313 /* is this our last try? */
universe@233 314 if (nmax == 0 || j < nmax) {
universe@233 315 /* copy the current string to the array */
universe@233 316 sstr_t item = sstrn(curpos.ptr, match.ptr - curpos.ptr);
universe@233 317 result[j-1] = sstrdup_a(allocator, item);
universe@233 318 size_t processed = item.length + d.length;
universe@233 319 curpos.ptr += processed;
universe@233 320 curpos.length -= processed;
universe@39 321
universe@233 322 /* allocate memory for the next string */
universe@233 323 j++;
universe@235 324 if (j > arrlen) {
universe@235 325 arrlen *= 2;
universe@235 326 sstr_t* reallocated = (sstr_t*) alrealloc(
universe@235 327 allocator, result, arrlen*sizeof(sstr_t));
universe@235 328 if (reallocated) {
universe@235 329 result = reallocated;
universe@235 330 } else {
universe@235 331 for (ssize_t i = 0 ; i < j-1 ; i++) {
universe@235 332 alfree(allocator, result[i].ptr);
universe@235 333 }
universe@235 334 alfree(allocator, result);
universe@235 335 *n = -2;
universe@235 336 return NULL;
universe@233 337 }
universe@233 338 }
universe@233 339 } else {
universe@233 340 /* nmax reached, copy the _full_ remaining string */
universe@233 341 result[j-1] = sstrdup_a(allocator, curpos);
universe@233 342 break;
universe@233 343 }
universe@173 344 } else {
universe@233 345 /* no more matches, copy last string */
universe@233 346 result[j-1] = sstrdup_a(allocator, curpos);
universe@173 347 break;
universe@173 348 }
universe@119 349 }
universe@233 350 *n = j;
universe@119 351 } else {
universe@119 352 *n = -2;
universe@39 353 }
universe@39 354
universe@39 355 return result;
universe@39 356 }
universe@39 357
olaf@68 358 int sstrcmp(sstr_t s1, sstr_t s2) {
universe@116 359 if (s1.length == s2.length) {
universe@116 360 return memcmp(s1.ptr, s2.ptr, s1.length);
universe@116 361 } else if (s1.length > s2.length) {
universe@116 362 return 1;
universe@116 363 } else {
universe@116 364 return -1;
universe@116 365 }
olaf@20 366 }
olaf@20 367
universe@149 368 int sstrcasecmp(sstr_t s1, sstr_t s2) {
universe@149 369 if (s1.length == s2.length) {
universe@149 370 #ifdef _WIN32
universe@149 371 return _strnicmp(s1.ptr, s2.ptr, s1.length);
universe@149 372 #else
universe@149 373 return strncasecmp(s1.ptr, s2.ptr, s1.length);
universe@149 374 #endif
universe@149 375 } else if (s1.length > s2.length) {
universe@149 376 return 1;
universe@149 377 } else {
universe@149 378 return -1;
universe@149 379 }
universe@149 380 }
universe@149 381
olaf@68 382 sstr_t sstrdup(sstr_t s) {
universe@125 383 return sstrdup_a(ucx_default_allocator(), s);
olaf@109 384 }
olaf@20 385
universe@125 386 sstr_t sstrdup_a(UcxAllocator *allocator, sstr_t s) {
olaf@109 387 sstr_t newstring;
universe@173 388 newstring.ptr = (char*)almalloc(allocator, s.length + 1);
olaf@109 389 if (newstring.ptr) {
olaf@109 390 newstring.length = s.length;
olaf@109 391 newstring.ptr[newstring.length] = 0;
olaf@109 392
olaf@109 393 memcpy(newstring.ptr, s.ptr, s.length);
olaf@109 394 } else {
olaf@109 395 newstring.length = 0;
olaf@109 396 }
olaf@109 397
olaf@20 398 return newstring;
olaf@20 399 }
olaf@96 400
olaf@96 401 sstr_t sstrtrim(sstr_t string) {
olaf@96 402 sstr_t newstr = string;
universe@189 403
universe@189 404 while (newstr.length > 0 && isspace(*newstr.ptr)) {
universe@189 405 newstr.ptr++;
universe@189 406 newstr.length--;
universe@98 407 }
universe@189 408 while (newstr.length > 0 && isspace(newstr.ptr[newstr.length-1])) {
universe@189 409 newstr.length--;
olaf@96 410 }
olaf@96 411
olaf@96 412 return newstr;
olaf@96 413 }
universe@146 414
universe@146 415 int sstrprefix(sstr_t string, sstr_t prefix) {
universe@146 416 if (string.length == 0) {
universe@146 417 return prefix.length == 0;
universe@146 418 }
universe@146 419 if (prefix.length == 0) {
universe@146 420 return 1;
universe@146 421 }
universe@146 422
universe@146 423 if (prefix.length > string.length) {
universe@146 424 return 0;
universe@146 425 } else {
universe@146 426 return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
universe@146 427 }
universe@146 428 }
universe@146 429
universe@146 430 int sstrsuffix(sstr_t string, sstr_t suffix) {
universe@146 431 if (string.length == 0) {
universe@146 432 return suffix.length == 0;
universe@146 433 }
universe@146 434 if (suffix.length == 0) {
universe@146 435 return 1;
universe@146 436 }
universe@146 437
universe@146 438 if (suffix.length > string.length) {
universe@146 439 return 0;
universe@146 440 } else {
universe@146 441 return memcmp(string.ptr+string.length-suffix.length,
universe@146 442 suffix.ptr, suffix.length) == 0;
universe@146 443 }
universe@146 444 }
universe@210 445
universe@210 446 sstr_t sstrlower(sstr_t string) {
universe@210 447 sstr_t ret = sstrdup(string);
universe@210 448 for (size_t i = 0; i < ret.length ; i++) {
universe@210 449 ret.ptr[i] = tolower(ret.ptr[i]);
universe@210 450 }
universe@210 451 return ret;
universe@210 452 }
universe@210 453
universe@210 454 sstr_t sstrlower_a(UcxAllocator *allocator, sstr_t string) {
universe@210 455 sstr_t ret = sstrdup_a(allocator, string);
universe@210 456 for (size_t i = 0; i < ret.length ; i++) {
universe@210 457 ret.ptr[i] = tolower(ret.ptr[i]);
universe@210 458 }
universe@210 459 return ret;
universe@210 460 }
universe@210 461
universe@210 462 sstr_t sstrupper(sstr_t string) {
universe@210 463 sstr_t ret = sstrdup(string);
universe@210 464 for (size_t i = 0; i < ret.length ; i++) {
universe@210 465 ret.ptr[i] = toupper(ret.ptr[i]);
universe@210 466 }
universe@210 467 return ret;
universe@210 468 }
universe@210 469
universe@210 470 sstr_t sstrupper_a(UcxAllocator *allocator, sstr_t string) {
universe@210 471 sstr_t ret = sstrdup_a(allocator, string);
universe@210 472 for (size_t i = 0; i < ret.length ; i++) {
universe@210 473 ret.ptr[i] = toupper(ret.ptr[i]);
universe@210 474 }
universe@210 475 return ret;
universe@210 476 }

mercurial