tip/src/string.c

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "cx/string.h"

#include <string.h>
#include <stdarg.h>
#include <ctype.h>

#ifndef _WIN32

#include <strings.h> // for strncasecmp()

#endif // _WIN32

cxmutstr cx_mutstr(char *cstring) {
    return (cxmutstr) {cstring, strlen(cstring)};
}

cxmutstr cx_mutstrn(
        char *cstring,
        size_t length
) {
    return (cxmutstr) {cstring, length};
}

cxstring cx_str(const char *cstring) {
    return (cxstring) {cstring, strlen(cstring)};
}

cxstring cx_strn(
        const char *cstring,
        size_t length
) {
    return (cxstring) {cstring, length};
}

cxstring cx_strcast(cxmutstr str) {
    return (cxstring) {str.ptr, str.length};
}

void cx_strfree(cxmutstr *str) {
    if (str == NULL) return;
    free(str->ptr);
    str->ptr = NULL;
    str->length = 0;
}

void cx_strfree_a(
        const CxAllocator *alloc,
        cxmutstr *str
) {
    if (str == NULL) return;
    cxFree(alloc, str->ptr);
    str->ptr = NULL;
    str->length = 0;
}

size_t cx_strlen(
        size_t count,
        ...
) {
    if (count == 0) return 0;

    va_list ap;
    va_start(ap, count);
    size_t size = 0;
    for (size_t i = 0; i < count; i++) {
        cxstring str = va_arg(ap, cxstring);
        size += str.length;
    }
    va_end(ap);

    return size;
}

cxmutstr cx_strcat_ma(
        const CxAllocator *alloc,
        cxmutstr str,
        size_t count,
        ...
) {
    if (count == 0) return str;

    cxstring *strings = calloc(count, sizeof(cxstring));
    if (!strings) abort();

    va_list ap;
    va_start(ap, count);

    // get all args and overall length
    size_t slen = str.length;
    for (size_t i = 0; i < count; i++) {
        cxstring s = va_arg (ap, cxstring);
        strings[i] = s;
        slen += s.length;
    }
    va_end(ap);

    // reallocate or create new string
    if (str.ptr == NULL) {
        str.ptr = cxMalloc(alloc, slen + 1);
    } else {
        str.ptr = cxRealloc(alloc, str.ptr, slen + 1);
    }
    if (str.ptr == NULL) abort();

    // concatenate strings
    size_t pos = str.length;
    str.length = slen;
    for (size_t i = 0; i < count; i++) {
        cxstring s = strings[i];
        memcpy(str.ptr + pos, s.ptr, s.length);
        pos += s.length;
    }

    // terminate string
    str.ptr[str.length] = '\0';

    // free temporary array
    free(strings);

    return str;
}

cxstring cx_strsubs(
        cxstring string,
        size_t start
) {
    return cx_strsubsl(string, start, string.length - start);
}

cxmutstr cx_strsubs_m(
        cxmutstr string,
        size_t start
) {
    return cx_strsubsl_m(string, start, string.length - start);
}

cxstring cx_strsubsl(
        cxstring string,
        size_t start,
        size_t length
) {
    if (start > string.length) {
        return (cxstring) {NULL, 0};
    }

    size_t rem_len = string.length - start;
    if (length > rem_len) {
        length = rem_len;
    }

    return (cxstring) {string.ptr + start, length};
}

cxmutstr cx_strsubsl_m(
        cxmutstr string,
        size_t start,
        size_t length
) {
    cxstring result = cx_strsubsl(cx_strcast(string), start, length);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

cxstring cx_strchr(
        cxstring string,
        int chr
) {
    chr = 0xFF & chr;
    // TODO: improve by comparing multiple bytes at once
    for (size_t i = 0; i < string.length; i++) {
        if (string.ptr[i] == chr) {
            return cx_strsubs(string, i);
        }
    }
    return (cxstring) {NULL, 0};
}

cxmutstr cx_strchr_m(
        cxmutstr string,
        int chr
) {
    cxstring result = cx_strchr(cx_strcast(string), chr);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

cxstring cx_strrchr(
        cxstring string,
        int chr
) {
    chr = 0xFF & chr;
    size_t i = string.length;
    while (i > 0) {
        i--;
        // TODO: improve by comparing multiple bytes at once
        if (string.ptr[i] == chr) {
            return cx_strsubs(string, i);
        }
    }
    return (cxstring) {NULL, 0};
}

cxmutstr cx_strrchr_m(
        cxmutstr string,
        int chr
) {
    cxstring result = cx_strrchr(cx_strcast(string), chr);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

#ifndef CX_STRSTR_SBO_SIZE
#define CX_STRSTR_SBO_SIZE 512
#endif
const unsigned cx_strstr_sbo_size = CX_STRSTR_SBO_SIZE;

cxstring cx_strstr(
        cxstring haystack,
        cxstring needle
) {
    if (needle.length == 0) {
        return haystack;
    }

    // optimize for single-char needles
    if (needle.length == 1) {
        return cx_strchr(haystack, *needle.ptr);
    }

    /*
     * IMPORTANT:
     * Our prefix table contains the prefix length PLUS ONE
     * this is our decision, because we want to use the full range of size_t.
     * The original algorithm needs a (-1) at one single place,
     * and we want to avoid that.
     */

    // local prefix table
    size_t s_prefix_table[CX_STRSTR_SBO_SIZE];

    // check needle length and use appropriate prefix table
    // if the pattern exceeds static prefix table, allocate on the heap
    bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;
    register size_t *ptable = useheap ? calloc(needle.length + 1,
                                               sizeof(size_t)) : s_prefix_table;

    // keep counter in registers
    register size_t i, j;

    // fill prefix table
    i = 0;
    j = 0;
    ptable[i] = j;
    while (i < needle.length) {
        while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
            j = ptable[j - 1];
        }
        i++;
        j++;
        ptable[i] = j;
    }

    // search
    cxstring result = {NULL, 0};
    i = 0;
    j = 1;
    while (i < haystack.length) {
        while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
            j = ptable[j - 1];
        }
        i++;
        j++;
        if (j - 1 == needle.length) {
            size_t start = i - needle.length;
            result.ptr = haystack.ptr + start;
            result.length = haystack.length - start;
            break;
        }
    }

    // if prefix table was allocated on the heap, free it
    if (ptable != s_prefix_table) {
        free(ptable);
    }

    return result;
}

cxmutstr cx_strstr_m(
        cxmutstr haystack,
        cxstring needle
) {
    cxstring result = cx_strstr(cx_strcast(haystack), needle);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

size_t cx_strsplit(
        cxstring string,
        cxstring delim,
        size_t limit,
        cxstring *output
) {
    // special case: output limit is zero
    if (limit == 0) return 0;

    // special case: delimiter is empty
    if (delim.length == 0) {
        output[0] = string;
        return 1;
    }

    // special cases: delimiter is at least as large as the string
    if (delim.length >= string.length) {
        // exact match
        if (cx_strcmp(string, delim) == 0) {
            output[0] = cx_strn(string.ptr, 0);
            output[1] = cx_strn(string.ptr + string.length, 0);
            return 2;
        } else {
            // no match possible
            output[0] = string;
            return 1;
        }
    }

    size_t n = 0;
    cxstring curpos = string;
    while (1) {
        ++n;
        cxstring match = cx_strstr(curpos, delim);
        if (match.length > 0) {
            // is the limit reached?
            if (n < limit) {
                // copy the current string to the array
                cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
                output[n - 1] = item;
                size_t processed = item.length + delim.length;
                curpos.ptr += processed;
                curpos.length -= processed;
            } else {
                // limit reached, copy the _full_ remaining string
                output[n - 1] = curpos;
                break;
            }
        } else {
            // no more matches, copy last string
            output[n - 1] = curpos;
            break;
        }
    }

    return n;
}

size_t cx_strsplit_a(
        const CxAllocator *allocator,
        cxstring string,
        cxstring delim,
        size_t limit,
        cxstring **output
) {
    // find out how many splits we're going to make and allocate memory
    size_t n = 0;
    cxstring curpos = string;
    while (1) {
        ++n;
        cxstring match = cx_strstr(curpos, delim);
        if (match.length > 0) {
            // is the limit reached?
            if (n < limit) {
                size_t processed = match.ptr - curpos.ptr + delim.length;
                curpos.ptr += processed;
                curpos.length -= processed;
            } else {
                // limit reached
                break;
            }
        } else {
            // no more matches
            break;
        }
    }
    *output = cxCalloc(allocator, n, sizeof(cxstring));
    return cx_strsplit(string, delim, n, *output);
}

size_t cx_strsplit_m(
        cxmutstr string,
        cxstring delim,
        size_t limit,
        cxmutstr *output
) {
    return cx_strsplit(cx_strcast(string),
                       delim, limit, (cxstring *) output);
}

size_t cx_strsplit_ma(
        const CxAllocator *allocator,
        cxmutstr string,
        cxstring delim,
        size_t limit,
        cxmutstr **output
) {
    return cx_strsplit_a(allocator, cx_strcast(string),
                         delim, limit, (cxstring **) output);
}

int cx_strcmp(
        cxstring s1,
        cxstring s2
) {
    if (s1.length == s2.length) {
        return memcmp(s1.ptr, s2.ptr, s1.length);
    } else if (s1.length > s2.length) {
        return 1;
    } else {
        return -1;
    }
}

int cx_strcasecmp(
        cxstring s1,
        cxstring s2
) {
    if (s1.length == s2.length) {
#ifdef _WIN32
        return _strnicmp(s1.ptr, s2.ptr, s1.length);
#else
        return strncasecmp(s1.ptr, s2.ptr, s1.length);
#endif
    } else if (s1.length > s2.length) {
        return 1;
    } else {
        return -1;
    }
}

int cx_strcmp_p(
        const void *s1,
        const void *s2
) {
    const cxstring *left = s1;
    const cxstring *right = s2;
    return cx_strcmp(*left, *right);
}

int cx_strcasecmp_p(
        const void *s1,
        const void *s2
) {
    const cxstring *left = s1;
    const cxstring *right = s2;
    return cx_strcasecmp(*left, *right);
}

cxmutstr cx_strdup_a(
        const CxAllocator *allocator,
        cxstring string
) {
    cxmutstr result = {
            cxMalloc(allocator, string.length + 1),
            string.length
    };
    if (result.ptr == NULL) {
        result.length = 0;
        return result;
    }
    memcpy(result.ptr, string.ptr, string.length);
    result.ptr[string.length] = '\0';
    return result;
}

cxstring cx_strtrim(cxstring string) {
    cxstring result = string;
    // TODO: optimize by comparing multiple bytes at once
    while (result.length > 0 && isspace(*result.ptr)) {
        result.ptr++;
        result.length--;
    }
    while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
        result.length--;
    }
    return result;
}

cxmutstr cx_strtrim_m(cxmutstr string) {
    cxstring result = cx_strtrim(cx_strcast(string));
    return (cxmutstr) {(char *) result.ptr, result.length};
}

bool cx_strprefix(
        cxstring string,
        cxstring prefix
) {
    if (string.length < prefix.length) return false;
    return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
}

bool cx_strsuffix(
        cxstring string,
        cxstring suffix
) {
    if (string.length < suffix.length) return false;
    return memcmp(string.ptr + string.length - suffix.length,
                  suffix.ptr, suffix.length) == 0;
}

bool cx_strcaseprefix(
        cxstring string,
        cxstring prefix
) {
    if (string.length < prefix.length) return false;
#ifdef _WIN32
    return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
#else
    return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
#endif
}

bool cx_strcasesuffix(
        cxstring string,
        cxstring suffix
) {
    if (string.length < suffix.length) return false;
#ifdef _WIN32
    return _strnicmp(string.ptr+string.length-suffix.length,
                  suffix.ptr, suffix.length) == 0;
#else
    return strncasecmp(string.ptr + string.length - suffix.length,
                       suffix.ptr, suffix.length) == 0;
#endif
}

void cx_strlower(cxmutstr string) {
    for (size_t i = 0; i < string.length; i++) {
        string.ptr[i] = (char) tolower(string.ptr[i]);
    }
}

void cx_strupper(cxmutstr string) {
    for (size_t i = 0; i < string.length; i++) {
        string.ptr[i] = (char) toupper(string.ptr[i]);
    }
}

#ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE
#define CX_STRREPLACE_INDEX_BUFFER_SIZE 64
#endif

struct cx_strreplace_ibuf {
    size_t *buf;
    struct cx_strreplace_ibuf *next;
    unsigned int len;
};

static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
    while (buf) {
        struct cx_strreplace_ibuf *next = buf->next;
        free(buf->buf);
        free(buf);
        buf = next;
    }
}

cxmutstr cx_strreplacen_a(
        const CxAllocator *allocator,
        cxstring str,
        cxstring pattern,
        cxstring replacement,
        size_t replmax
) {

    if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
        return cx_strdup_a(allocator, str);

    // Compute expected buffer length
    size_t ibufmax = str.length / pattern.length;
    size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
    if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) {
        ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE;
    }

    // Allocate first index buffer
    struct cx_strreplace_ibuf *firstbuf, *curbuf;
    firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
    if (!firstbuf) return cx_mutstrn(NULL, 0);
    firstbuf->buf = calloc(ibuflen, sizeof(size_t));
    if (!firstbuf->buf) {
        free(firstbuf);
        return cx_mutstrn(NULL, 0);
    }

    // Search occurrences
    cxstring searchstr = str;
    size_t found = 0;
    do {
        cxstring match = cx_strstr(searchstr, pattern);
        if (match.length > 0) {
            // Allocate next buffer in chain, if required
            if (curbuf->len == ibuflen) {
                struct cx_strreplace_ibuf *nextbuf =
                        calloc(1, sizeof(struct cx_strreplace_ibuf));
                if (!nextbuf) {
                    cx_strrepl_free_ibuf(firstbuf);
                    return cx_mutstrn(NULL, 0);
                }
                nextbuf->buf = calloc(ibuflen, sizeof(size_t));
                if (!nextbuf->buf) {
                    free(nextbuf);
                    cx_strrepl_free_ibuf(firstbuf);
                    return cx_mutstrn(NULL, 0);
                }
                curbuf->next = nextbuf;
                curbuf = nextbuf;
            }

            // Record match index
            found++;
            size_t idx = match.ptr - str.ptr;
            curbuf->buf[curbuf->len++] = idx;
            searchstr.ptr = match.ptr + pattern.length;
            searchstr.length = str.length - idx - pattern.length;
        } else {
            break;
        }
    } while (searchstr.length > 0 && found < replmax);

    // Allocate result string
    cxmutstr result;
    {
        ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
        size_t rcount = 0;
        curbuf = firstbuf;
        do {
            rcount += curbuf->len;
            curbuf = curbuf->next;
        } while (curbuf);
        result.length = str.length + rcount * adjlen;
        result.ptr = cxMalloc(allocator, result.length + 1);
        if (!result.ptr) {
            cx_strrepl_free_ibuf(firstbuf);
            return cx_mutstrn(NULL, 0);
        }
    }

    // Build result string
    curbuf = firstbuf;
    size_t srcidx = 0;
    char *destptr = result.ptr;
    do {
        for (size_t i = 0; i < curbuf->len; i++) {
            // Copy source part up to next match
            size_t idx = curbuf->buf[i];
            size_t srclen = idx - srcidx;
            if (srclen > 0) {
                memcpy(destptr, str.ptr + srcidx, srclen);
                destptr += srclen;
                srcidx += srclen;
            }

            // Copy the replacement and skip the source pattern
            srcidx += pattern.length;
            memcpy(destptr, replacement.ptr, replacement.length);
            destptr += replacement.length;
        }
        curbuf = curbuf->next;
    } while (curbuf);
    memcpy(destptr, str.ptr + srcidx, str.length - srcidx);

    // Result is guaranteed to be zero-terminated
    result.ptr[result.length] = '\0';

    // Free index buffer
    cx_strrepl_free_ibuf(firstbuf);

    return result;
}

CxStrtokCtx cx_strtok(
        cxstring str,
        cxstring delim,
        size_t limit
) {
    CxStrtokCtx ctx;
    ctx.str = str;
    ctx.delim = delim;
    ctx.limit = limit;
    ctx.pos = 0;
    ctx.next_pos = 0;
    ctx.delim_pos = 0;
    ctx.found = 0;
    ctx.delim_more = NULL;
    ctx.delim_more_count = 0;
    return ctx;
}

CxStrtokCtx cx_strtok_m(
        cxmutstr str,
        cxstring delim,
        size_t limit
) {
    return cx_strtok(cx_strcast(str), delim, limit);
}

bool cx_strtok_next(
        CxStrtokCtx *ctx,
        cxstring *token
) {
    // abortion criteria
    if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) {
        return false;
    }

    // determine the search start
    cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos);

    // search the next delimiter
    cxstring delim = cx_strstr(haystack, ctx->delim);

    // if found, make delim capture exactly the delimiter
    if (delim.length > 0) {
        delim.length = ctx->delim.length;
    }

    // if more delimiters are specified, check them now
    if (ctx->delim_more_count > 0) {
        for (size_t i = 0; i < ctx->delim_more_count; i++) {
            cxstring d = cx_strstr(haystack, ctx->delim_more[i]);
            if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) {
                delim.ptr = d.ptr;
                delim.length = ctx->delim_more[i].length;
            }
        }
    }

    // store the token information and adjust the context
    ctx->found++;
    ctx->pos = ctx->next_pos;
    token->ptr = &ctx->str.ptr[ctx->pos];
    ctx->delim_pos = delim.length == 0 ?
                     ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr);
    token->length = ctx->delim_pos - ctx->pos;
    ctx->next_pos = ctx->delim_pos + delim.length;

    return true;
}

bool cx_strtok_next_m(
        CxStrtokCtx *ctx,
        cxmutstr *token
) {
    return cx_strtok_next(ctx, (cxstring *) token);
}

void cx_strtok_delim(
        CxStrtokCtx *ctx,
        const cxstring *delim,
        size_t count
) {
    ctx->delim_more = delim;
    ctx->delim_more_count = count;
}