/**
 * Checks whether the text at the start of a string looks like a numeric
 * literal and determines how many characters belong to it.
 *
 * This function is not precise, but a good over-approximation.
 * Recognized are:
 *  - an optional leading sign (+ or -)
 *  - an optional 0x / 0X (hexadecimal) or 0b / 0B (binary) prefix
 *  - digits, with the grouping character ' ignored wherever it appears
 *  - at most one decimal point and at most one exponent
 *    (e / E, or p / P after a hexadecimal prefix), where the exponent
 *    may directly be followed by a sign
 *  - an optional suffix taken from the tables below
 *
 * Scanning stops at the string terminator or at a line break.
 *
 * @param str the zero-terminated text to examine
 * @return the number of characters that form the number (including sign,
 * prefix, and suffix), or zero when the text does not start with a number
 */
static size_t check_number(const char *str) {
    /* suffix tables (must be ordered by decreasing length,
     * because the first match wins) */
    static const char *const flt_suffixes[] = {
        "f128", "bf16", "F128", "BF16",
        "f16", "f32", "f64", "F16", "F32", "F64",
        "df", "DF", "dd", "DD", "dl", "DL",
        "d", "D", "f", "l", "F", "L",
    };
    static const char *const int_suffixes[] = {
        "ull", "ULL",
        "ul", "UL", "ll", "LL", "wb", "WB",
        "u", "U", "l", "L",
    };

    size_t i = 0;
    if (str[0] == '+' || str[0] == '-') {
        i++;
    }
    bool hex = str[i] == '0' && (str[i + 1] == 'x' || str[i + 1] == 'X');
    bool bin = str[i] == '0' && (str[i + 1] == 'b' || str[i + 1] == 'B');
    if (hex || bin) {
        i += 2;
    }
    bool flt = false;
    bool exp_seen = false;
    bool dot = false;
    bool digit_seen = false;
    if (str[i] == '.') {
        dot = true;
        flt = true;
        i++;
    }
    /* hexadecimal floats use p as exponent char, since e is a digit */
    const char exp_char_low = hex ? 'p' : 'e';
    const char exp_char_up = hex ? 'P' : 'E';
    while (str[i] != '\0' && str[i] != '\n') {
        /* ignore grouping char */
        if (str[i] == '\'') {
            i++;
            continue;
        }
        /* binary is always integer, nothing else allowed */
        if (bin) {
            if (str[i] != '0' && str[i] != '1') {
                break;
            }
            i++;
            digit_seen = true;
            continue;
        }
        /* detect decimal and exponent separators */
        if ((!dot && str[i] == '.') ||
            (!exp_seen && digit_seen &&
                (str[i] == exp_char_low || str[i] == exp_char_up)
            )
        ) {
            if (str[i] == '.') {
                dot = true;
            } else {
                exp_seen = true;
                /* a sign may directly follow */
                if (str[i + 1] == '+' || str[i + 1] == '-') {
                    i++;
                }
            }
            flt = true;
            i++;
            continue;
        }
        /* check for allowed digits */
        if ((str[i] >= '0' && str[i] <= '9') || (hex && (
            (str[i] >= 'a' && str[i] <= 'f')
            || (str[i] >= 'A' && str[i] <= 'F')
        ))) {
            digit_seen = true;
            i++;
        } else {
            break;
        }
    }
    /* have we seen at least one digit? */
    if (!digit_seen) return 0;

    /* check if we are already done (over-approximation);
     * the cast is required, because passing a negative char value
     * to isalpha() is undefined behavior */
    if (!isalpha((unsigned char) str[i])) return i;

    /* check suffixes */
    const char *const *allowed_suffixes = flt ? flt_suffixes : int_suffixes;
    const size_t allowed_suffixes_len = flt
            ? sizeof(flt_suffixes) / sizeof(flt_suffixes[0])
            : sizeof(int_suffixes) / sizeof(int_suffixes[0]);
    const char *testee = str + i;
    for (size_t j = 0; j < allowed_suffixes_len; j++) {
        const size_t suffix_len = strlen(allowed_suffixes[j]);
        /* strncmp() stops at the terminator of the testee, so a suffix
         * longer than the remaining text never reads out of bounds */
        if (strncmp(testee, allowed_suffixes[j], suffix_len) == 0) {
            return i + suffix_len;
        }
    }
    /* no suffix matched */
    return 0;
}
put_htmlescaped(dest, c); + } else if (wbuf->size == 0 && + (isdigit(c) || c == '+' || c == '-' || c == '.') + ) { + /* might be a number */ + size_t numlen = check_number(src+sp); + if (numlen > 0) { + cxBufferPutString(dest, + "<span class=\"c2html-number\">"); + put_htmlescapedstr(dest, cx_strn(src+sp, numlen)); + cxBufferPutString(dest, "</span>"); + sp += numlen - 1; + c = src[sp]; + continue; + } else { + /* start a new buffered word */ + cxBufferPut(wbuf, c); + } } else if (isalnum(c) || c == '_' || c == '@') { /* buffer the current word */ cxBufferPut(wbuf, c); @@ -395,8 +535,14 @@ if (closespan) { cxBufferPutString(dest, "</span>"); } + + /* reset word buffer */ + wbuf->pos = wbuf->size = 0; + + /* re-test current char */ + c = src[--sp]; + continue; } - wbuf->pos = wbuf->size = 0; /* reset buffer */ /* write current character */ put_htmlescaped(dest, c);