src/highlighter.c

changeset 91
2c8514b3891b
parent 90
98adda6171d1
child 92
fe4dfb4d074d
--- a/src/highlighter.c	Sun Mar 02 12:47:31 2025 +0100
+++ b/src/highlighter.c	Sun Mar 02 16:06:24 2025 +0100
@@ -75,6 +75,107 @@
     return 1;
 }
 
+/* Tells whether c is a digit of the selected base (bin, hex, or decimal). */
+static bool check_number_digit(char c, bool hex, bool bin) {
+    if (bin) return c == '0' || c == '1';
+    if (c >= '0' && c <= '9') return true;
+    return hex && ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'));
+}
+
+/*
+ * Checks whether str starts with a numeric literal: optional sign, 0x/0X or
+ * 0b/0B prefix, digits (groupable with '), one decimal point, one exponent
+ * (e/E, or p/P for hex) with optional sign, and a type suffix.
+ * This check is not precise, but a good over-approximation.
+ * Returns the length of the literal (sign and suffix included), or zero
+ * when str does not start with a number.
+ */
+static size_t check_number(const char *str) {
+    size_t i = 0;
+    if (str[0] == '+' || str[0] == '-') {
+        i++;
+    }
+    bool hex = str[i] == '0' && (str[i + 1] == 'x' || str[i + 1] == 'X');
+    bool bin = str[i] == '0' && (str[i + 1] == 'b' || str[i + 1] == 'B');
+    if (hex || bin) {
+        i += 2;
+    }
+    bool flt = false;
+    bool exp = false;
+    bool dot = false;
+    bool digit_seen = false;
+    if (str[i] == '.') {
+        dot = true;
+        flt = true;
+        i++;
+    }
+    const char exp_char_low = hex ? 'p' : 'e';
+    const char exp_char_up = hex ? 'P' : 'E';
+    while (str[i] != '\0' && str[i] != '\n') {
+        const char c = str[i];
+        if (c == '\'') {
+            /* a grouping char must sit between two digits, otherwise it
+             * starts something else (e.g. the char literal in c-'0') */
+            bool grouped = i > 0 && check_number_digit(str[i - 1], hex, bin)
+                    && check_number_digit(str[i + 1], hex, bin);
+            if (!grouped) break;
+            i++;
+        } else if (check_number_digit(c, hex, bin)) {
+            digit_seen = true;
+            i++;
+        } else if (bin) {
+            /* binary is always integer, nothing else allowed */
+            break;
+        } else if (!dot && c == '.') {
+            dot = true;
+            flt = true;
+            i++;
+        } else if (!exp && digit_seen
+                && (c == exp_char_low || c == exp_char_up)) {
+            /* exponent separator, a sign may directly follow */
+            exp = true;
+            flt = true;
+            i++;
+            if (str[i] == '+' || str[i] == '-') {
+                i++;
+            }
+        } else {
+            break;
+        }
+    }
+    /* have we seen at least one digit? */
+    if (!digit_seen) return 0;
+
+    /* check if we are already done (over-approximation);
+     * the cast keeps isalpha() defined for negative char values */
+    if (!isalpha((unsigned char) str[i])) return i;
+
+    /* check suffixes (must check with decreasing length) */
+    static const char *const flt_suffixes[] = {
+        "f128", "bf16", "F128", "BF16",
+        "f16", "f32", "f64", "F16", "F32", "F64",
+        "df", "DF", "dd", "DD", "dl", "DL",
+        "d", "D", "f", "l", "F", "L",
+    };
+    static const char *const int_suffixes[] = {
+        "ull", "ULL",
+        "ul", "UL", "ll", "LL", "wb", "WB",
+        "u", "U", "l", "L",
+    };
+    const char *const *suffixes = flt ? flt_suffixes : int_suffixes;
+    const size_t count = flt ? sizeof flt_suffixes / sizeof *flt_suffixes
+                             : sizeof int_suffixes / sizeof *int_suffixes;
+    for (size_t j = 0; j < count; j++) {
+        const size_t len = strlen(suffixes[j]);
+        /* strncmp stops at the terminator, so it never reads beyond str */
+        if (strncmp(str + i, suffixes[j], len) == 0) {
+            return i + len;
+        }
+    }
+    /* no suffix matched */
+    return 0;
+}
+
 /* Plaintext Highlighter */
 
 void c2html_plain_highlighter(char const *src, CxBuffer *dest,
@@ -246,6 +347,22 @@
             } else {
                 if (isstring) {
                     put_htmlescaped(dest, c);
+                } else if (wbuf->size == 0 &&
+                    (isdigit(c) ||  c == '+' || c == '-' || c == '.')
+                ) {
+                    /* might be a number */
+                    size_t numlen = check_number(src+sp);
+                    if (numlen > 0) {
+                        start_span("number");
+                        put_htmlescapedstr(dest, cx_strn(src+sp, numlen));
+                        stop_span;
+                        sp += numlen - 1;
+                        c = src[sp];
+                        continue;
+                    } else {
+                        /* start a new buffered word */
+                        cxBufferPut(wbuf, c);
+                    }
                 } else if (isalnum(c) ||  c == '_' || c == '#') {
                     /* buffer the current word */
                     cxBufferPut(wbuf, c);
@@ -271,8 +388,14 @@
                         if (closespan) {
                             stop_span;
                         }
+
+                        /* reset word buffer */
+                        wbuf->pos = wbuf->size = 0;
+
+                        /* re-test current char */
+                        c = src[--sp];
+                        continue;
                     }
-                    wbuf->pos = wbuf->size = 0; /* reset word buffer */
                     
                     /* write current character */
                     put_htmlescaped(dest, c);
@@ -367,6 +490,23 @@
             } else {
                 if (isstring) {
                     put_htmlescaped(dest, c);
+                } else if (wbuf->size == 0 &&
+                    (isdigit(c) ||  c == '+' || c == '-' || c == '.')
+                ) {
+                    /* might be a number */
+                    size_t numlen = check_number(src+sp);
+                    if (numlen > 0) {
+                        cxBufferPutString(dest,
+                                "<span class=\"c2html-number\">");
+                        put_htmlescapedstr(dest, cx_strn(src+sp, numlen));
+                        cxBufferPutString(dest, "</span>");
+                        sp += numlen - 1;
+                        c = src[sp];
+                        continue;
+                    } else {
+                        /* start a new buffered word */
+                        cxBufferPut(wbuf, c);
+                    }
                 } else if (isalnum(c) || c == '_' || c == '@') {
                     /* buffer the current word */
                     cxBufferPut(wbuf, c);
@@ -395,8 +535,14 @@
                         if (closespan) {
                             cxBufferPutString(dest, "</span>");
                         }
+
+                        /* reset word buffer */
+                        wbuf->pos = wbuf->size = 0;
+
+                        /* re-test current char */
+                        c = src[--sp];
+                        continue;
                     }
-                    wbuf->pos = wbuf->size = 0; /* reset buffer */
                     
                     /* write current character */
                     put_htmlescaped(dest, c);

mercurial