src/regex_parser.c

Fri, 03 Jun 2022 20:05:15 +0200

author
Mike Becker <universe@uap-core.de>
date
Fri, 03 Jun 2022 20:05:15 +0200
changeset 66
be2084398c37
parent 57
68018eac46c3
permissions
-rw-r--r--

new feature: count non-whitespace characters

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 
 * Copyright 2018 Mike Becker. All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "regex_parser.h"
#include <ctype.h>

/*
 * Allocates a regex parser and puts it into its initial state:
 * a fresh pattern list, no compiled patterns, no active block,
 * a zeroed counter and line counting (count_chars == false).
 *
 * Returns NULL when the parser itself cannot be allocated.
 */
regex_parser_t* new_regex_parser_t() {
  regex_parser_t* parser = malloc(sizeof(regex_parser_t));
  if (parser == NULL) {
    return NULL;
  }

  parser->pattern_list = new_string_list_t();

  /* nothing compiled yet */
  parser->compiled_patterns = NULL;
  parser->compiled_pattern_count = 0;

  /* not inside a block, nothing counted */
  parser->pattern_match = 0;
  parser->matched_counted = 0;
  parser->count_chars = false;

  return parser;
}

/*
 * Resets the matching state of the parser: the counter is zeroed and
 * the parser no longer considers itself inside a block.
 * Compiled patterns and the pattern list are left untouched.
 */
void regex_parser_reset(regex_parser_t* parser) {
  parser->matched_counted = 0;
  parser->pattern_match = 0;
}

/*
 * Destroys all compiled patterns of the parser.
 *
 * Each non-NULL entry is first handed to regfree(), because regcomp()
 * allocates internal memory that a plain free() of the regex_t would
 * leak. Then the entry itself and finally the pointer array are freed.
 * Afterwards compiled_patterns is NULL and compiled_pattern_count is 0.
 *
 * Does nothing when no pattern array is allocated.
 */
void regex_destcomppats(regex_parser_t* parser) {
  if (parser->compiled_patterns == NULL) {
    return;
  }

  for (unsigned i = 0 ; i < parser->compiled_pattern_count ; i++) {
    /* slots of patterns that failed to compile are NULL */
    if (parser->compiled_patterns[i] != NULL) {
      regfree(parser->compiled_patterns[i]);
      free(parser->compiled_patterns[i]);
    }
  }

  free(parser->compiled_patterns);
  parser->compiled_patterns = NULL;
  parser->compiled_pattern_count = 0;
}

/*
 * Destroys a parser: releases the compiled patterns, the pattern list
 * and the parser object itself.
 *
 * Passing NULL is allowed and does nothing (like free()), so the result
 * of a failed new_regex_parser_t() can be handed over safely.
 */
void destroy_regex_parser_t(regex_parser_t* parser) {
  if (parser == NULL) {
    return;
  }
  regex_destcomppats(parser);
  destroy_string_list_t(parser->pattern_list);
  free(parser);
}

/*
 * Tells whether the parser is currently inside a block, i.e. whether
 * pattern_match is greater than zero.
 */
bool regex_parser_matching(regex_parser_t* parser) {
  if (parser->pattern_match > 0) {
    return true;
  }
  return false;
}

/*
 * Counts the non-whitespace characters of input in the half-open
 * index range [start, end).
 *
 * Whitespace is whatever isspace() reports for the current locale.
 * Returns 0 when start >= end.
 */
static unsigned regex_parser_count_chars(const char* input,
                                         unsigned start, unsigned end) {
  unsigned ret = 0;
  for (unsigned i = start ; i < end ; i++) {
    /* isspace() requires a value representable as unsigned char (or EOF);
     * a plain char may be negative for bytes >= 0x80 */
    if (!isspace((unsigned char) input[i])) {
      ret++;
    }
  }
  return ret;
}

/*
 * Feeds one line of input to the parser and updates matched_counted.
 *
 * The compiled patterns are used in pairs: an even index holds a start
 * pattern and the following odd index the corresponding end pattern.
 *
 * - While inside a block (regex_parser_matching()), the line is counted
 *   (one line, or its non-whitespace characters when count_chars is set)
 *   and the end pattern is tried. If it matches, the block is left and
 *   whatever follows the end of the match is not counted.
 * - Otherwise the start patterns are tried in order. On the first match
 *   the counter is reset, the block is entered and the same line is
 *   processed again to look for the end pattern. Whatever precedes the
 *   start of the match is not counted.
 *
 * Pattern slots that are NULL (patterns that failed to compile) are
 * never passed to regexec(): a NULL start pattern is skipped and a NULL
 * end pattern is treated as "no match".
 *
 * NOTE(review): the end pattern is searched from the beginning of the
 * line, not from the end of the start match - confirm this is intended
 * for lines where the end pattern occurs before the start pattern.
 *
 * Returns the result of the last regexec() call made by this invocation
 * (0 when a start pattern matched), or REG_NOMATCH when no pattern was
 * tried.
 */
int regex_parser_do(regex_parser_t* parser, char* input) {
  int err = REG_NOMATCH;
  if (parser->compiled_pattern_count == 0) {
    return err;
  }

  regmatch_t match;

  if (regex_parser_matching(parser)) {
    size_t input_len = strlen(input);

    /* tentatively count the entire line */
    if (parser->count_chars) {
      parser->matched_counted +=
          regex_parser_count_chars(input, 0, input_len);
    } else {
      parser->matched_counted++;
    }

    /* the end pattern may have failed to compile */
    if (parser->compiled_patterns[parser->pattern_match] == NULL) {
      return err;
    }

    err = regexec(parser->compiled_patterns[parser->pattern_match],
        input, 1, &match, 0);
    if (err > 0 && err != REG_NOMATCH) {
      fprintf(stderr, "Regex-Error: 0x%08x\n", (unsigned) err);
    }
    if (err == 0) {
      parser->pattern_match = 0;
      /* rm_eo is non-negative after a successful match */
      if ((size_t) match.rm_eo < input_len) {
        if (parser->count_chars) {
          /* do not exclude chars that occur after pattern end */
          parser->matched_counted -=
              regex_parser_count_chars(input, match.rm_eo, input_len);
        } else {
          /* do not exclude line, if it does not end with the pattern */
          parser->matched_counted--;
        }
      }
    }
    return err;
  }

  for (unsigned i = 0 ; i + 1 < parser->compiled_pattern_count ; i += 2) {
    /* skip start patterns that failed to compile */
    if (parser->compiled_patterns[i] == NULL) {
      continue;
    }

    err = regexec(parser->compiled_patterns[i], input, 1, &match, 0);
    if (err > 0 && err != REG_NOMATCH) {
      fprintf(stderr, "Regex-Error: 0x%08x\n", (unsigned) err);
    }
    if (err != 0) {
      continue;
    }

    /* a start pattern matches, start counting */
    parser->matched_counted = 0;
    /* Check, if end pattern is also in this line */
    parser->pattern_match = i+1;
    regex_parser_do(parser, input);
    /* If something was found, determine what exactly to exclude */
    if (parser->matched_counted > 0) {
      if (parser->count_chars) {
        /* do not exclude the characters before the pattern */
        parser->matched_counted -=
            regex_parser_count_chars(input, 0, match.rm_so);
      } else if (match.rm_so > 0) {
        /* do not match line, if it does not start with the pattern */
        parser->matched_counted--;
      }
    }
    break;
  }

  return err;
}

/*
 * Compiles every pattern of the parser's pattern list as a POSIX
 * extended regular expression.
 *
 * Previously compiled patterns are destroyed first. The resulting array
 * has one slot per pattern; the slot of a pattern that could not be
 * compiled (or allocated) is NULL.
 *
 * Returns true when all patterns were compiled, or when the pattern
 * list is empty (in which case nothing is changed). Returns false when
 * at least one pattern failed or memory could not be allocated.
 */
bool regex_compile_all(regex_parser_t* parser) {
  size_t pcount = parser->pattern_list->count;
  if (pcount == 0) {
    return true;
  }

  regex_destcomppats(parser);

  /* array of pointers: size it by its element type */
  parser->compiled_patterns =
      calloc(pcount, sizeof *parser->compiled_patterns);
  if (parser->compiled_patterns == NULL) {
    parser->compiled_pattern_count = 0;
    return false;
  }
  parser->compiled_pattern_count = pcount;

  bool success = true;
  for (size_t i = 0 ; i < pcount ; i++) {
    parser->compiled_patterns[i] = NULL;

    regex_t* re = malloc(sizeof *re);
    if (re == NULL) {
      success = false;
      continue;
    }

    if (regcomp(re, parser->pattern_list->items[i], REG_EXTENDED) == 0) {
      parser->compiled_patterns[i] = re;
    } else {
      fprintf(stderr, "Cannot compile pattern: %s\n",
          (parser->pattern_list->items[i]));
      /* regcomp failed, so there is nothing to regfree();
       * only the container must be released */
      free(re);
      success = false;
    }
  }
  return success;
}

mercurial