oceanbase/deps/oblib/unittest/lib/regex/test_regex.cpp

/**
 * Copyright (c) 2021 OceanBase
 * OceanBase CE is licensed under Mulan PubL v2.
 * You can use this software according to the terms and conditions of the Mulan PubL v2.
 * You may obtain a copy of Mulan PubL v2 at:
 *          http://license.coscl.org.cn/MulanPubL-2.0
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PubL v2 for more details.
 */

#include <gtest/gtest.h>
#include "lib/ob_define.h"

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include "lib/regex/regex/ob_regex.h"
#include <assert.h>

#include "test_regex.ih"

#include "lib/utility/ob_macro_utils.h"

using namespace oceanbase::common;

int debug = 0;
int line = 0;
int status = 0;

int copts = OB_REG_EXTENDED;
int eopts = 0;
ob_regoff_t startoff = 0;
ob_regoff_t endoff = 0;

// extern void regprint();

/*
 - split - divide a string into fields, like awk split()
 = int split(char *string, char *fields[], int nfields, char *sep);
 */
int                                     /* number of fields, including overflow */
    split(char* string, char* fields[], /* list is not NULL-terminated */
        int nfields,                    /* number of entries available in fields[] */
        char* sep                       /* "" white, "c" single char, "ab" [ab]+ */
    )
{
  register char* p = string;
  register char c; /* latest character */
  register char sepc = sep[0];
  register char sepc2;
  register int fn;
  register char** fp = fields;
  register char* sepp;
  register int trimtrail;

  /* white space */
  if (sepc == '\0') {
    while ((c = *p++) == ' ' || c == '\t')
      continue;
    p--;
    trimtrail = 1;
    static char static_sep[5] = " \t";
    sep = static_sep; /* note, code below knows this is 2 long */
    sepc = ' ';
  } else
    trimtrail = 0;
  sepc2 = sep[1]; /* now we can safely pick this up */

  /* catch empties */
  if (*p == '\0')
    return (0);

  /* single separator */
  if (sepc2 == '\0') {
    fn = nfields;
    for (;;) {
      *fp++ = p;
      fn--;
      if (fn == 0)
        break;
      while ((c = *p++) != sepc)
        if (c == '\0')
          return (nfields - fn);
      *(p - 1) = '\0';
    }
    /* we have overflowed the fields vector -- just count them */
    fn = nfields;
    for (;;) {
      while ((c = *p++) != sepc)
        if (c == '\0')
          return (fn);
      fn++;
    }
    /* not reached */
  }

  /* two separators */
  if (sep[2] == '\0') {
    fn = nfields;
    for (;;) {
      *fp++ = p;
      fn--;
      while ((c = *p++) != sepc && c != sepc2)
        if (c == '\0') {
          if (trimtrail && **(fp - 1) == '\0')
            fn++;
          return (nfields - fn);
        }
      if (fn == 0)
        break;
      *(p - 1) = '\0';
      while ((c = *p++) == sepc || c == sepc2)
        continue;
      p--;
    }
    /* we have overflowed the fields vector -- just count them */
    fn = nfields;
    while (c != '\0') {
      while ((c = *p++) == sepc || c == sepc2)
        continue;
      p--;
      fn++;
      while ((c = *p++) != '\0' && c != sepc && c != sepc2)
        continue;
    }
    /* might have to trim trailing white space */
    if (trimtrail) {
      p--;
      while ((c = *--p) == sepc || c == sepc2)
        continue;
      p++;
      if (*p != '\0') {
        if (fn == nfields + 1)
          *p = '\0';
        fn--;
      }
    }
    return (fn);
  }

  /* n separators */
  fn = 0;
  for (;;) {
    if (fn < nfields)
      *fp++ = p;
    fn++;
    for (;;) {
      c = *p++;
      if (c == '\0')
        return (fn);
      sepp = sep;
      while ((sepc = *sepp++) != '\0' && sepc != c)
        continue;
      if (sepc != '\0') /* it was a separator */
        break;
    }
    if (fn < nfields)
      *(p - 1) = '\0';
    for (;;) {
      c = *p++;
      sepp = sep;
      while ((sepc = *sepp++) != '\0' && sepc != c)
        continue;
      if (sepc == '\0') /* it wasn't a separator */
        break;
    }
    p--;
  }

  /* not reached */
}

/*
 - regress - main loop of regression test
 == bool regress(FILE *in);
 */
bool regress(FILE* in)
{
  char inbuf[1000];
#define MAXF 10
  char* f[MAXF];
  int nf;
  int i;
  char erbuf[100];
  size_t ne;
  const char* badpat = "invalid regular expression";
#define SHORT 10
  const char* bpname = "OB_REG_BADPAT";
  ob_regex_t re;
  char sep[5] = "\t\t";

  while (fgets(inbuf, sizeof(inbuf), in) != NULL) {
    line++;
    if (inbuf[0] == '#' || inbuf[0] == '\n')
      continue;                      /* NOTE CONTINUE */
    inbuf[strlen(inbuf) - 1] = '\0'; /* get rid of stupid \n */
    if (debug)
      fprintf(stdout, "%d:\n", line);
    nf = split(inbuf, f, MAXF, sep);
    if (nf < 3) {
      fprintf(stderr, "bad input, line %d\n", line);
      return false;
    }
    for (i = 0; i < nf; i++)
      if (strcmp(f[i], "\"\"") == 0)
        f[i][0] = '\0';
    // f[i] = "";
    if (nf <= 3)
      f[3] = NULL;
    if (nf <= 4)
      f[4] = NULL;
    try_case(f[0], f[1], f[2], f[3], f[4], options('c', f[1]));
    if (opt('&', f[1])) /* try with either type of RE */
      try_case(f[0], f[1], f[2], f[3], f[4], options('c', f[1]) & ~OB_REG_EXTENDED);
  }

  ne = ob_regerror(OB_REG_BADPAT, (ob_regex_t*)NULL, erbuf, sizeof(erbuf));
  if (strcmp(erbuf, badpat) != 0 || ne != strlen(badpat) + 1) {
    fprintf(stderr, "end: regerror() test gave `%s' not `%s'\n", erbuf, badpat);
    status = 1;
  }
  ne = ob_regerror(OB_REG_BADPAT, (ob_regex_t*)NULL, erbuf, (size_t)SHORT);
  if (strncmp(erbuf, badpat, SHORT - 1) != 0 || erbuf[SHORT - 1] != '\0' || ne != strlen(badpat) + 1) {
    fprintf(stderr, "end: regerror() short test gave `%s' not `%.*s'\n", erbuf, SHORT - 1, badpat);
    status = 1;
  }
  ne = ob_regerror(OB_REG_ITOA | OB_REG_BADPAT, (ob_regex_t*)NULL, erbuf, sizeof(erbuf));
  if (strcmp(erbuf, bpname) != 0 || ne != strlen(bpname) + 1) {
    fprintf(stderr, "end: regerror() ITOA test gave `%s' not `%s'\n", erbuf, bpname);
    status = 1;
  }
  re.re_endp = bpname;
  ne = ob_regerror(OB_REG_ATOI, &re, erbuf, sizeof(erbuf));
  if (atoi(erbuf) != (int)OB_REG_BADPAT) {
    fprintf(stderr, "end: regerror() ATOI test gave `%s' not `%ld'\n", erbuf, (long)OB_REG_BADPAT);
    status = 1;
  } else if (ne != strlen(erbuf) + 1) {
    fprintf(stderr, "end: regerror() ATOI test len(`%s') = %ld\n", erbuf, (long)OB_REG_BADPAT);
    status = 1;
  }
  return 0 == status;
}

/*
 - try_case - try it, and report on problems
 == void try_case(char *f0, char *f1, char *f2, char *f3, char *f4, int opts);
 */
void try_case(char* f0, char* f1, char* f2, char* f3, char* f4, int opts /* may not match f1 */
)
{
  ob_regex_t re;
#define NSUBS 10
  ob_regmatch_t subs[NSUBS];
#define NSHOULD 15
  char* should[NSHOULD];
  int nshould;
  char erbuf[100];
  int err;
  int len;
  const char* type = (opts & OB_REG_EXTENDED) ? "ERE" : "BRE";
  register int i;
  char* grump;
  char f0copy[1000];
  char f2copy[1000];
  char sep[5] = ",";

  strcpy(f0copy, f0);
  re.re_endp = (opts & OB_REG_PEND) ? f0copy + strlen(f0copy) : NULL;
  fixstr(f0copy);
  err = ob_regcomp(&re, f0copy, opts, &ob_charset_utf8mb4_general_ci);
  if (err != 0 && (!opt('C', f1) || err != efind(f2))) {
    /* unexpected error or wrong error */
    len = (int)ob_regerror(err, &re, erbuf, sizeof(erbuf));
    fprintf(stderr, "%d: %s error %s, %d/%d `%s'\n", line, type, eprint(err), len, (int)sizeof(erbuf), erbuf);
    status = 1;
  } else if (err == 0 && opt('C', f1)) {
    /* unexpected success */
    fprintf(stderr, "%d: %s should have given OB_REG_%s\n", line, type, f2);
    status = 1;
    err = 1; /* so we won't try regexec */
  }

  if (err != 0) {
    ob_regfree(&re);
    return;
  }

  strcpy(f2copy, f2);
  fixstr(f2copy);

  if (options('e', f1) & OB_REG_STARTEND) {
    if (strchr(f2, '(') == NULL || strchr(f2, ')') == NULL)
      fprintf(stderr, "%d: bad STARTEND syntax\n", line);
    subs[0].rm_so = strchr(f2, '(') - f2 + 1;
    subs[0].rm_eo = strchr(f2, ')') - f2;
  }
  err = ob_regexec(&re, f2copy, NSUBS, subs, options('e', f1));

  if (err != 0 && (f3 != NULL || err != OB_REG_NOMATCH)) {
    /* unexpected error or wrong error */
    len = (int)ob_regerror(err, &re, erbuf, sizeof(erbuf));
    fprintf(stderr, "%d: %s exec error %s, %d/%d `%s'\n", line, type, eprint(err), len, (int)sizeof(erbuf), erbuf);
    status = 1;
  } else if (err != 0) {
    /* nothing more to check */
  } else if (f3 == NULL) {
    /* unexpected success */
    fprintf(stderr, "%d: %s exec should have failed\n", line, type);
    status = 1;
    err = 1; /* just on principle */
  } else if (opts & OB_REG_NOSUB) {
    /* nothing more to check */
  } else if ((grump = check(f2, subs[0], f3)) != NULL) {
    fprintf(stderr, "%d: %s %s\n", line, type, grump);
    status = 1;
    err = 1;
  }

  if (err != 0 || f4 == NULL) {
    ob_regfree(&re);
    return;
  }

  for (i = 1; i < NSHOULD; i++)
    should[i] = NULL;
  nshould = split(f4, should + 1, NSHOULD - 1, sep);
  if (nshould == 0) {
    nshould = 1;
    should[1][0] = '\0';
    // should[1] = "";
  }
  for (i = 1; i < NSUBS; i++) {
    grump = check(f2, subs[i], should[i]);
    if (grump != NULL) {
      fprintf(stderr, "%d: %s $%d %s\n", line, type, i, grump);
      status = 1;
      err = 1;
    }
  }

  ob_regfree(&re);
}

/*
 - options - pick options out of a regression-test string
 == int options(int type, char *s);
 */
int options(int type, /* 'c' compile, 'e' exec */
    char* s)
{
  register char* p;
  register int o = (type == 'c') ? copts : eopts;
  register const char* legal = (type == 'c') ? "bisnmp" : "^$#tl";

  for (p = s; *p != '\0'; p++)
    if (strchr(legal, *p) != NULL)
      switch (*p) {
        case 'b':
          o &= ~OB_REG_EXTENDED;
          break;
        case 'i':
          o |= OB_REG_ICASE;
          break;
        case 's':
          o |= OB_REG_NOSUB;
          break;
        case 'n':
          o |= OB_REG_NEWLINE;
          break;
        case 'm':
          o &= ~OB_REG_EXTENDED;
          o |= OB_REG_NOSPEC;
          break;
        case 'p':
          o |= OB_REG_PEND;
          break;
        case '^':
          o |= OB_REG_NOTBOL;
          break;
        case '$':
          o |= OB_REG_NOTEOL;
          break;
        case '#':
          o |= OB_REG_STARTEND;
          break;
        case 't': /* trace */
          o |= OB_REG_TRACE;
          break;
        case 'l': /* force long representation */
          o |= OB_REG_LARGE;
          break;
        case 'r': /* force backref use */
          o |= OB_REG_BACKR;
          break;
      }
  return (o);
}

/*
 - opt - is a particular option in a regression string?
 == int opt(int c, char *s);
 */
int /* predicate */
    opt(int c, char* s)
{
  return (strchr(s, c) != NULL);
}

/*
 - fixstr - transform magic characters in strings
 == void fixstr(register char *p);
 */
void fixstr(register char* p)
{
  if (p == NULL)
    return;

  for (; *p != '\0'; p++)
    if (*p == 'N')
      *p = '\n';
    else if (*p == 'T')
      *p = '\t';
    else if (*p == 'S')
      *p = ' ';
    else if (*p == 'Z')
      *p = '\0';
}

/*
 - check - check a substring match
 == char *check(char *str, ob_regmatch_t sub, char *should);
 */
char* /* NULL or complaint */
    check(char* str, ob_regmatch_t sub, char* should)
{
  register int len;
  register int shlen;
  register char* p;
  static char grump[500];
  register char* at = NULL;

  if (should != NULL && strcmp(should, "-") == 0)
    should = NULL;
  if (should != NULL && should[0] == '@') {
    at = should + 1;
    should[0] = '\0';
  }

  /* check rm_so and rm_eo for consistency */
  if (sub.rm_so > sub.rm_eo || (sub.rm_so == -1 && sub.rm_eo != -1) || (sub.rm_so != -1 && sub.rm_eo == -1) ||
      (sub.rm_so != -1 && sub.rm_so < 0) || (sub.rm_eo != -1 && sub.rm_eo < 0)) {
    sprintf(grump, "start %ld end %ld", (long)sub.rm_so, (long)sub.rm_eo);
    return (grump);
  }

  /* check for no match */
  if (sub.rm_so == -1 && should == NULL)
    return (NULL);
  if (sub.rm_so == -1) {
    static char ret_err_buf[50] = "did not match";
    return (ret_err_buf);
  }

  /* check for in range */
  if (sub.rm_eo > strlen(str)) {
    sprintf(grump, "start %ld end %ld, past end of string", (long)sub.rm_so, (long)sub.rm_eo);
    return (grump);
  }

  len = (int)(sub.rm_eo - sub.rm_so);
  shlen = (int)strlen(should);
  p = str + sub.rm_so;

  /* check for not supposed to match */
  if (should == NULL) {
    sprintf(grump, "matched `%.*s'", len, p);
    return (grump);
  }

  /* check for wrong match */
  if (len != shlen || strncmp(p, should, (size_t)shlen) != 0) {
    sprintf(grump, "matched `%.*s' instead", len, p);
    return (grump);
  }
  if (shlen > 0)
    return (NULL);

  /* check null match in right place */
  if (at == NULL)
    return (NULL);
  shlen = (int)strlen(at);
  if (shlen == 0)
    shlen = 1; /* force check for end-of-string */
  if (strncmp(p, at, shlen) != 0) {
    sprintf(grump, "matched null at `%.20s'", p);
    return (grump);
  }
  return (NULL);
}

/*
 - eprint - convert error number to name
 == static char *eprint(int err);
 */
static char* eprint(int err)
{
  static char epbuf[100];
  size_t len;

  len = ob_regerror(OB_REG_ITOA | err, (ob_regex_t*)NULL, epbuf, sizeof(epbuf));
  assert(len <= sizeof(epbuf));
  UNUSED(len);
  return (epbuf);
}

/*
 - efind - convert error name to number
 == static int efind(char *name);
 */
static int efind(char* name)
{
  static char efbuf[100];
  // size_t n;
  ob_regex_t re;

  sprintf(efbuf, "OB_REG_%s", name);
  assert(strlen(efbuf) < sizeof(efbuf));
  re.re_endp = efbuf;
  (void)ob_regerror(OB_REG_ATOI, &re, efbuf, sizeof(efbuf));
  return (atoi(efbuf));
}

class ObRegexTest : public ::testing::Test {
public:
  ObRegexTest();
  virtual ~ObRegexTest();
  virtual void SetUp();
  virtual void TearDown();

private:
  // disallow copy
  ObRegexTest(const ObRegexTest& other);
  ObRegexTest& operator=(const ObRegexTest& other);

private:
  // data members
};
ObRegexTest::ObRegexTest()
{}

ObRegexTest::~ObRegexTest()
{}

void ObRegexTest::SetUp()
{}

void ObRegexTest::TearDown()
{}

TEST_F(ObRegexTest, basic_test)
{
  /*
     - main - do the simple case, hand off to regress() for regression
     */
  char test_filename[] = "regex/tests";
  FILE* test_file = fopen(test_filename, "r");
  if (NULL == test_file) {
    fprintf(stderr, "fail to open file '%s'\n", test_filename);
  } else {
    ASSERT_TRUE(regress(test_file));
    fclose(test_file);
  }
}

int main(int argc, char** argv)
{
  OB_LOGGER.set_log_level("INFO");
  // ob_init_memory_pool();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}