From bddf4cedd0d9ea8c215301579649f4a984bd9d48 Mon Sep 17 00:00:00 2001 From: ganyang Date: Wed, 15 Jun 2022 10:55:50 +0800 Subject: [PATCH] fix regex error --- src/common/backend/regex/regc_nfa.cpp | 139 ++++++++++++++++---------- src/common/backend/regex/regcomp.cpp | 17 ++-- src/test/regress/expected/regex2.out | 35 +++++++ src/test/regress/parallel_schedule0 | 4 +- src/test/regress/sql/regex2.sql | 10 ++ 5 files changed, 142 insertions(+), 63 deletions(-) create mode 100644 src/test/regress/expected/regex2.out create mode 100644 src/test/regress/sql/regex2.sql diff --git a/src/common/backend/regex/regc_nfa.cpp b/src/common/backend/regex/regc_nfa.cpp index 8bc066875..36ea7a64a 100644 --- a/src/common/backend/regex/regc_nfa.cpp +++ b/src/common/backend/regex/regc_nfa.cpp @@ -705,7 +705,7 @@ static void moveins(struct nfa* nfa, struct state* oldState, struct state* newSt * * Either all arcs, or only non-empty ones as determined by all value. */ -static void copyins(struct nfa* nfa, struct state* oldState, struct state* newState, int all) +static void copyins(struct nfa* nfa, struct state* oldState, struct state* newState) { Assert(oldState != newState); @@ -714,9 +714,9 @@ static void copyins(struct nfa* nfa, struct state* oldState, struct state* newSt /* With not too many arcs, just do them one at a time */ struct arc* a = NULL; - for (a = oldState->ins; a != NULL; a = a->inchain) - if (all || a->type != EMPTY) - cparc(nfa, a, a->from, newState); + for (a = oldState->ins; a != NULL; a = a->inchain) { + cparc(nfa, a, a->from, newState); + } } else { /* * With many arcs, use a sort-merge approach. Note that createarc() @@ -735,10 +735,6 @@ static void copyins(struct nfa* nfa, struct state* oldState, struct state* newSt while (oa != NULL && na != NULL) { struct arc* a = oa; - if (!all && a->type == EMPTY) { - oa = oa->inchain; - continue; - } switch (sortins_cmp(&oa, &na)) { case -1: @@ -763,11 +759,6 @@ static void copyins(struct nfa* nfa, struct state* oldState, struct state* newSt /* newState does not have anything matching oa */ struct arc* a = oa; - if (!all && a->type == EMPTY) { - oa = oa->inchain; - continue; - } - oa = oa->inchain; createarc(nfa, a->type, a->co, a->from, newState); } @@ -931,7 +922,7 @@ static void moveouts(struct nfa* nfa, struct state* oldState, struct state* newS * * Either all arcs, or only non-empty ones as determined by all value. */ -static void copyouts(struct nfa* nfa, struct state* oldState, struct state* newState, int all) +static void copyouts(struct nfa* nfa, struct state* oldState, struct state* newState) { Assert(oldState != newState); @@ -939,9 +930,9 @@ static void copyouts(struct nfa* nfa, struct state* oldState, struct state* newS /* With not too many arcs, just do them one at a time */ struct arc* a = NULL; - for (a = oldState->outs; a != NULL; a = a->outchain) - if (all || a->type != EMPTY) - cparc(nfa, a, newState, a->to); + for (a = oldState->outs; a != NULL; a = a->outchain) { + cparc(nfa, a, newState, a->to); + } } else { /* * With many arcs, use a sort-merge approach. Note that createarc() @@ -960,11 +951,6 @@ static void copyouts(struct nfa* nfa, struct state* oldState, struct state* newS while (oa != NULL && na != NULL) { struct arc* a = oa; - if (!all && a->type == EMPTY) { - oa = oa->outchain; - continue; - } - switch (sortouts_cmp(&oa, &na)) { case -1: /* newState does not have anything matching oa */ @@ -988,11 +974,6 @@ static void copyouts(struct nfa* nfa, struct state* oldState, struct state* newS /* newState does not have anything matching oa */ struct arc* a = oa; - if (!all && a->type == EMPTY) { - oa = oa->outchain; - continue; - } - oa = oa->outchain; createarc(nfa, a->type, a->co, newState, a->to); } @@ -1262,6 +1243,10 @@ static long /* re_info bits */ fprintf(f, "\nfinal cleanup:\n"); #endif cleanup(nfa); /* final tidying */ +#ifdef REG_DEBUG + if (verbose) + dumpnfa(nfa, f); +#endif return analyze(nfa); /* and analysis */ } @@ -1274,6 +1259,7 @@ static void pullback(struct nfa* nfa, FILE* f) /* for debug output; NULL none */ struct state* nexts = NULL; struct arc* a = NULL; struct arc* nexta = NULL; + struct state* intermediates; int progress; /* find and pull until there are no more */ @@ -1281,13 +1267,23 @@ static void pullback(struct nfa* nfa, FILE* f) /* for debug output; NULL none */ progress = 0; for (s = nfa->states; s != NULL && !NISERR(); s = nexts) { nexts = s->next; + intermediates = NULL; for (a = s->outs; a != NULL && !NISERR(); a = nexta) { nexta = a->outchain; if (a->type == '^' || a->type == BEHIND) - if (pull(nfa, a)) + if (pull(nfa, a, &intermediates)) progress = 1; - Assert(nexta == NULL || s->no != FREESTATE); } + /* clear tmp fields of intermediate states created here */ + while (intermediates != NULL) { + struct state* ns = intermediates->tmp; + + intermediates->tmp = NULL; + intermediates = ns; + } + /* if s is now useless, get rid of it */ + if ((s->nins == 0 || s->nouts == 0) && !s->flag) + dropstate(nfa, s); } if (progress && f != NULL) dumpnfa(nfa, f); @@ -1318,7 +1314,7 @@ static void pullback(struct nfa* nfa, FILE* f) /* for debug output; NULL none */ * was that state's last outarc. */ static int /* 0 couldn't, 1 could */ - pull(struct nfa* nfa, struct arc* con) + pull(struct nfa* nfa, struct arc* con, struct state** intermediates) { struct state* from = con->from; struct state* to = con->to; @@ -1339,16 +1335,18 @@ static int /* 0 couldn't, 1 could */ s = newstate(nfa); if (NISERR()) return 0; - copyins(nfa, from, s, 1); /* duplicate inarcs */ - cparc(nfa, con, s, to); /* move constraint arc */ + copyins(nfa, from, s); /* duplicate inarcs */ + cparc(nfa, con, s, to); /* move constraint arc */ freearc(nfa, con); + if (NISERR()) + return 0; from = s; con = from->outs; } Assert(from->nouts == 1); /* propagate the constraint into the from state's inarcs */ - for (a = from->ins; a != NULL; a = nexta) { + for (a = from->ins; a != NULL && !NISERR(); a = nexta) { nexta = a->inchain; switch (combine(con, a)) { case INCOMPATIBLE: /* destroy the arc */ @@ -1357,13 +1355,21 @@ static int /* 0 couldn't, 1 could */ case SATISFIED: /* no action needed */ break; case COMPATIBLE: /* swap the two arcs, more or less */ - s = newstate(nfa); - if (NISERR()) - return 0; - cparc(nfa, a, s, to); /* anticipate move */ + /* need an intermediate state, but might have one already */ + for (s = *intermediates; s != NULL; s = s->tmp) { + assert(s->nins > 0 && s->nouts > 0); + if (s->ins->from == a->from && s->outs->to == to) + break; + } + if (s == NULL) { + s = newstate(nfa); + if (NISERR()) + return 0; + s->tmp = *intermediates; + *intermediates = s; + } cparc(nfa, con, a->from, s); - if (NISERR()) - return 0; + cparc(nfa, a, s, to); freearc(nfa, a); break; default: @@ -1374,7 +1380,7 @@ static int /* 0 couldn't, 1 could */ /* remaining inarcs, if any, incorporate the constraint */ moveins(nfa, from, to); - dropstate(nfa, from); /* will free the constraint */ + freearc(nfa, con); return 1; } @@ -1387,6 +1393,7 @@ static void pushfwd(struct nfa* nfa, FILE* f) /* for debug output; NULL none */ struct state* nexts = NULL; struct arc* a = NULL; struct arc* nexta = NULL; + struct state* intermediates; int progress; /* find and push until there are no more */ @@ -1394,13 +1401,23 @@ static void pushfwd(struct nfa* nfa, FILE* f) /* for debug output; NULL none */ progress = 0; for (s = nfa->states; s != NULL && !NISERR(); s = nexts) { nexts = s->next; + intermediates = NULL; for (a = s->ins; a != NULL && !NISERR(); a = nexta) { nexta = a->inchain; if (a->type == '$' || a->type == AHEAD) - if (push(nfa, a)) + if (push(nfa, a, &intermediates)) progress = 1; - Assert(nexta == NULL || s->no != FREESTATE); } + /* clear tmp fields of intermediate states created here */ + while (intermediates != NULL) { + struct state* ns = intermediates->tmp; + + intermediates->tmp = NULL; + intermediates = ns; + } + /* if s is now useless, get rid of it */ + if ((s->nins == 0 || s->nouts == 0) && !s->flag) + dropstate(nfa, s); } if (progress && f != NULL) dumpnfa(nfa, f); @@ -1431,7 +1448,7 @@ static void pushfwd(struct nfa* nfa, FILE* f) /* for debug output; NULL none */ * was that state's last inarc. */ static int /* 0 couldn't, 1 could */ - push(struct nfa* nfa, struct arc* con) + push(struct nfa* nfa, struct arc* con, struct state** intermediates) { struct state* from = con->from; struct state* to = con->to; @@ -1452,16 +1469,18 @@ static int /* 0 couldn't, 1 could */ s = newstate(nfa); if (NISERR()) return 0; - copyouts(nfa, to, s, 1); /* duplicate outarcs */ + copyouts(nfa, to, s); /* duplicate outarcs */ cparc(nfa, con, from, s); /* move constraint */ freearc(nfa, con); + if (NISERR()) + return 0; to = s; con = to->ins; } Assert(to->nins == 1); /* propagate the constraint into the to state's outarcs */ - for (a = to->outs; a != NULL; a = nexta) { + for (a = to->outs; a != NULL && !NISERR(); a = nexta) { nexta = a->outchain; switch (combine(con, a)) { case INCOMPATIBLE: /* destroy the arc */ @@ -1470,13 +1489,21 @@ static int /* 0 couldn't, 1 could */ case SATISFIED: /* no action needed */ break; case COMPATIBLE: /* swap the two arcs, more or less */ - s = newstate(nfa); - if (NISERR()) - return 0; - cparc(nfa, con, s, a->to); /* anticipate move */ + /* need an intermediate state, but might have one already */ + for (s = *intermediates; s != NULL; s = s->tmp) { + assert(s->nins > 0 && s->nouts > 0); + if (s->ins->from == from && s->outs->to == a->to) + break; + } + if (s == NULL) { + s = newstate(nfa); + if (NISERR()) + return 0; + s->tmp = *intermediates; + *intermediates = s; + } + cparc(nfa, con, s, a->to); cparc(nfa, a, from, s); - if (NISERR()) - return 0; freearc(nfa, a); break; default: @@ -1487,7 +1514,7 @@ static int /* 0 couldn't, 1 could */ /* remaining outarcs, if any, incorporate the constraint */ moveouts(nfa, to, from); - dropstate(nfa, to); /* will free the constraint */ + freearc(nfa, con); return 1; } @@ -2543,6 +2570,8 @@ static void dumpnfa(struct nfa* nfa, FILE* f) { #ifdef REG_DEBUG struct state* s = NULL; + int nstates = 0; + int narcs = 0; fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no); if (nfa->bos[0] != COLORLESS) @@ -2554,8 +2583,12 @@ static void dumpnfa(struct nfa* nfa, FILE* f) if (nfa->eos[1] != COLORLESS) fprintf(f, ", eol [%ld]", (long)nfa->eos[1]); fprintf(f, "\n"); - for (s = nfa->states; s != NULL; s = s->next) + for (s = nfa->states; s != NULL; s = s->next) { dumpstate(s, f); + nstates++; + narcs += s->nouts; + } + fprintf(f, "total of %d states, %d arcs\n", nstates, narcs); if (nfa->parent == NULL) dumpcolors(nfa->cm, f); fflush(f); diff --git a/src/common/backend/regex/regcomp.cpp b/src/common/backend/regex/regcomp.cpp index b58bd1019..25bc13f7e 100644 --- a/src/common/backend/regex/regcomp.cpp +++ b/src/common/backend/regex/regcomp.cpp @@ -135,10 +135,10 @@ static int sortins_cmp(const void*, const void*); static void sortouts(struct nfa*, struct state*); static int sortouts_cmp(const void*, const void*); static void moveins(struct nfa*, struct state*, struct state*); -static void copyins(struct nfa*, struct state*, struct state*, int); +static void copyins(struct nfa*, struct state*, struct state*); static void mergeins(struct nfa*, struct state*, struct arc**, int); static void moveouts(struct nfa*, struct state*, struct state*); -static void copyouts(struct nfa*, struct state*, struct state*, int); +static void copyouts(struct nfa*, struct state*, struct state*); static void cloneouts(struct nfa*, struct state*, struct state*, struct state*, int); static void delsub(struct nfa*, struct state*, struct state*); static void deltraverse(struct nfa*, struct state*, struct state*); @@ -149,9 +149,9 @@ static struct state* single_color_transition(struct state*, struct state*); static void specialcolors(struct nfa*); static long optimize(struct nfa*, FILE*); static void pullback(struct nfa*, FILE*); -static int pull(struct nfa*, struct arc*); +static int pull(struct nfa *, struct arc *, struct state **); static void pushfwd(struct nfa*, FILE*); -static int push(struct nfa*, struct arc*); +static int push(struct nfa *, struct arc *, struct state **); #define INCOMPATIBLE 1 /* destroys arc */ #define SATISFIED 2 /* constraint satisfied */ @@ -179,7 +179,6 @@ static void dumpnfa(struct nfa*, FILE*); #ifdef REG_DEBUG static void dumpstate(struct state*, FILE*); static void dumparcs(struct state*, FILE*); -static int dumprarcs(struct arc*, struct state*, FILE*, int); static void dumparc(struct arc*, struct state*, FILE*); static void dumpcnfa(struct cnfa*, FILE*); static void dumpcstate(int, struct cnfa*, FILE*); @@ -597,7 +596,9 @@ static void makesearch(struct vars* v, struct nfa* nfa) /* do the splits */ for (s = slist; s != NULL; s = s2) { s2 = newstate(nfa); - copyouts(nfa, s, s2, 1); + NOERR(); + copyouts(nfa, s, s2); + NOERR(); for (a = s->ins; a != NULL; a = b) { b = a->inchain; if (a->from != pre) { @@ -1730,7 +1731,7 @@ static void cleanst(struct vars* v) /* * nfatree - turn a subRE subtree into a tree of compacted NFAs - * f£ºfor debug output + * for debug output * return optimize results from top node */ static long nfatree(struct vars* v, struct subre* t, FILE* f) @@ -1895,7 +1896,7 @@ static void dump(regex_t* re, FILE* f) dumpcolors(&g->cmap, f); if (!NULLCNFA(g->search)) { - printf("\nsearch:\n"); + fprintf(f, "\nsearch:\n"); dumpcnfa(&g->search, f); } for (i = 1; i < g->nlacons; i++) { diff --git a/src/test/regress/expected/regex2.out b/src/test/regress/expected/regex2.out new file mode 100644 index 000000000..8155b4516 --- /dev/null +++ b/src/test/regress/expected/regex2.out @@ -0,0 +1,35 @@ +-- These cases used to give too-many-states failures +select 'x' ~ 'abcd(\m)+xyz'; + ?column? +---------- + f +(1 row) + +select 'a' ~ '^abcd*(((((^(a c(e?d)a+|)+|)+|)+|)+|a)+|)'; + ?column? +---------- + f +(1 row) + +select 'x' ~ 'a^(^)bcd*xy(((((($a+|)+|)+|)+$|)+|)+|)^$'; + ?column? +---------- + f +(1 row) + +select 'x' ~ 'xyz(\Y\Y)+'; + ?column? +---------- + f +(1 row) + +select 'x' ~ 'x|(?:\M)+'; + ?column? +---------- + t +(1 row) + +-- This generates O(N) states but O(N^2) arcs, so it causes problems +-- if arc count is not constrained +select 'x' ~ repeat('x*y*z*', 1000); +ERROR: invalid regular expression: regular expression is too complex diff --git a/src/test/regress/parallel_schedule0 b/src/test/regress/parallel_schedule0 index 52dc78697..7a0b5e328 100644 --- a/src/test/regress/parallel_schedule0 +++ b/src/test/regress/parallel_schedule0 @@ -687,7 +687,7 @@ test: hw_to_timestamp hw_view_privilege test: hw_identifier #test: hw_hashint1 hw_smalldatetime_hash hw_rawtype_hash #test: hw_nvarchar2_hash cmpr_smallint cmpr_prefix_150left cmpr_uint32_oid -test: oidjoins opr_sanity_2 regex +test: oidjoins opr_sanity_2 regex regex2 #test: opr_sanity_1 test: pmk @@ -915,4 +915,4 @@ test: subscription test: fdw_audit test: gs_global_config_audit test: detail -test: gs_dump_encrypt +test: gs_dump_encrypt \ No newline at end of file diff --git a/src/test/regress/sql/regex2.sql b/src/test/regress/sql/regex2.sql new file mode 100644 index 000000000..333f780ed --- /dev/null +++ b/src/test/regress/sql/regex2.sql @@ -0,0 +1,10 @@ +-- These cases used to give too-many-states failures +select 'x' ~ 'abcd(\m)+xyz'; +select 'a' ~ '^abcd*(((((^(a c(e?d)a+|)+|)+|)+|)+|a)+|)'; +select 'x' ~ 'a^(^)bcd*xy(((((($a+|)+|)+|)+$|)+|)+|)^$'; +select 'x' ~ 'xyz(\Y\Y)+'; +select 'x' ~ 'x|(?:\M)+'; + +-- This generates O(N) states but O(N^2) arcs, so it causes problems +-- if arc count is not constrained +select 'x' ~ repeat('x*y*z*', 1000); \ No newline at end of file