!1753 修复正则表达式issue

Merge pull request !1753 from 仲夏十三/regex
2022-06-15 06:54:05 +00:00
parent 6f1ddebb78 d78b1348a3
commit 06a4a632e4
5 changed files with 141 additions and 62 deletions
--- a/src/common/backend/regex/regc_nfa.cpp
+++ b/src/common/backend/regex/regc_nfa.cpp
@ -705,7 +705,7 @@ static void moveins(struct nfa* nfa, struct state* oldState, struct state* newSt
 *
 * Either all arcs, or only non-empty ones as determined by all value.
 */
-static void copyins(struct nfa* nfa, struct state* oldState, struct state* newState, int all)
+static void copyins(struct nfa* nfa, struct state* oldState, struct state* newState)
 {

    Assert(oldState != newState);
@ -714,9 +714,9 @@ static void copyins(struct nfa* nfa, struct state* oldState, struct state* newSt
        /* With not too many arcs, just do them one at a time */
        struct arc* a = NULL;

-        for (a = oldState->ins; a != NULL; a = a->inchain)
-            if (all || a->type != EMPTY)
-                cparc(nfa, a, a->from, newState);
+        for (a = oldState->ins; a != NULL; a = a->inchain) {
+            cparc(nfa, a, a->from, newState);
+        }
    } else {
        /*
         * With many arcs, use a sort-merge approach.  Note that createarc()
@ -735,10 +735,6 @@ static void copyins(struct nfa* nfa, struct state* oldState, struct state* newSt
        while (oa != NULL && na != NULL) {
            struct arc* a = oa;

-            if (!all && a->type == EMPTY) {
-                oa = oa->inchain;
-                continue;
-            }

            switch (sortins_cmp(&oa, &na)) {
                case -1:
@ -763,11 +759,6 @@ static void copyins(struct nfa* nfa, struct state* oldState, struct state* newSt
            /* newState does not have anything matching oa */
            struct arc* a = oa;

-            if (!all && a->type == EMPTY) {
-                oa = oa->inchain;
-                continue;
-            }
-
            oa = oa->inchain;
            createarc(nfa, a->type, a->co, a->from, newState);
        }
@ -931,7 +922,7 @@ static void moveouts(struct nfa* nfa, struct state* oldState, struct state* newS
 *
 * Either all arcs, or only non-empty ones as determined by all value.
 */
-static void copyouts(struct nfa* nfa, struct state* oldState, struct state* newState, int all)
+static void copyouts(struct nfa* nfa, struct state* oldState, struct state* newState)
 {
    Assert(oldState != newState);

@ -939,9 +930,9 @@ static void copyouts(struct nfa* nfa, struct state* oldState, struct state* newS
        /* With not too many arcs, just do them one at a time */
        struct arc* a = NULL;

-        for (a = oldState->outs; a != NULL; a = a->outchain)
-            if (all || a->type != EMPTY)
-                cparc(nfa, a, newState, a->to);
+        for (a = oldState->outs; a != NULL; a = a->outchain) {
+            cparc(nfa, a, newState, a->to);
+        }
    } else {
        /*
         * With many arcs, use a sort-merge approach.  Note that createarc()
@ -960,11 +951,6 @@ static void copyouts(struct nfa* nfa, struct state* oldState, struct state* newS
        while (oa != NULL && na != NULL) {
            struct arc* a = oa;

-            if (!all && a->type == EMPTY) {
-                oa = oa->outchain;
-                continue;
-            }
-
            switch (sortouts_cmp(&oa, &na)) {
                case -1:
                    /* newState does not have anything matching oa */
@ -988,11 +974,6 @@ static void copyouts(struct nfa* nfa, struct state* oldState, struct state* newS
            /* newState does not have anything matching oa */
            struct arc* a = oa;

-            if (!all && a->type == EMPTY) {
-                oa = oa->outchain;
-                continue;
-            }
-
            oa = oa->outchain;
            createarc(nfa, a->type, a->co, newState, a->to);
        }
@ -1262,6 +1243,10 @@ static long                            /* re_info bits */
        fprintf(f, "\nfinal cleanup:\n");
 #endif
    cleanup(nfa);        /* final tidying */
+#ifdef REG_DEBUG
+    if (verbose)
+        dumpnfa(nfa, f);
+#endif
    return analyze(nfa); /* and analysis */
 }

@ -1274,6 +1259,7 @@ static void pullback(struct nfa* nfa, FILE* f) /* for debug output; NULL none */
    struct state* nexts = NULL;
    struct arc* a = NULL;
    struct arc* nexta = NULL;
+    struct state* intermediates;
    int progress;

    /* find and pull until there are no more */
@ -1281,13 +1267,23 @@ static void pullback(struct nfa* nfa, FILE* f) /* for debug output; NULL none */
        progress = 0;
        for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
            nexts = s->next;
+            intermediates = NULL;
            for (a = s->outs; a != NULL && !NISERR(); a = nexta) {
                nexta = a->outchain;
                if (a->type == '^' || a->type == BEHIND)
-                    if (pull(nfa, a))
+                    if (pull(nfa, a, &intermediates))
                        progress = 1;
-                Assert(nexta == NULL || s->no != FREESTATE);
            }
+            /* clear tmp fields of intermediate states created here */
+            while (intermediates != NULL) {
+                struct state* ns = intermediates->tmp;
+
+                intermediates->tmp = NULL;
+                intermediates = ns;
+            }
+            /* if s is now useless, get rid of it */
+            if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+                dropstate(nfa, s);
        }
        if (progress && f != NULL)
            dumpnfa(nfa, f);
@ -1318,7 +1314,7 @@ static void pullback(struct nfa* nfa, FILE* f) /* for debug output; NULL none */
 * was that state's last outarc.
 */
 static int /* 0 couldn't, 1 could */
-    pull(struct nfa* nfa, struct arc* con)
+    pull(struct nfa* nfa, struct arc* con, struct state** intermediates)
 {
    struct state* from = con->from;
    struct state* to = con->to;
@ -1339,16 +1335,18 @@ static int /* 0 couldn't, 1 could */
        s = newstate(nfa);
        if (NISERR())
            return 0;
-        copyins(nfa, from, s, 1); /* duplicate inarcs */
-        cparc(nfa, con, s, to);   /* move constraint arc */
+        copyins(nfa, from, s);  /* duplicate inarcs */
+        cparc(nfa, con, s, to); /* move constraint arc */
        freearc(nfa, con);
+        if (NISERR())
+            return 0;
        from = s;
        con = from->outs;
    }
    Assert(from->nouts == 1);

    /* propagate the constraint into the from state's inarcs */
-    for (a = from->ins; a != NULL; a = nexta) {
+    for (a = from->ins; a != NULL && !NISERR(); a = nexta) {
        nexta = a->inchain;
        switch (combine(con, a)) {
            case INCOMPATIBLE: /* destroy the arc */
@ -1357,13 +1355,21 @@ static int /* 0 couldn't, 1 could */
            case SATISFIED: /* no action needed */
                break;
            case COMPATIBLE: /* swap the two arcs, more or less */
-                s = newstate(nfa);
-                if (NISERR())
-                    return 0;
-                cparc(nfa, a, s, to); /* anticipate move */
+                /* need an intermediate state, but might have one already */
+                for (s = *intermediates; s != NULL; s = s->tmp) {
+                    assert(s->nins > 0 && s->nouts > 0);
+                    if (s->ins->from == a->from && s->outs->to == to)
+                        break;
+                }
+                if (s == NULL) {
+                    s = newstate(nfa);
+                    if (NISERR())
+                        return 0;
+                    s->tmp = *intermediates;
+                    *intermediates = s;
+                }
                cparc(nfa, con, a->from, s);
-                if (NISERR())
-                    return 0;
+                cparc(nfa, a, s, to);
                freearc(nfa, a);
                break;
            default:
@ -1374,7 +1380,7 @@ static int /* 0 couldn't, 1 could */

    /* remaining inarcs, if any, incorporate the constraint */
    moveins(nfa, from, to);
-    dropstate(nfa, from); /* will free the constraint */
+    freearc(nfa, con);
    return 1;
 }

@ -1387,6 +1393,7 @@ static void pushfwd(struct nfa* nfa, FILE* f) /* for debug output; NULL none */
    struct state* nexts = NULL;
    struct arc* a = NULL;
    struct arc* nexta = NULL;
+    struct state* intermediates;
    int progress;

    /* find and push until there are no more */
@ -1394,13 +1401,23 @@ static void pushfwd(struct nfa* nfa, FILE* f) /* for debug output; NULL none */
        progress = 0;
        for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
            nexts = s->next;
+            intermediates = NULL;
            for (a = s->ins; a != NULL && !NISERR(); a = nexta) {
                nexta = a->inchain;
                if (a->type == '$' || a->type == AHEAD)
-                    if (push(nfa, a))
+                    if (push(nfa, a, &intermediates))
                        progress = 1;
-                Assert(nexta == NULL || s->no != FREESTATE);
            }
+            /* clear tmp fields of intermediate states created here */
+            while (intermediates != NULL) {
+                struct state* ns = intermediates->tmp;
+
+                intermediates->tmp = NULL;
+                intermediates = ns;
+            }
+            /* if s is now useless, get rid of it */
+            if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+                dropstate(nfa, s);
        }
        if (progress && f != NULL)
            dumpnfa(nfa, f);
@ -1431,7 +1448,7 @@ static void pushfwd(struct nfa* nfa, FILE* f) /* for debug output; NULL none */
 * was that state's last inarc.
 */
 static int /* 0 couldn't, 1 could */
-    push(struct nfa* nfa, struct arc* con)
+    push(struct nfa* nfa, struct arc* con, struct state** intermediates)
 {
    struct state* from = con->from;
    struct state* to = con->to;
@ -1452,16 +1469,18 @@ static int /* 0 couldn't, 1 could */
        s = newstate(nfa);
        if (NISERR())
            return 0;
-        copyouts(nfa, to, s, 1);  /* duplicate outarcs */
+        copyouts(nfa, to, s);  /* duplicate outarcs */
        cparc(nfa, con, from, s); /* move constraint */
        freearc(nfa, con);
+        if (NISERR())
+            return 0;
        to = s;
        con = to->ins;
    }
    Assert(to->nins == 1);

    /* propagate the constraint into the to state's outarcs */
-    for (a = to->outs; a != NULL; a = nexta) {
+    for (a = to->outs; a != NULL && !NISERR(); a = nexta) {
        nexta = a->outchain;
        switch (combine(con, a)) {
            case INCOMPATIBLE: /* destroy the arc */
@ -1470,13 +1489,21 @@ static int /* 0 couldn't, 1 could */
            case SATISFIED: /* no action needed */
                break;
            case COMPATIBLE: /* swap the two arcs, more or less */
-                s = newstate(nfa);
-                if (NISERR())
-                    return 0;
-                cparc(nfa, con, s, a->to); /* anticipate move */
+                /* need an intermediate state, but might have one already */
+                for (s = *intermediates; s != NULL; s = s->tmp) {
+                    assert(s->nins > 0 && s->nouts > 0);
+                    if (s->ins->from == from && s->outs->to == a->to)
+                        break;
+                }
+                if (s == NULL) {
+                    s = newstate(nfa);
+                    if (NISERR())
+                        return 0;
+                    s->tmp = *intermediates;
+                    *intermediates = s;
+                }
+                cparc(nfa, con, s, a->to);
                cparc(nfa, a, from, s);
-                if (NISERR())
-                    return 0;
                freearc(nfa, a);
                break;
            default:
@ -1487,7 +1514,7 @@ static int /* 0 couldn't, 1 could */

    /* remaining outarcs, if any, incorporate the constraint */
    moveouts(nfa, to, from);
-    dropstate(nfa, to); /* will free the constraint */
+    freearc(nfa, con);
    return 1;
 }

@ -2543,6 +2570,8 @@ static void dumpnfa(struct nfa* nfa, FILE* f)
 {
 #ifdef REG_DEBUG
    struct state* s = NULL;
+    int nstates = 0;
+    int narcs = 0;

    fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
    if (nfa->bos[0] != COLORLESS)
@ -2554,8 +2583,12 @@ static void dumpnfa(struct nfa* nfa, FILE* f)
    if (nfa->eos[1] != COLORLESS)
        fprintf(f, ", eol [%ld]", (long)nfa->eos[1]);
    fprintf(f, "\n");
-    for (s = nfa->states; s != NULL; s = s->next)
+    for (s = nfa->states; s != NULL; s = s->next) {
        dumpstate(s, f);
+        nstates++;
+        narcs += s->nouts;
+    }
+    fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
    if (nfa->parent == NULL)
        dumpcolors(nfa->cm, f);
    fflush(f);
--- a/src/common/backend/regex/regcomp.cpp
+++ b/src/common/backend/regex/regcomp.cpp
@ -135,10 +135,10 @@ static int sortins_cmp(const void*, const void*);
 static void sortouts(struct nfa*, struct state*);
 static int sortouts_cmp(const void*, const void*);
 static void moveins(struct nfa*, struct state*, struct state*);
-static void copyins(struct nfa*, struct state*, struct state*, int);
+static void copyins(struct nfa*, struct state*, struct state*);
 static void mergeins(struct nfa*, struct state*, struct arc**, int);
 static void moveouts(struct nfa*, struct state*, struct state*);
-static void copyouts(struct nfa*, struct state*, struct state*, int);
+static void copyouts(struct nfa*, struct state*, struct state*);
 static void cloneouts(struct nfa*, struct state*, struct state*, struct state*, int);
 static void delsub(struct nfa*, struct state*, struct state*);
 static void deltraverse(struct nfa*, struct state*, struct state*);
@ -149,9 +149,9 @@ static struct state* single_color_transition(struct state*, struct state*);
 static void specialcolors(struct nfa*);
 static long optimize(struct nfa*, FILE*);
 static void pullback(struct nfa*, FILE*);
-static int pull(struct nfa*, struct arc*);
+static int pull(struct nfa *, struct arc *, struct state **);
 static void pushfwd(struct nfa*, FILE*);
-static int push(struct nfa*, struct arc*);
+static int push(struct nfa *, struct arc *, struct state **);

 #define INCOMPATIBLE 1 /* destroys arc */
 #define SATISFIED 2    /* constraint satisfied */
@ -179,7 +179,6 @@ static void dumpnfa(struct nfa*, FILE*);
 #ifdef REG_DEBUG
 static void dumpstate(struct state*, FILE*);
 static void dumparcs(struct state*, FILE*);
-static int dumprarcs(struct arc*, struct state*, FILE*, int);
 static void dumparc(struct arc*, struct state*, FILE*);
 static void dumpcnfa(struct cnfa*, FILE*);
 static void dumpcstate(int, struct cnfa*, FILE*);
@ -597,7 +596,9 @@ static void makesearch(struct vars* v, struct nfa* nfa)
    /* do the splits */
    for (s = slist; s != NULL; s = s2) {
        s2 = newstate(nfa);
-        copyouts(nfa, s, s2, 1);
+        NOERR();
+        copyouts(nfa, s, s2);
+        NOERR();
        for (a = s->ins; a != NULL; a = b) {
            b = a->inchain;
            if (a->from != pre) {
@ -1730,7 +1731,7 @@ static void cleanst(struct vars* v)

 /*
 * nfatree - turn a subRE subtree into a tree of compacted NFAs
- * f��for debug output
+ * f: for debug output
 * return optimize results from top node
 */
 static long nfatree(struct vars* v, struct subre* t, FILE* f)
@ -1895,7 +1896,7 @@ static void dump(regex_t* re, FILE* f)

    dumpcolors(&g->cmap, f);
    if (!NULLCNFA(g->search)) {
-        printf("\nsearch:\n");
+        fprintf(f, "\nsearch:\n");
        dumpcnfa(&g->search, f);
    }
    for (i = 1; i < g->nlacons; i++) {
--- a/src/test/regress/expected/regex2.out
+++ b/src/test/regress/expected/regex2.out
@ -0,0 +1,35 @@
+-- These cases used to give too-many-states failures
+select 'x' ~ 'abcd(\m)+xyz';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'a' ~ '^abcd*(((((^(a c(e?d)a+|)+|)+|)+|)+|a)+|)';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'x' ~ 'a^(^)bcd*xy(((((($a+|)+|)+|)+$|)+|)+|)^$';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'x' ~ 'xyz(\Y\Y)+';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'x' ~ 'x|(?:\M)+';
+ ?column? 
+----------
+ t
+(1 row)
+
+-- This generates O(N) states but O(N^2) arcs, so it causes problems
+-- if arc count is not constrained
+select 'x' ~ repeat('x*y*z*', 1000);
+ERROR:  invalid regular expression: regular expression is too complex
--- a/src/test/regress/parallel_schedule0
+++ b/src/test/regress/parallel_schedule0
@ -687,7 +687,7 @@ test: hw_to_timestamp hw_view_privilege
 test: hw_identifier
 #test: hw_hashint1 hw_smalldatetime_hash hw_rawtype_hash
 #test: hw_nvarchar2_hash cmpr_smallint cmpr_prefix_150left cmpr_uint32_oid
-test: oidjoins opr_sanity_2 regex
+test: oidjoins opr_sanity_2 regex regex2
 #test: opr_sanity_1

 test: pmk
--- a/src/test/regress/sql/regex2.sql
+++ b/src/test/regress/sql/regex2.sql
@ -0,0 +1,10 @@
+-- These cases used to give too-many-states failures
+select 'x' ~ 'abcd(\m)+xyz';
+select 'a' ~ '^abcd*(((((^(a c(e?d)a+|)+|)+|)+|)+|a)+|)';
+select 'x' ~ 'a^(^)bcd*xy(((((($a+|)+|)+|)+$|)+|)+|)^$';
+select 'x' ~ 'xyz(\Y\Y)+';
+select 'x' ~ 'x|(?:\M)+';
+
+-- This generates O(N) states but O(N^2) arcs, so it causes problems
+-- if arc count is not constrained
+select 'x' ~ repeat('x*y*z*', 1000);