fixed c385b14 from https://gitee.com/dodders/openGauss-server/pulls/3151

增加正则匹配支持多字节字符的功能 (Add multi-byte escape character support to SIMILAR TO ... ESCAPE regex matching)
2023-03-15 21:16:31 +08:00
parent 3d3de895c6
commit a815e1fed5
3 changed files with 95 additions and 7 deletions
--- a/src/common/backend/utils/adt/regexp.cpp
+++ b/src/common/backend/utils/adt/regexp.cpp
@@ -777,13 +777,17 @@ Datum similar_escape(PG_FUNCTION_ARGS)
        esc_text = PG_GETARG_TEXT_PP(1);
        e = VARDATA_ANY(esc_text);
        elen = VARSIZE_ANY_EXHDR(esc_text);
-        if (elen == 0)
+        if (elen == 0) {
            e = NULL; /* no escape character */
-        else if (elen != 1)
-            ereport(ERROR,
-                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
-                    errmsg("invalid escape string"),
-                    errhint("Escape string must be empty or one character.")));
+        } else if (elen > 1) {
+            int	escape_mblen = pg_mbstrlen_with_len(e, elen);
+            if (escape_mblen > 1) {
+                ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+                        errmsg("invalid escape string"),
+                        errhint("Escape string must be empty or one character.")));
+            }
+        }
    }

    /* ----------
@@ -803,8 +807,10 @@ Datum similar_escape(PG_FUNCTION_ARGS)
     * We need room for the prefix/postfix plus as many as 3 output bytes per
     * input byte; since the input is at most 1GB this can't overflow
     */
-    result = (text*)palloc(VARHDRSZ + 6 + 3 * plen);
+    const int dataBuffSize = 6 + 3 * plen;
+    result = (text*)palloc(VARHDRSZ + dataBuffSize);
    r = VARDATA(result);
+    const char* dataStartPtr = r;

    *r++ = '^';
    *r++ = '(';
@@ -814,6 +820,50 @@ Datum similar_escape(PG_FUNCTION_ARGS)
    while (plen > 0) {
        char pchar = *p;

+        /*
+         * If both the escape character and the current character from the
+         * pattern are multi-byte, we need to take the slow path.
+         *
+         * But if one of them is single-byte, we can process the pattern one
+         * byte at a time, ignoring multi-byte characters.  (This works
+         * because all server-encodings have the property that a valid
+         * multi-byte character representation cannot contain the
+         * representation of a valid single-byte character.)
+         */
+        if (elen > 1) {
+            int mblen = pg_mblen(p);
+            if (mblen > 1) {
+                /* slow, multi-byte path */
+                if (afterescape) {
+                    *r++ = '\\';
+                    int destMax = dataBuffSize - (r - dataStartPtr) / sizeof(char);
+                    errno_t rc = memcpy_s(r, destMax, p, mblen);
+                    securec_check(rc, "\0", "\0");
+                    r += mblen;
+                    afterescape = false;
+                } else if (e && elen == mblen && memcmp(e, p, mblen) == 0) {
+                    /* SQL99 escape character; do not send to output */
+                    afterescape = true;
+                } else {
+                    /*
+                     * We know it's a multi-byte character, so we don't need
+                     * to do all the comparisons to single-byte characters
+                     * that we do below.
+                     */
+                    int destMax = dataBuffSize - (r - dataStartPtr) / sizeof(char);
+                    errno_t rc = memcpy_s(r, destMax, p, mblen);
+                    securec_check(rc, "\0", "\0");
+                    r += mblen;
+                }
+
+                p += mblen;
+                plen -= mblen;
+
+                continue;
+            }
+        }
+
+        /* fast path */
        if (afterescape) {
            if (pchar == '"' && !incharclass) /* for SUBSTRING patterns */
                *r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -278,3 +278,34 @@ select 'xyz' ~ '((.)){0}(\2){0}' as t;
 t
 (1 row)

+-- test similar with regex
+SELECT 'abc' SIMILAR TO '我%(b|d)%' escape '我' AS RESULT;
+ result 
+--------
+ f
+(1 row)
+
+SELECT '%abc' SIMILAR TO '我%abc' escape '我' AS RESULT;
+ result 
+--------
+ t
+(1 row)
+
+SELECT 'abc' SIMILAR TO '你%(b|d)%' escape '你' AS RESULT;
+ result 
+--------
+ f
+(1 row)
+
+SELECT '%abc' SIMILAR TO '你%abc' escape '你' AS RESULT;
+ result 
+--------
+ t
+(1 row)
+
+SELECT '%abc' SIMILAR TO '\%abc' escape '\' AS RESULT;
+ result 
+--------
+ t
+(1 row)
+
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -70,3 +70,10 @@ select 'a' ~ '()+\1';

 -- test {0}
 select 'xyz' ~ '((.)){0}(\2){0}' as t;
+
+-- test similar with regex
+SELECT 'abc' SIMILAR TO '我%(b|d)%' escape '我' AS RESULT;
+SELECT '%abc' SIMILAR TO '我%abc' escape '我' AS RESULT;
+SELECT 'abc' SIMILAR TO '你%(b|d)%' escape '你' AS RESULT;
+SELECT '%abc' SIMILAR TO '你%abc' escape '你' AS RESULT;
+SELECT '%abc' SIMILAR TO '\%abc' escape '\' AS RESULT;