Files
tidb/pkg/expression/builtin_regexp_test.go

1304 lines
56 KiB
Go

// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package expression
import (
"fmt"
"testing"
"github.com/pingcap/tidb/pkg/parser/ast"
"github.com/pingcap/tidb/pkg/parser/charset"
"github.com/pingcap/tidb/pkg/parser/mysql"
"github.com/pingcap/tidb/pkg/parser/terror"
"github.com/pingcap/tidb/pkg/testkit/testutil"
"github.com/pingcap/tidb/pkg/types"
"github.com/pingcap/tidb/pkg/util/chunk"
"github.com/stretchr/testify/require"
)
// should be 5
// We will raise error for binary collation so far,
// so we have to suppress the binary collation tests.
const testCharsetAndCollateTpNum = 5 - 1
const binaryTpIdx = 4
func getStringConstNull() *Constant {
c := getStringConstant("", false)
c.Value.SetNull()
return c
}
func getIntConstNull() *Constant {
c := getIntConstant(0)
c.Value.SetNull()
return c
}
func getStringConstant(value string, isBin bool) *Constant {
c := &Constant{
Value: types.NewStringDatum(value),
}
if isBin {
c.RetType = types.NewFieldTypeBuilder().SetType(mysql.TypeString).SetFlag(mysql.BinaryFlag).SetCharset(charset.CharsetBin).SetCollate(charset.CollationBin).BuildP()
} else {
c.RetType = types.NewFieldType(mysql.TypeVarchar)
}
return c
}
func getIntConstant(num int64) *Constant {
return &Constant{
Value: types.NewIntDatum(num),
RetType: types.NewFieldType(mysql.TypeLong),
}
}
func setConstants(isNull bool, isBin bool, constVals map[int]any, constants []*Constant) {
for i, val := range constVals {
switch v := val.(type) {
case string:
if isNull {
constants[i] = getStringConstNull()
} else {
constants[i] = getStringConstant(v, isBin)
}
case int64:
if isNull {
constants[i] = getIntConstNull()
} else {
constants[i] = getIntConstant(v)
}
default:
panic("Unsupport type")
}
}
}
func getVecExprBenchCaseForRegexpIncludeConst(retType types.EvalType, isBin bool, isNull bool, constVals map[int]any, paramNum int, constants []*Constant, inputs ...any) vecExprBenchCase {
setConstants(isNull, isBin, constVals, constants)
defer func() {
// reset constants, so that following cases could reuse this constant slice
for i := range constVals {
constants[i] = nil
}
}()
retCase := getVecExprBenchCaseForRegexp(retType, isBin, inputs[:paramNum]...)
retCase.constants = make([]*Constant, paramNum)
copy(retCase.constants, constants[:paramNum])
return retCase
}
func getVecExprBenchCaseForRegexp(retType types.EvalType, isBin bool, inputs ...any) vecExprBenchCase {
gens := make([]dataGenerator, 0, 6)
paramTypes := make([]types.EvalType, 0, 6)
for _, input := range inputs {
switch input := input.(type) {
case []int:
gens = append(gens, &rangeInt64Gener{
begin: input[0],
end: input[1],
randGen: newDefaultRandGen(),
})
paramTypes = append(paramTypes, types.ETInt)
case []string:
strs := make([]string, 0)
strs = append(strs, input...)
gens = append(gens, &selectStringGener{
candidates: strs,
randGen: newDefaultRandGen(),
})
paramTypes = append(paramTypes, types.ETString)
default:
panic("Invalid type")
}
}
ret := vecExprBenchCase{
retEvalType: retType,
childrenTypes: paramTypes,
geners: gens,
}
if isBin {
length := len(inputs)
ft := make([]*types.FieldType, length)
ft[0] = types.NewFieldTypeBuilder().SetType(mysql.TypeString).SetFlag(mysql.BinaryFlag).SetCharset(charset.CharsetBin).SetCollate(charset.CollationBin).BuildP()
ret.childrenFieldTypes = ft
}
return ret
}
func setCharsetAndCollation(id int, tps ...*types.FieldType) {
switch id {
case 0:
for _, tp := range tps {
setUtf8mb4CICollation(tp)
}
case 1:
for _, tp := range tps {
setUtf8mb4BinCollation(tp)
}
case 2:
for _, tp := range tps {
setGBKCICollation(tp)
}
case 3:
for _, tp := range tps {
setGBKBinCollation(tp)
}
case binaryTpIdx:
for _, tp := range tps {
setBinaryCollation(tp)
}
default:
panic("Invalid index")
}
}
func setUtf8mb4CICollation(tp *types.FieldType) {
tp.SetType(mysql.TypeVarString)
tp.SetCharset(charset.CharsetUTF8MB4)
tp.SetCollate("utf8mb4_general_ci")
tp.SetFlen(types.UnspecifiedLength)
}
func setUtf8mb4BinCollation(tp *types.FieldType) {
tp.SetType(mysql.TypeVarString)
tp.SetCharset(charset.CharsetUTF8MB4)
tp.SetCollate(charset.CollationUTF8MB4)
tp.SetFlen(types.UnspecifiedLength)
}
func setGBKCICollation(tp *types.FieldType) {
tp.SetType(mysql.TypeVarString)
tp.SetCharset(charset.CharsetGBK)
tp.SetCollate(charset.CollationGBKChineseCI)
tp.SetFlen(types.UnspecifiedLength)
}
func setGBKBinCollation(tp *types.FieldType) {
tp.SetType(mysql.TypeVarString)
tp.SetCharset(charset.CharsetGBK)
tp.SetCollate(charset.CollationGBKBin)
tp.SetFlen(types.UnspecifiedLength)
}
func setBinaryCollation(tp *types.FieldType) {
tp.SetFlag(mysql.BinaryFlag)
tp.SetType(mysql.TypeVarString)
tp.SetCharset(charset.CharsetBin)
tp.SetCollate(charset.CollationBin)
}
func TestRegexpLike(t *testing.T) {
ctx := createContext(t)
// test Regexp_like without match type
testsExcludeMatchType := []struct {
pattern string
input string
match int64
err error
}{
{"^$", "a", 0, nil},
{"a", "a", 1, nil},
{"a", "b", 0, nil},
{"aA", "aA", 1, nil},
{".", "a", 1, nil},
{"^.$", "ab", 0, nil}, // index 5
{"..", "b", 0, nil},
{".ab", "aab", 1, nil},
{".*", "abcd", 1, nil},
{"", "a", 0, ErrRegexp}, // issue 37988
{"(", "", 0, ErrRegexp}, // index 10
{"(*", "", 0, ErrRegexp},
{"[a", "", 0, ErrRegexp},
{"\\", "", 0, ErrRegexp},
}
for _, tt := range testsExcludeMatchType {
fc := funcs[ast.Regexp]
f, err := fc.getFunction(ctx, datumsToConstants(types.MakeDatums(tt.input, tt.pattern)))
require.NoError(t, err)
match, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(tt.match), match, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
// test Regexp_like with match type
testsIncludeMatchType := []struct {
pattern string
input string
matchType string
match int64
err error
}{
{"^$", "a", "", 0, nil},
{"a", "a", "", 1, nil},
{"a", "b", "", 0, nil},
{"aA", "aA", "", 1, nil},
{".", "a", "", 1, nil},
{"^.$", "ab", "", 0, nil},
{"..", "b", "", 0, nil},
{".ab", "aab", "", 1, nil},
{".*", "abcd", "", 1, nil},
// Test case-insensitive
{"AbC", "abc", "", 0, nil},
{"AbC", "abc", "i", 1, nil},
// Test multiple-line mode
{"23$", "123\n321", "", 0, nil},
{"23$", "123\n321", "m", 1, nil},
{"^day", "good\nday", "m", 1, nil},
// Test n flag
{".", "\n", "", 0, nil},
{".", "\n", "s", 1, nil},
// Test rightmost rule
{"aBc", "abc", "ic", 0, nil},
{"aBc", "abc", "ci", 1, nil},
// Test invalid match type
{"abc", "abc", "p", 0, ErrRegexp},
{"abc", "abc", "cpi", 0, ErrRegexp},
}
for _, tt := range testsIncludeMatchType {
fc := funcs[ast.RegexpLike]
f, err := fc.getFunction(ctx, datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.matchType)))
require.NoError(t, err)
match, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(tt.match), match, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
func TestRegexpLikeVec(t *testing.T) {
var expr []string = []string{"abc", "aBc", "Good\nday", "\n"}
var pattern []string = []string{"abc", "od$", "^day", "day$", "."}
var matchType []string = []string{"m", "i", "icc", "cii", "s", "msi"}
constants := make([]*Constant, 3)
for i := 0; i < 3; i++ {
constants[i] = nil
}
args := make([]any, 0)
args = append(args, any(expr))
args = append(args, any(pattern))
args = append(args, any(matchType))
cases := make([]vecExprBenchCase, 0, 30)
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, expr, pattern)) // without match type
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, expr, pattern)) // without match type, with BinCollation
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, expr, pattern, matchType)) // with match type
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, expr, pattern, matchType)) // with match type, with BinCollation
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, make([]string, 0), pattern, matchType)) // Test expr == null
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, make([]string, 0), pattern, matchType)) // Test expr == null, with BinCollation
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, expr, make([]string, 0), matchType)) // Test pattern == null
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, expr, make([]string, 0), matchType)) // Test pattern == null, with BinCollation
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, expr, pattern, make([]string, 0))) // Test matchType == null
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, expr, pattern, make([]string, 0))) // Test matchType == null, with BinCollation
// Prepare data: expr is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{0: any("abc")}, len(args), constants, args...)) // index 10
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{0: any("abc")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{0: any("abc")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{0: any("abc")}, len(args), constants, args...))
// Prepare data: pattern is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{1: any("ab.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{1: any("ab.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{1: any("ab.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{1: any("ab.")}, len(args), constants, args...))
// Prepare data: matchType is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{2: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{2: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{2: any("msi")}, len(args), constants, args...)) // index 20
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{2: any("msi")}, len(args), constants, args...))
// Prepare data: test memorization
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{1: any("abc"), 2: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{1: any("abc")}, len(args)-1, constants, args...))
// Build vecBuiltinRegexpLikeCases
var vecBuiltinRegexpLikeCases = map[string][]vecExprBenchCase{
ast.RegexpLike: cases,
}
testVectorizedBuiltinFunc(t, vecBuiltinRegexpLikeCases)
}
func TestRegexpSubstr(t *testing.T) {
ctx := createContext(t)
// test regexp_substr(expr, pat)
testParam2 := []struct {
input any // string
pattern any // string
match any // string
matchBin any // bin result
err error
}{
{"abc", "bc", "bc", "0x6263", nil},
{"你好", "好", "好", "0xE5A5BD", nil},
{"abc", nil, nil, nil, nil},
{nil, "bc", nil, nil, nil},
{nil, nil, nil, nil, nil},
{"a", "", nil, nil, ErrRegexp}, // issue 37988
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam2 {
fc := funcs[ast.RegexpSubstr]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
// // test regexp_substr(expr, pat, pos)
testParam3 := []struct {
input any // string
pattern any // string
pos any // int64
match any // string
matchBin any // bin result
err error
}{
{"abc", "bc", int64(2), "bc", "0x6263", nil},
{"你好", "好", int64(2), "好", "0xE5A5BD", nil},
{"abc", "bc", int64(3), nil, nil, nil},
{"你好啊", "好", int64(3), nil, "0xE5A5BD", nil},
{"", "^$", int64(1), "", "0x", nil},
// Invalid position index tests
{"abc", "bc", int64(-1), nil, nil, ErrRegexp},
{"abc", "bc", int64(4), nil, nil, ErrRegexp},
{"", "bc", int64(0), nil, nil, ErrRegexp},
{"", "^$", int64(2), nil, nil, ErrRegexp},
// Some nullable input tests
{"", "^$", nil, nil, nil, nil},
{nil, "^$", nil, nil, nil, nil},
{"", nil, nil, nil, nil, nil},
{nil, nil, int64(1), nil, nil, nil},
{nil, nil, nil, nil, nil, nil},
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam3 {
fc := funcs[ast.RegexpSubstr]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.pos))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
// test regexp_substr(expr, pat, pos, occurrence)
testParam4 := []struct {
input any // string
pattern any // string
pos any // int64
occur any // int64
match any // string
matchBin any // bin result
err error
}{
{"abc abd abe", "ab.", int64(1), int64(1), "abc", "0x616263", nil},
{"abc abd abe", "ab.", int64(1), int64(0), "abc", "0x616263", nil},
{"abc abd abe", "ab.", int64(1), int64(-1), "abc", "0x616263", nil},
{"abc abd abe", "ab.", int64(1), int64(2), "abd", "0x616264", nil},
{"abc abd abe", "ab.", int64(3), int64(1), "abd", "0x616264", nil},
{"abc abd abe", "ab.", int64(3), int64(2), "abe", "0x616265", nil}, // index 5
{"abc abd abe", "ab.", int64(6), int64(1), "abe", "0x616265", nil},
{"abc abd abe", "ab.", int64(6), int64(100), nil, nil, nil},
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(1), int64(1), "嗯嗯", "0xE597AFE597AF", nil},
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(1), int64(2), "嗯好", "0xE597AFE5A5BD", nil},
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(5), int64(1), "嗯呐", "0xE597AFE5A5BD", nil}, // index 10
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(5), int64(2), nil, "0xE597AFE59190", nil},
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(1), int64(100), nil, nil, nil},
// Some nullable input tests
{"", "^$", int64(1), nil, nil, nil, nil},
{nil, "^$", int64(1), nil, nil, nil, nil},
{nil, "^$", nil, int64(1), nil, nil, nil}, // index 15
{"", nil, nil, int64(1), nil, nil, nil},
{nil, nil, nil, nil, nil, nil, nil},
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam4 {
fc := funcs[ast.RegexpSubstr]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.pos, tt.occur))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
// test regexp_substr(expr, pat, pos, occurrence, matchType)
testParam5 := []struct {
input any // string
pattern any // string
pos any // int64
occur any // int64
matchType any // string
match any // string
matchBin any // bin result
err error
}{
{"abc", "ab.", int64(1), int64(1), "", "abc", "0x616263", nil},
{"abc", "aB.", int64(1), int64(1), "i", "abc", "0x616263", nil},
{"good\nday", "od", int64(1), int64(1), "m", "od", "0x6F64", nil},
{"\n", ".", int64(1), int64(1), "s", "\n", "0x0A", nil},
// Test invalid matchType
{"abc", "ab.", int64(1), int64(1), "p", nil, nil, ErrRegexp}, // index 5
// Some nullable input tests
{"abc", "ab.", int64(1), int64(1), nil, nil, nil, nil},
{"abc", "ab.", nil, int64(1), nil, nil, nil, nil},
{nil, "ab.", nil, int64(1), nil, nil, nil, nil},
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam5 {
fc := funcs[ast.RegexpSubstr]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.pos, tt.occur, tt.matchType))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
}
func TestRegexpSubstrVec(t *testing.T) {
var expr []string = []string{"abc abd abe", "你好啊啊啊啊啊", "好的 好滴 好~", "Good\nday", "\n\n\n\n\n\n"}
var pattern []string = []string{"^$", "ab.", "aB.", "abc", "好", "好.", "od$", "^day", "day$", "."}
var position []int = []int{1, 5}
var occurrence []int = []int{-1, 10}
var matchType []string = []string{"m", "i", "icc", "cii", "s", "msi"}
args := make([]any, 0)
args = append(args, any(expr))
args = append(args, any(pattern))
args = append(args, any(position))
args = append(args, any(occurrence))
args = append(args, any(matchType))
constants := make([]*Constant, 5)
for i := 0; i < 5; i++ {
constants[i] = nil
}
cases := make([]vecExprBenchCase, 0, 50)
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETString, false, args...))
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETString, false, args...))
// Prepare data: expr is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...)) // index 5
// Prepare data: pattern is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{1: any("aB.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{1: any("aB.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{1: any("aB.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{1: any("aB.")}, len(args), constants, args...))
// Prepare data: position is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{2: any(int64(2))}, len(args), constants, args...)) // index 10
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{2: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{2: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{2: any(int64(2))}, len(args), constants, args...))
// Prepare data: occurrence is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{3: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{3: any(int64(2))}, len(args), constants, args...)) // index 15
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{3: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{3: any(int64(2))}, len(args), constants, args...))
// Prepare data: match type is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{4: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{4: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{4: any("msi")}, len(args), constants, args...)) // index 20
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{4: any("msi")}, len(args), constants, args...))
// Prepare data: test memorization
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{1: any("aB."), 4: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{1: any("aB.")}, len(args)-1, constants, args...))
// Build vecBuiltinRegexpSubstrCases
var vecBuiltinRegexpSubstrCases = map[string][]vecExprBenchCase{
ast.RegexpSubstr: cases,
}
testVectorizedBuiltinFunc(t, vecBuiltinRegexpSubstrCases)
}
func TestRegexpInStr(t *testing.T) {
ctx := createContext(t)
// test regexp_instr(expr, pat)
testParam2 := []struct {
input any // string
pattern any // string
match any // int64
matchBin any // bin result
err error
}{
{"abc", "bc", int64(2), int64(2), nil},
{"你好", "好", int64(2), int64(4), nil},
{"", "^$", int64(1), int64(1), nil},
{"abc", nil, nil, nil, nil},
{nil, "bc", nil, nil, nil},
{nil, nil, nil, nil, nil},
{"a", "", nil, nil, ErrRegexp}, // issue 37988
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam2 {
fc := funcs[ast.RegexpInStr]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
// test regexp_instr(expr, pat, pos)
testParam3 := []struct {
input any // string
pattern any // string
pos any // int64
match any // int64
matchBin any // bin result
err error
}{
{"abc", "bc", int64(2), int64(2), int64(2), nil},
{"你好", "好", int64(2), int64(2), int64(4), nil},
{"abc", "bc", int64(3), int64(0), int64(0), nil},
{"你好啊", "好", int64(3), int64(0), int64(4), nil},
{"", "^$", int64(1), 1, 1, nil},
// Invalid position index tests
{"abc", "bc", int64(-1), nil, nil, ErrRegexp},
{"abc", "bc", int64(4), nil, nil, ErrRegexp},
{"", "bc", int64(0), nil, nil, ErrRegexp},
// Some nullable input tests
{"", "^$", nil, nil, nil, nil},
{nil, "^$", nil, nil, nil, nil},
{"", nil, nil, nil, nil, nil},
{nil, nil, int64(1), nil, nil, nil},
{nil, nil, nil, nil, nil, nil},
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam3 {
fc := funcs[ast.RegexpInStr]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.pos))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
// test regexp_instr(expr, pat, pos, occurrence)
testParam4 := []struct {
input any // string
pattern any // string
pos any // int64
occurrence any // int64
match any // int64
matchBin any // bin result
err error
}{
{"abc abd abe", "ab.", int64(1), int64(1), 1, 1, nil},
{"abc abd abe", "ab.", int64(1), int64(0), 1, 1, nil},
{"abc abd abe", "ab.", int64(1), int64(-1), 1, 1, nil},
{"abc abd abe", "ab.", int64(1), int64(2), 5, 5, nil},
{"abc abd abe", "ab.", int64(3), int64(1), 5, 5, nil},
{"abc abd abe", "ab.", int64(3), int64(2), 9, 9, nil}, // index 5
{"abc abd abe", "ab.", int64(6), int64(1), 9, 9, nil},
{"abc abd abe", "ab.", int64(6), int64(100), 0, 0, nil},
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(1), int64(1), 1, 1, nil},
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(1), int64(2), 4, 8, nil},
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(5), int64(1), 7, 8, nil}, // index 10
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(5), int64(2), 0, 15, nil},
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(1), int64(100), 0, 0, nil},
// Some nullable input tests
{"", "^$", int64(1), nil, nil, nil, nil},
{nil, "^$", int64(1), nil, nil, nil, nil},
{nil, "^$", nil, int64(1), nil, nil, nil}, // index 15
{"", nil, nil, int64(1), nil, nil, nil},
{nil, nil, nil, nil, nil, nil, nil},
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam4 {
fc := funcs[ast.RegexpInStr]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.pos, tt.occurrence))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
// test regexp_instr(expr, pat, pos, occurrence, return_option)
testParam5 := []struct {
input any // string
pattern any // string
pos any // int64
occurrence any // int64
retOpt any // int64
match any // int64
matchBin any // bin result
err error
}{
{"abc abd abe", "ab.", int64(1), int64(1), int64(0), 1, 1, nil},
{"abc abd abe", "ab.", int64(1), int64(1), int64(1), 4, 4, nil},
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(1), int64(1), int64(0), 1, 1, nil},
{"嗯嗯 嗯好 嗯呐", "嗯.", int64(1), int64(1), int64(1), 3, 7, nil},
{"", "^$", int64(1), int64(1), int64(0), 1, 1, nil},
{"", "^$", int64(1), int64(1), int64(1), 1, 1, nil},
// Some nullable input tests
{"", "^$", int64(1), nil, nil, nil, nil, nil},
{nil, "^$", int64(1), nil, nil, nil, nil, nil},
{nil, "^$", nil, int64(1), nil, nil, nil, nil},
{"", nil, nil, int64(1), nil, nil, nil, nil},
{nil, nil, nil, nil, nil, nil, nil, nil},
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam5 {
fc := funcs[ast.RegexpInStr]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.pos, tt.occurrence, tt.retOpt))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
// test regexp_instr(expr, pat, pos, occurrence, return_option, match_type)
testParam6 := []struct {
input any // string
pattern any // string
pos any // int64
occurrence any // int64
retOpt any // int64
matchType any // string
match any // int64
matchBin any // bin result
err error
}{
{"abc", "ab.", int64(1), int64(1), int64(0), "", 1, 1, nil},
{"abc", "aB.", int64(1), int64(1), int64(0), "i", 1, 1, nil},
{"good\nday", "od$", int64(1), int64(1), int64(0), "m", 3, 3, nil},
{"good\nday", "oD$", int64(1), int64(1), int64(0), "mi", 3, 3, nil},
{"\n", ".", int64(1), int64(1), int64(0), "s", 1, 1, nil}, // index 4
// Test invalid matchType
{"abc", "ab.", int64(1), int64(1), int64(0), "p", nil, nil, ErrRegexp},
// Some nullable input tests
{"abc", "ab.", int64(1), int64(1), int64(0), nil, nil, nil, nil},
{"abc", "ab.", nil, int64(1), int64(0), nil, nil, nil, nil},
{nil, "ab.", nil, int64(1), int64(0), nil, nil, nil, nil},
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam6 {
fc := funcs[ast.RegexpInStr]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.pos, tt.occurrence, tt.retOpt, tt.matchType))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
}
func TestRegexpInStrVec(t *testing.T) {
var expr []string = []string{"abc abd abe", "你好啊啊啊啊啊", "好的 好滴 好~", "Good\nday", "\n\n\n\n\n\n"}
var pattern []string = []string{"^$", "ab.", "aB.", "abc", "好", "好.", "od$", "^day", "day$", "."}
var position []int = []int{1, 5}
var occurrence []int = []int{-1, 10}
var retOpt []int = []int{0, 1}
var matchType []string = []string{"m", "i", "icc", "cii", "s", "msi"}
args := make([]any, 0)
args = append(args, any(expr))
args = append(args, any(pattern))
args = append(args, any(position))
args = append(args, any(occurrence))
args = append(args, any(retOpt))
args = append(args, any(matchType))
constants := make([]*Constant, 6)
for i := 0; i < 6; i++ {
constants[i] = nil
}
cases := make([]vecExprBenchCase, 0, 50)
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, args...))
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETInt, false, args...))
// Prepare data: expr is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...))
// Prepare data: pattern is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{1: any("aB.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{1: any("aB.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{1: any("aB.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{1: any("aB.")}, len(args), constants, args...))
// Prepare data: position is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{2: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{2: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{2: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{2: any(int64(2))}, len(args), constants, args...))
// Prepare data: occurrence is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{3: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{3: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{3: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{3: any(int64(2))}, len(args), constants, args...))
// Prepare data: return_option is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{4: any(int64(1))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{4: any(int64(1))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{4: any(int64(1))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{4: any(int64(1))}, len(args), constants, args...))
// Prepare data: match type is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{5: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{5: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{5: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, true, map[int]any{5: any("msi")}, len(args), constants, args...))
// Prepare data: test memorization
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{1: any("aB."), 5: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETInt, false, false, map[int]any{1: any("aB.")}, len(args)-1, constants, args...))
// Build vecBuiltinRegexpSubstrCases
var vecBuiltinRegexpInStrCases = map[string][]vecExprBenchCase{
ast.RegexpInStr: cases,
}
testVectorizedBuiltinFunc(t, vecBuiltinRegexpInStrCases)
}
func TestRegexpReplace(t *testing.T) {
ctx := createContext(t)
url1 := "https://go.mail/folder-1/online/ru-en/#lingvo/#1О 50000&price_ashka/rav4/page=/check.xml"
url2 := "http://saint-peters-total=меньше 1000-rublyayusche/catalogue/kolasuryat-v-2-kadyirovka-personal/serial_id=0&input_state/apartments/mokrotochki.net/upravda.ru/yandex.ru/GameMain.aspx?mult]/on/orders/50195&text=мыс и орелка в Балаш смотреть онлайн бесплатно в хорошем камбалакс&lr=20030393833539353862643188&op_promo=C-Teaser_id=06d162.html"
url1Repl := "a\\12\\13"
url1Res := "ago.mail2go.mail3"
url1BinRes := "0x61676F2E6D61696C32676F2E6D61696C33"
url2Repl := "aaa\\1233"
url2Res := "aaasaint-peters-total=меньше 1000-rublyayusche233"
url2BinRes := "0x6161617361696E742D7065746572732D746F74616C3DC390C2BCC390C2B5C390C2BDC391C592C391CB86C390C2B520313030302D7275626C7961797573636865323333"
urlPat := "^https?://(?:www\\.)?([^/]+)/.*$"
// test regexp_replace(expr, pat, repl)
testParam3 := []struct {
input any // string
pattern any // string
replace any // string
match any // string
matchBin any // bin result
err error
}{
{"abc abd abe", "ab.", "cz", "cz cz cz", "0x637A20637A20637A", nil},
{"你好 好的", "好", "逸", "你逸 逸的", "0xE4BDA0E980B820E980B8E79A84", nil},
{"", "^$", "123", "123", "0x313233", nil},
{"stackoverflow", "(.{5})(.*)", `\\+\2+\1+\2+\1\`, `\+overflow+stack+overflow+stack`, "0x5C2B6F766572666C6F772B737461636B2B6F766572666C6F772B737461636B", nil},
{"fooabcdefghij fooABCDEFGHIJ", "foo(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)", `\\\9\\\8-\7\\\6-\5\\\4-\3\\\2-\1\\`, `\i\h-g\f-e\d-c\b-a\ \I\H-G\F-E\D-C\B-A\`, "0x5C395C382D375C362D355C342D335C322D315C205C395C382D375C362D355C342D335C322D315C", nil},
{"fool food foo", "foo(.?)", `\0+\1`, "fool+l food+d foo+", "0x5C302B5C31205C302B5C31205C302B5C31", nil},
{url1, urlPat, url1Repl, url1Res, url1BinRes, nil},
{url2, urlPat, url2Repl, url2Res, url2BinRes, nil},
{"abc", nil, nil, nil, nil, nil},
{nil, "bc", nil, nil, nil, nil},
{nil, nil, nil, nil, nil, nil},
{"abc", "\\d*", "d", "dadbdcd", "0x64616462646364", nil},
{"abc", "\\d*$", "d", "abcd", "0x64616462646364", nil},
{"我们", "\\d*", "d", "d我d们d", "0x64C3A664CB8664E2809864C3A464C2BB64C2AC64", nil},
{"a", "", "a", nil, nil, ErrRegexp}, // issue 37988
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam3 {
fc := funcs[ast.RegexpReplace]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.replace))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx), args[2].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
// test regexp_replace(expr, pat, repl, pos)
testParam4 := []struct {
input any // string
pattern any // string
replace any // string
pos any // int64
match any // string
matchBin any // bin result
err error
}{
{"abc", "ab.", "cc", int64(1), "cc", "0x6363", nil},
{"abc", "bc", "cc", int64(3), "abc", "0x616263", nil},
{"你好", "好", "的", int64(2), "你的", "0xE4BDA0E79A84", nil},
{"你好啊", "好", "的", int64(3), "你好啊", "0xE4BDA0E79A84E5958A", nil},
{"", "^$", "cc", int64(1), "cc", "0x6363", nil},
{"seafood fool", "foo(.?)", "123", int64(3), "sea123 123", "0x73656131323320313233", nil}, // index 5
{"seafood fool", "foo(.?)", "123", int64(5), "seafood 123", "0x736561666F6F6420313233", nil},
{"seafood fool", "foo(.?)", "123", int64(10), "seafood fool", "0x736561666F6F6420666F6F6C", nil},
{"seafood fool", "foo(.?)", "z\\12", int64(3), "seazd2 zl2", "0x7365617A6432207A6C32", nil},
{"seafood fool", "foo(.?)", "z\\12", int64(5), "seafood zl2", "0x736561666F6F64207A6C32", nil},
// Invalid position index tests
{"", "^$", "a", int64(2), "", "", ErrRegexp}, // index 10
{"", "^&", "a", int64(0), "", "", ErrRegexp},
{"abc", "bc", "a", int64(-1), "", "", ErrRegexp},
{"abc", "bc", "a", int64(4), "", "", ErrRegexp},
// Some nullable input tests
{"", "^$", "a", nil, nil, nil, nil},
{nil, "^$", "a", nil, nil, nil, nil}, // index 15
{"", nil, nil, nil, nil, nil, nil},
{nil, nil, nil, int64(1), nil, nil, nil},
{nil, nil, nil, nil, nil, nil, nil},
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam4 {
fc := funcs[ast.RegexpReplace]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.replace, tt.pos))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx), args[2].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
// test regexp_replace(expr, pat, repl, pos, occurrence)
testParam5 := []struct {
input any // string
pattern any // string
replace any // string
pos any // int64
occurrence any // int64
match any // string
matchBin any // bin result
err error
}{
{"abc abd", "ab.", "cc", int64(1), int64(1), "cc abd", "0x636320616264", nil},
{"abc abd", "ab.", "cc", int64(1), int64(2), "abc cc", "0x616263206363", nil},
{"abc abd", "ab.", "cc", int64(1), int64(0), "cc cc", "0x6363206363", nil},
{"abc abd abe", "ab.", "cc", int64(3), int64(2), "abc abd cc", "0x61626320616264206363", nil},
{"abc abd abe", "ab.", "cc", int64(3), int64(10), "abc abd abe", "0x6162632061626420616265", nil},
{"你好 好啊", "好", "的", int64(1), int64(1), "你的 好啊", "0xE4BDA0E79A8420E5A5BDE5958A", nil}, // index 5
{"你好 好啊", "好", "的", int64(3), int64(1), "你好 的啊", "0xE4BDA0E79A8420E5A5BDE5958A", nil},
{"seafood fool", "foo(.?)", "123", int64(1), int(1), "sea123 fool", "0x73656131323320666F6F6C", nil},
{"seafood fool", "foo(.?)", "123", int64(1), int(2), "seafood 123", "0x736561666F6F6420313233", nil},
{"seafood fool", "foo(.?)", "123", int64(1), int(10), "seafood fool", "0x736561666F6F6420666F6F6C", nil},
{"seafood fool", "foo(.?)", "z\\12", int64(1), int(1), "seazd2 fool", "0x7365617A643220666F6F6C", nil}, // index 10
{"seafood fool", "foo(.?)", "z\\12", int64(1), int(2), "seafood zl2", "0x736561666F6F64207A6C32", nil},
{"", "^$", "cc", int64(1), int64(1), "cc", "0x6363", nil},
{"", "^$", "cc", int64(1), int64(2), "", "0x", nil},
{"", "^$", "cc", int64(1), int64(-1), "cc", "0x6363", nil},
{"abc", "\\d*", "p", 1, 2, "apbc", "0x61706263", nil}, // index 15
{"abc", "\\d*$", "p", 1, 1, "abcp", "0x61626370", nil},
{"我们", "\\d*", "p", 1, 2, "我p们", "0xC3A670CB86E28098C3A4C2BBC2AC", nil},
// Some nullable input tests
{"", "^$", "a", nil, int64(1), nil, nil, nil},
{nil, "^$", "a", nil, nil, nil, nil, nil},
{"", nil, nil, nil, int64(1), nil, nil, nil}, // index 20
{nil, nil, nil, int64(1), int64(1), nil, nil, nil},
{nil, nil, nil, nil, nil, nil, nil, nil},
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam5 {
fc := funcs[ast.RegexpReplace]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.replace, tt.pos, tt.occurrence))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx), args[2].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
// test regexp_replace(expr, pat, repl, pos, occurrence, match_type)
testParam6 := []struct {
input any // string
pattern any // string
replace any // string
pos any // int64
occurrence any // int64
matchType any // string
match any // string
matchBin any // bin result
err error
}{
{"abc", "ab.", "cc", int64(1), int64(0), "", "cc", "0x6363", nil},
{"abc", "aB.", "cc", int64(1), int64(0), "i", "cc", "0x6363", nil},
{"good\nday", "od$", "cc", int64(1), int64(0), "m", "gocc\nday", "0x676F63630A646179", nil},
{"good\nday", "oD$", "cc", int64(1), int64(0), "mi", "gocc\nday", "0x676F63630A646179", nil},
{"Good\nday", "a(B)", "a\\12", int64(2), int64(0), "msi", "Good\nday", "0x476F6F640A646179", nil},
{"Good\nday", "(.)", "cc", int64(1), int64(3), "ci", "Goccd\nday", "0x476F6363640A646179", nil},
{"seafood fool", "foo(.?)", "的", int64(1), int64(2), "m", "seafood 的", "0x736561666F6F6420C3A7C5A1E2809E", nil},
{"abc abd abe", "(.)", "cc", int64(4), int64(1), "cii", "abcccabd abe", "0x616263636361626420616265", nil},
{"\n", ".", "cc", int64(1), int64(0), "s", "cc", "0x6363", nil},
{"好的 好滴 好~", ".", "的", int64(1), int64(0), "msi", "的的的的的的的的", "0xE79A84E79A84E79A84E79A84E79A84E79A84E79A84E79A84", nil},
// Test invalid matchType
{"abc", "ab.", "cc", int64(1), int64(0), "p", nil, nil, ErrRegexp},
// Some nullable input tests
{"abc", "ab.", "cc", int64(1), int64(0), nil, nil, nil, nil},
{"abc", "ab.", nil, int64(1), int64(0), nil, nil, nil, nil},
{nil, "ab.", nil, int64(1), int64(0), nil, nil, nil, nil},
}
for charsetAndCollateTp := 0; charsetAndCollateTp < testCharsetAndCollateTpNum; charsetAndCollateTp++ {
for _, tt := range testParam6 {
fc := funcs[ast.RegexpReplace]
expectMatch := tt.match
args := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, tt.replace, tt.pos, tt.occurrence, tt.matchType))
setCharsetAndCollation(charsetAndCollateTp, args[0].GetType(ctx), args[1].GetType(ctx), args[2].GetType(ctx))
if charsetAndCollateTp == binaryTpIdx {
expectMatch = tt.matchBin
}
f, err := fc.getFunction(ctx, args)
require.NoError(t, err)
actualMatch, err := evalBuiltinFunc(f, ctx, chunk.Row{})
if tt.err == nil {
require.NoError(t, err)
testutil.DatumEqual(t, types.NewDatum(expectMatch), actualMatch, fmt.Sprintf("%v", tt))
} else {
require.True(t, terror.ErrorEqual(err, tt.err))
}
}
}
}
func TestRegexpReplaceVec(t *testing.T) {
var expr []string = []string{"abc abd abe", "你好啊啊啊啊啊", "好的 好滴 好~", "Good\nday", "seafood fool"} // , "\n\n\n\n\n\n"
var pattern []string = []string{"(^$)", "(a)b.", "a(B).", "(ab)c", "(好)", "(好).", "(o)d$", "^da(y)", "(d)ay$", "(.)", "foo(.?)", "foo(d|l)"}
var repl []string = []string{"cc", "的", "a\\12"}
var position []int = []int{1, 5}
var occurrence []int = []int{-1, 5}
var matchType []string = []string{"m", "i", "icc", "cii", "s", "msi"}
args := make([]any, 0)
args = append(args, any(expr))
args = append(args, any(pattern))
args = append(args, any(repl))
args = append(args, any(position))
args = append(args, any(occurrence))
args = append(args, any(matchType))
constants := make([]*Constant, 6)
for i := 0; i < 6; i++ {
constants[i] = nil
}
cases := make([]vecExprBenchCase, 0, 50)
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETString, false, args...))
cases = append(cases, getVecExprBenchCaseForRegexp(types.ETString, false, args...))
// Prepare data: expr is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{0: any("好的 好滴 好~")}, len(args), constants, args...)) // index 5
// Prepare data: pattern is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{1: any("(a)B.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{1: any("(a)B.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{1: any("(a)B.")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{1: any("(a)B.")}, len(args), constants, args...))
// Prepare data: repl is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{2: any("cc")}, len(args), constants, args...)) // index 10
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{2: any("cc")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{2: any("cc")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{2: any("cc")}, len(args), constants, args...))
// Prepare data: position is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{3: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{3: any(int64(2))}, len(args), constants, args...)) // index 15
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{3: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{3: any(int64(2))}, len(args), constants, args...))
// Prepare data: occurrence is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{4: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{4: any(int64(2))}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{4: any(int64(2))}, len(args), constants, args...)) // index 20
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{4: any(int64(2))}, len(args), constants, args...))
// Prepare data: match type is constant
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{5: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{5: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{5: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, true, map[int]any{5: any("msi")}, len(args), constants, args...)) // index 25
// Prepare data: test memorization
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{1: any("a(B)."), 5: any("msi")}, len(args), constants, args...))
cases = append(cases, getVecExprBenchCaseForRegexpIncludeConst(types.ETString, false, false, map[int]any{1: any("a(B).")}, len(args)-1, constants, args...))
// Build vecBuiltinRegexpSubstrCases
var vecBuiltinRegexpReplaceCases = map[string][]vecExprBenchCase{
ast.RegexpReplace: cases,
}
testVectorizedBuiltinFunc(t, vecBuiltinRegexpReplaceCases)
}
func TestRegexpCache(t *testing.T) {
ctx := createContext(t)
// if the pattern or match type is not constant, it should not be cached
sig := regexpBaseFuncSig{}
sig.args = []Expression{&Column{}, &Column{}, &Constant{}}
reg, err := sig.getRegexp(ctx, "abc", "", 2)
require.NoError(t, err)
require.Equal(t, "abc", reg.String())
reg, err = sig.getRegexp(ctx, "def", "", 2)
require.NoError(t, err)
require.Equal(t, "def", reg.String())
reg, ok, err := sig.tryVecMemorizedRegexp(ctx, []*funcParam{
{defaultStrVal: "x"},
{defaultStrVal: "aaa"},
{defaultStrVal: ""},
}, 2, 1)
require.Nil(t, reg)
require.False(t, ok)
require.NoError(t, err)
_, ok = sig.memorizedRegexp.getCache(ctx.GetSessionVars().StmtCtx.CtxID())
require.False(t, ok)
sig.args = []Expression{&Column{}, &Constant{}, &Column{}}
reg, err = sig.getRegexp(ctx, "bbb", "", 2)
require.NoError(t, err)
require.Equal(t, "bbb", reg.String())
reg, ok, err = sig.tryVecMemorizedRegexp(ctx, []*funcParam{
{defaultStrVal: "x"},
{defaultStrVal: "aaa"},
{defaultStrVal: ""},
}, 2, 1)
require.Nil(t, reg)
require.False(t, ok)
require.NoError(t, err)
_, ok = sig.memorizedRegexp.getCache(ctx.GetSessionVars().StmtCtx.CtxID())
require.False(t, ok)
// if pattern and match type are both constant, it should be cached
sig = regexpBaseFuncSig{}
sig.args = []Expression{&Column{}, &Constant{ParamMarker: &ParamMarker{}}, &Constant{ParamMarker: &ParamMarker{}}}
reg, err = sig.getRegexp(ctx, "ccc", "", 2)
require.NoError(t, err)
require.Equal(t, "ccc", reg.String())
reg2, err := sig.getRegexp(ctx, "ddd", "", 2)
require.NoError(t, err)
require.Same(t, reg, reg2)
require.Equal(t, "ccc", reg2.String())
sig = regexpBaseFuncSig{}
sig.args = []Expression{&Column{}, &Constant{ParamMarker: &ParamMarker{}}, &Constant{ParamMarker: &ParamMarker{}}}
reg, ok, err = sig.tryVecMemorizedRegexp(ctx, []*funcParam{
{defaultStrVal: "x"},
{defaultStrVal: "ddd"},
{defaultStrVal: ""},
}, 2, 1)
require.Equal(t, "ddd", reg.String())
require.True(t, ok)
require.NoError(t, err)
reg2, ok, err = sig.tryVecMemorizedRegexp(ctx, []*funcParam{
{defaultStrVal: "x"},
{defaultStrVal: "eee"},
{defaultStrVal: ""},
}, 2, 1)
require.Same(t, reg, reg2)
require.Equal(t, "ddd", reg2.String())
require.True(t, ok)
require.NoError(t, err)
}