[cherry-pick](branch-2.1) add function regexp_extract_or_null (#39561)
# Proposed changes

Cherry-pick of https://github.com/apache/doris/pull/38296 onto branch-2.1.
This commit is contained in:
@ -184,8 +184,9 @@ struct RegexpReplaceOneImpl {
|
||||
}
|
||||
};
|
||||
|
||||
template <bool ReturnNull>
|
||||
struct RegexpExtractImpl {
|
||||
static constexpr auto name = "regexp_extract";
|
||||
static constexpr auto name = ReturnNull ? "regexp_extract_or_null" : "regexp_extract";
|
||||
// 3 args
|
||||
static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[],
|
||||
size_t input_rows_count, ColumnString::Chars& result_data,
|
||||
@ -201,7 +202,8 @@ struct RegexpExtractImpl {
|
||||
}
|
||||
const auto& index_data = index_col->get_int(i);
|
||||
if (index_data < 0) {
|
||||
StringOP::push_empty_string(i, result_data, result_offset);
|
||||
ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map)
|
||||
: StringOP::push_empty_string(i, result_data, result_offset);
|
||||
continue;
|
||||
}
|
||||
_execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data,
|
||||
@ -220,7 +222,8 @@ struct RegexpExtractImpl {
|
||||
const auto& index_data = index_col->get_int(0);
|
||||
if (index_data < 0) {
|
||||
for (size_t i = 0; i < input_rows_count; ++i) {
|
||||
StringOP::push_empty_string(i, result_data, result_offset);
|
||||
ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map)
|
||||
: StringOP::push_empty_string(i, result_data, result_offset);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@ -260,7 +263,8 @@ struct RegexpExtractImpl {
|
||||
|
||||
int max_matches = 1 + re->NumberOfCapturingGroups();
|
||||
if (index_data >= max_matches) {
|
||||
StringOP::push_empty_string(index_now, result_data, result_offset);
|
||||
ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map)
|
||||
: StringOP::push_empty_string(index_now, result_data, result_offset);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -268,7 +272,8 @@ struct RegexpExtractImpl {
|
||||
bool success =
|
||||
re->Match(str_sp, 0, str.size, re2::RE2::UNANCHORED, &matches[0], max_matches);
|
||||
if (!success) {
|
||||
StringOP::push_empty_string(index_now, result_data, result_offset);
|
||||
ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map)
|
||||
: StringOP::push_empty_string(index_now, result_data, result_offset);
|
||||
return;
|
||||
}
|
||||
const re2::StringPiece& match = matches[index_data];
|
||||
@ -486,7 +491,8 @@ public:
|
||||
|
||||
// Registers the regexp string functions with the given function factory.
//
// RegexpExtractImpl is a class template parameterised on `ReturnNull`, so it
// cannot be registered without a template argument. Each behaviour is
// registered through its own explicit instantiation:
//   - RegexpExtractImpl<true>  -> "regexp_extract_or_null"
//   - RegexpExtractImpl<false> -> "regexp_extract"
void register_function_regexp_extract(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionRegexp<RegexpReplaceImpl>>();
    factory.register_function<FunctionRegexp<RegexpExtractImpl<true>>>();
    factory.register_function<FunctionRegexp<RegexpExtractImpl<false>>>();
    factory.register_function<FunctionRegexp<RegexpReplaceOneImpl>>();
    factory.register_function<FunctionRegexp<RegexpExtractAllImpl>>();
}
|
||||
|
||||
@ -155,6 +155,45 @@ TEST(FunctionLikeTest, regexp_extract) {
|
||||
}
|
||||
}
|
||||
|
||||
// Checks "regexp_extract_or_null" with a constant pattern column.
//
// Besides the successful extractions, the data set covers the cases in which
// this function yields NULL: no match, a group index that is not smaller than
// 1 + the number of capturing groups, a negative group index, and NULL inputs.
TEST(FunctionLikeTest, regexp_extract_or_null) {
    std::string func_name = "regexp_extract_or_null";

    DataSet data_set = {{{std::string("x=a3&x=18abc&x=2&y=3&x=4"),
                          std::string("x=([0-9]+)([a-z]+)"), (int64_t)0},
                         std::string("x=18abc")},
                        {{std::string("x=a3&x=18abc&x=2&y=3&x=4"),
                          std::string("^x=([a-z]+)([0-9]+)"), (int64_t)0},
                         std::string("x=a3")},
                        {{std::string("x=a3&x=18abc&x=2&y=3&x=4"),
                          std::string("^x=([a-z]+)([0-9]+)"), (int64_t)1},
                         std::string("a")},
                        {{std::string("http://a.m.baidu.com/i41915173660.htm"),
                          std::string("i([0-9]+)"), (int64_t)0},
                         std::string("i41915173660")},
                        {{std::string("http://a.m.baidu.com/i41915173660.htm"),
                          std::string("i([0-9]+)"), (int64_t)1},
                         std::string("41915173660")},

                        {{std::string("hitdecisiondlist"), std::string("(i)(.*?)(e)"), (int64_t)0},
                         std::string("itde")},
                        {{std::string("hitdecisiondlist"), std::string("(i)(.*?)(e)"), (int64_t)1},
                         std::string("i")},
                        {{std::string("hitdecisiondlist"), std::string("(i)(.*?)(e)"), (int64_t)2},
                         std::string("td")},

                        // pattern does not match -> NULL
                        {{std::string("hitdecisiondlist"), std::string("xyz([0-9]+)"), (int64_t)0},
                         Null()},
                        // group index >= 1 + number of capturing groups -> NULL
                        {{std::string("hitdecisiondlist"), std::string("(i)(.*?)(e)"), (int64_t)4},
                         Null()},
                        // negative group index -> NULL
                        {{std::string("hitdecisiondlist"), std::string("(i)(.*?)(e)"), (int64_t)-1},
                         Null()},

                        // null
                        {{std::string("abc"), Null(), (int64_t)0}, Null()},
                        {{Null(), std::string("i([0-9]+)"), (int64_t)0}, Null()}};

    // pattern is constant value
    InputTypeSet const_pattern_input_types = {TypeIndex::String, Consted {TypeIndex::String},
                                              TypeIndex::Int64};
    // Each row is checked on its own because the pattern column is constant.
    for (const auto& line : data_set) {
        DataSet const_pattern_dataset = {line};
        static_cast<void>(check_function<DataTypeString, true>(func_name, const_pattern_input_types,
                                                               const_pattern_dataset));
    }
}
|
||||
|
||||
TEST(FunctionLikeTest, regexp_extract_all) {
|
||||
std::string func_name = "regexp_extract_all";
|
||||
|
||||
|
||||
@ -330,6 +330,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Random;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RandomBytes;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.Repeat;
|
||||
@ -787,6 +788,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
|
||||
scalar(Regexp.class, "regexp"),
|
||||
scalar(RegexpExtract.class, "regexp_extract"),
|
||||
scalar(RegexpExtractAll.class, "regexp_extract_all"),
|
||||
scalar(RegexpExtractOrNull.class, "regexp_extract_or_null"),
|
||||
scalar(RegexpReplace.class, "regexp_replace"),
|
||||
scalar(RegexpReplaceOne.class, "regexp_replace_one"),
|
||||
scalar(Repeat.class, "repeat"),
|
||||
|
||||
@ -0,0 +1,73 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
package org.apache.doris.nereids.trees.expressions.functions.scalar;
|
||||
|
||||
import org.apache.doris.catalog.FunctionSignature;
|
||||
import org.apache.doris.nereids.trees.expressions.Expression;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.AlwaysNullable;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
|
||||
import org.apache.doris.nereids.trees.expressions.shape.TernaryExpression;
|
||||
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
|
||||
import org.apache.doris.nereids.types.BigIntType;
|
||||
import org.apache.doris.nereids.types.StringType;
|
||||
import org.apache.doris.nereids.types.VarcharType;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* ScalarFunction 'regexp_extract_or_null'. This class is generated by GenerateFunction.
|
||||
*/
|
||||
public class RegexpExtractOrNull extends ScalarFunction
|
||||
implements TernaryExpression, ExplicitlyCastableSignature, AlwaysNullable {
|
||||
|
||||
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
|
||||
FunctionSignature.ret(VarcharType.SYSTEM_DEFAULT)
|
||||
.args(VarcharType.SYSTEM_DEFAULT, VarcharType.SYSTEM_DEFAULT, BigIntType.INSTANCE),
|
||||
FunctionSignature.ret(StringType.INSTANCE)
|
||||
.args(StringType.INSTANCE, StringType.INSTANCE, BigIntType.INSTANCE)
|
||||
);
|
||||
|
||||
/**
|
||||
* constructor with 3 arguments.
|
||||
*/
|
||||
public RegexpExtractOrNull(Expression arg0, Expression arg1, Expression arg2) {
|
||||
super("regexp_extract_or_null", arg0, arg1, arg2);
|
||||
}
|
||||
|
||||
/**
|
||||
* withChildren.
|
||||
*/
|
||||
@Override
|
||||
public RegexpExtractOrNull withChildren(List<Expression> children) {
|
||||
Preconditions.checkArgument(children.size() == 3);
|
||||
return new RegexpExtractOrNull(children.get(0), children.get(1), children.get(2));
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<FunctionSignature> getSignatures() {
|
||||
return SIGNATURES;
|
||||
}
|
||||
|
||||
@Override
|
||||
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
|
||||
return visitor.visitRegexpExtractOrNull(this, context);
|
||||
}
|
||||
}
|
||||
@ -329,6 +329,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Random;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RandomBytes;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.Repeat;
|
||||
@ -1669,6 +1670,10 @@ public interface ScalarFunctionVisitor<R, C> {
|
||||
return visitScalarFunction(regexpExtractAll, context);
|
||||
}
|
||||
|
||||
/**
 * Visit hook for RegexpExtractOrNull; the default implementation forwards the
 * node and context unchanged to visitScalarFunction.
 */
default R visitRegexpExtractOrNull(RegexpExtractOrNull regexpExtractOrNull, C context) {
|
||||
return visitScalarFunction(regexpExtractOrNull, context);
|
||||
}
|
||||
|
||||
/**
 * Visit hook for RegexpReplace; the default implementation forwards the node
 * and context unchanged to visitScalarFunction.
 */
default R visitRegexpReplace(RegexpReplace regexpReplace, C context) {
|
||||
return visitScalarFunction(regexpReplace, context);
|
||||
}
|
||||
|
||||
@ -49,6 +49,18 @@ b
|
||||
-- !sql --
|
||||
d
|
||||
|
||||
-- !sql --
|
||||
|
||||
|
||||
-- !sql --
|
||||
b
|
||||
|
||||
-- !sql --
|
||||
d
|
||||
|
||||
-- !sql --
|
||||
\N
|
||||
|
||||
-- !sql --
|
||||
['18','17']
|
||||
|
||||
|
||||
@ -51,6 +51,11 @@ suite("test_string_function_regexp") {
|
||||
|
||||
qt_sql "SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 1);"
|
||||
qt_sql "SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 2);"
|
||||
qt_sql "SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 3);"
|
||||
|
||||
qt_sql "SELECT regexp_extract_or_null('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 1);"
|
||||
qt_sql "SELECT regexp_extract_or_null('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 2);"
|
||||
qt_sql "SELECT regexp_extract_or_null('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 3);"
|
||||
|
||||
qt_sql "SELECT regexp_extract_all('x=a3&x=18abc&x=2&y=3&x=4&x=17bcd', 'x=([0-9]+)([a-z]+)');"
|
||||
qt_sql "SELECT regexp_extract_all('http://a.m.baidu.com/i41915i73660.htm', 'i([0-9]+)');"
|
||||
|
||||
Reference in New Issue
Block a user