[improvment](function) add new function substring_index (#15024)
This commit is contained in:
@ -685,6 +685,7 @@ void register_function_string(SimpleFunctionFactory& factory) {
|
||||
factory.register_function<FunctionSplitPart>();
|
||||
factory.register_function<FunctionSplitByString>();
|
||||
factory.register_function<FunctionStringMd5AndSM3<MD5Sum>>();
|
||||
factory.register_function<FunctionSubstringIndex>();
|
||||
factory.register_function<FunctionExtractURLParameter>();
|
||||
factory.register_function<FunctionStringParseUrl>();
|
||||
factory.register_function<FunctionMoneyFormat<MoneyFormatDoubleImpl>>();
|
||||
|
||||
@ -1407,6 +1407,179 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
class FunctionSubstringIndex : public IFunction {
|
||||
public:
|
||||
static constexpr auto name = "substring_index";
|
||||
static FunctionPtr create() { return std::make_shared<FunctionSubstringIndex>(); }
|
||||
String get_name() const override { return name; }
|
||||
size_t get_number_of_arguments() const override { return 3; }
|
||||
|
||||
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
|
||||
return make_nullable(std::make_shared<DataTypeString>());
|
||||
}
|
||||
|
||||
bool use_default_implementation_for_nulls() const override { return true; }
|
||||
bool use_default_implementation_for_constants() const override { return false; }
|
||||
|
||||
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
|
||||
size_t result, size_t input_rows_count) override {
|
||||
DCHECK_EQ(arguments.size(), 3);
|
||||
|
||||
auto null_map = ColumnUInt8::create(input_rows_count, 0);
|
||||
// Create a zero column to simply implement
|
||||
auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
|
||||
auto res = ColumnString::create();
|
||||
|
||||
auto& res_offsets = res->get_offsets();
|
||||
auto& res_chars = res->get_chars();
|
||||
res_offsets.resize(input_rows_count);
|
||||
|
||||
ColumnPtr content_column =
|
||||
block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
|
||||
|
||||
if (auto* nullable = check_and_get_column<const ColumnNullable>(*content_column)) {
|
||||
// Danger: Here must dispose the null map data first! Because
|
||||
// argument_columns[0]=nullable->get_nested_column_ptr(); will release the mem
|
||||
// of column nullable mem of null map
|
||||
VectorizedUtils::update_null_map(null_map->get_data(), nullable->get_null_map_data());
|
||||
content_column = nullable->get_nested_column_ptr();
|
||||
}
|
||||
|
||||
for (size_t i = 1; i <= 2; i++) {
|
||||
ColumnPtr columnPtr = remove_nullable(block.get_by_position(arguments[i]).column);
|
||||
|
||||
if (!is_column_const(*columnPtr)) {
|
||||
return Status::RuntimeError("Argument at index {} for function {} must be constant",
|
||||
i + 1, get_name());
|
||||
}
|
||||
}
|
||||
|
||||
auto str_col = assert_cast<const ColumnString*>(content_column.get());
|
||||
|
||||
const IColumn& delimiter_col = *block.get_by_position(arguments[1]).column;
|
||||
const auto* delimiter_const = typeid_cast<const ColumnConst*>(&delimiter_col);
|
||||
auto delimiter = delimiter_const->get_field().get<String>();
|
||||
int32_t delimiter_size = delimiter.size();
|
||||
|
||||
const IColumn& part_num_col = *block.get_by_position(arguments[2]).column;
|
||||
const auto* part_num_col_const = typeid_cast<const ColumnConst*>(&part_num_col);
|
||||
auto part_number = part_num_col_const->get_field().get<Int32>();
|
||||
|
||||
if (part_number == 0 || delimiter_size == 0) {
|
||||
for (size_t i = 0; i < input_rows_count; ++i) {
|
||||
StringOP::push_empty_string(i, res_chars, res_offsets);
|
||||
}
|
||||
} else if (part_number > 0) {
|
||||
if (delimiter_size == 1) {
|
||||
// If delimiter is a char, use memchr to split
|
||||
for (size_t i = 0; i < input_rows_count; ++i) {
|
||||
auto str = str_col->get_data_at(i);
|
||||
int32_t offset = -1;
|
||||
int32_t num = 0;
|
||||
while (num < part_number) {
|
||||
size_t n = str.size - offset - 1;
|
||||
const char* pos = reinterpret_cast<const char*>(
|
||||
memchr(str.data + offset + 1, delimiter[0], n));
|
||||
if (pos != nullptr) {
|
||||
offset = pos - str.data;
|
||||
num++;
|
||||
} else {
|
||||
offset = str.size;
|
||||
num = (num == 0) ? 0 : num + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (num == part_number) {
|
||||
StringOP::push_value_string(
|
||||
std::string_view {reinterpret_cast<const char*>(str.data),
|
||||
(size_t)offset},
|
||||
i, res_chars, res_offsets);
|
||||
} else {
|
||||
StringOP::push_value_string(std::string_view(str.data, str.size), i,
|
||||
res_chars, res_offsets);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If delimiter is a string, use memmem to split
|
||||
for (size_t i = 0; i < input_rows_count; ++i) {
|
||||
auto str = str_col->get_data_at(i);
|
||||
int32_t offset = -delimiter_size;
|
||||
int32_t num = 0;
|
||||
while (num < part_number) {
|
||||
size_t n = str.size - offset - delimiter_size;
|
||||
char* pos = reinterpret_cast<char*>(
|
||||
memmem(str.data + offset + delimiter_size, n, delimiter.c_str(),
|
||||
delimiter_size));
|
||||
if (pos != nullptr) {
|
||||
offset = pos - str.data;
|
||||
num++;
|
||||
} else {
|
||||
offset = str.size;
|
||||
num = (num == 0) ? 0 : num + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (num == part_number) {
|
||||
StringOP::push_value_string(
|
||||
std::string_view {reinterpret_cast<const char*>(str.data),
|
||||
(size_t)offset},
|
||||
i, res_chars, res_offsets);
|
||||
} else {
|
||||
StringOP::push_value_string(std::string_view(str.data, str.size), i,
|
||||
res_chars, res_offsets);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// if part_number is negative
|
||||
part_number = -part_number;
|
||||
for (size_t i = 0; i < input_rows_count; ++i) {
|
||||
auto str = str_col->get_data_at(i);
|
||||
auto str_str = str.to_string();
|
||||
int32_t offset = str.size;
|
||||
int32_t pre_offset = offset;
|
||||
int32_t num = 0;
|
||||
auto substr = str_str;
|
||||
while (num <= part_number && offset >= 0) {
|
||||
offset = (int)substr.rfind(delimiter, offset);
|
||||
if (offset != -1) {
|
||||
if (++num == part_number) {
|
||||
break;
|
||||
}
|
||||
pre_offset = offset;
|
||||
offset = offset - 1;
|
||||
substr = str_str.substr(0, pre_offset);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
num = (offset == -1 && num != 0) ? num + 1 : num;
|
||||
|
||||
if (num == part_number) {
|
||||
if (offset == -1) {
|
||||
StringOP::push_value_string(std::string_view(str.data, str.size), i,
|
||||
res_chars, res_offsets);
|
||||
} else {
|
||||
StringOP::push_value_string(
|
||||
std::string_view {str.data + offset + delimiter_size,
|
||||
str.size - offset - delimiter_size},
|
||||
i, res_chars, res_offsets);
|
||||
}
|
||||
} else {
|
||||
StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars,
|
||||
res_offsets);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
block.get_by_position(result).column =
|
||||
ColumnNullable::create(std::move(res), std::move(null_map));
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
class FunctionSplitByString : public IFunction {
|
||||
public:
|
||||
static constexpr auto name = "split_by_string";
|
||||
|
||||
@ -0,0 +1,92 @@
|
||||
---
|
||||
{
|
||||
"title": "substring_index",
|
||||
"language": "en"
|
||||
}
|
||||
---
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
## substring_index
|
||||
|
||||
### Name
|
||||
|
||||
<version since="1.2">
|
||||
|
||||
SUBSTRING_INDEX
|
||||
|
||||
</version>
|
||||
|
||||
### description
|
||||
|
||||
#### Syntax
|
||||
|
||||
`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)`
|
||||
|
||||
Split `content` to two parts at position where the `field`s of `delimiter` stays, return one of them according to below rules:
|
||||
if `field` is positive, return the left part;
|
||||
else if `field` is negative, return the right part;
|
||||
if `field` is zero, return an empty string when `content` is not null, else will return null.
|
||||
|
||||
- `delimiter` is case sensitive and multi-byte safe.
|
||||
- `delimiter` and `field` parameter should be constant.
|
||||
|
||||
|
||||
### example
|
||||
|
||||
```
|
||||
mysql> select substring_index("hello world", " ", 1);
|
||||
+----------------------------------------+
|
||||
| substring_index("hello world", " ", 1) |
|
||||
+----------------------------------------+
|
||||
| hello |
|
||||
+----------------------------------------+
|
||||
mysql> select substring_index("hello world", " ", 2);
|
||||
+----------------------------------------+
|
||||
| substring_index("hello world", " ", 2) |
|
||||
+----------------------------------------+
|
||||
| hello world |
|
||||
+----------------------------------------+
|
||||
mysql> select substring_index("hello world", " ", -1);
|
||||
+-----------------------------------------+
|
||||
| substring_index("hello world", " ", -1) |
|
||||
+-----------------------------------------+
|
||||
| world |
|
||||
+-----------------------------------------+
|
||||
mysql> select substring_index("hello world", " ", -2);
|
||||
+-----------------------------------------+
|
||||
| substring_index("hello world", " ", -2) |
|
||||
+-----------------------------------------+
|
||||
| hello world |
|
||||
+-----------------------------------------+
|
||||
mysql> select substring_index("hello world", " ", -3);
|
||||
+-----------------------------------------+
|
||||
| substring_index("hello world", " ", -3) |
|
||||
+-----------------------------------------+
|
||||
| hello world |
|
||||
+-----------------------------------------+
|
||||
mysql> select substring_index("hello world", " ", 0);
|
||||
+----------------------------------------+
|
||||
| substring_index("hello world", " ", 0) |
|
||||
+----------------------------------------+
|
||||
| |
|
||||
+----------------------------------------+
|
||||
```
|
||||
### keywords
|
||||
|
||||
SUBSTRING_INDEX, SUBSTRING
|
||||
@ -410,6 +410,7 @@
|
||||
"sql-manual/sql-functions/string-functions/strleft",
|
||||
"sql-manual/sql-functions/string-functions/strright",
|
||||
"sql-manual/sql-functions/string-functions/split_part",
|
||||
"sql-manual/sql-functions/string-functions/substring_index",
|
||||
"sql-manual/sql-functions/string-functions/money_format",
|
||||
"sql-manual/sql-functions/string-functions/parse_url",
|
||||
"sql-manual/sql-functions/string-functions/convert_to",
|
||||
|
||||
@ -0,0 +1,91 @@
|
||||
---
|
||||
{
|
||||
"title": "substring_index",
|
||||
"language": "zh-CN"
|
||||
}
|
||||
---
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
## substring_index
|
||||
|
||||
### Name
|
||||
|
||||
<version since="1.2">
|
||||
|
||||
SUBSTRING_INDEX
|
||||
|
||||
</version>
|
||||
|
||||
### description
|
||||
|
||||
#### Syntax
|
||||
|
||||
`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)`
|
||||
|
||||
返回 content 的子字符串,在 delimiter 出现 field 次的位置按如下规则截取:
|
||||
如果 field > 0,则从左边算起,返回截取位置前的子串;
|
||||
如果 field < 0,则从右边算起,返回截取位置后的子串;
|
||||
如果 field = 0,返回一个空串(`content` 不为null), 或者Null (`content` = null)。
|
||||
|
||||
- delimiter 大小写敏感,且是多字节安全的。
|
||||
- `delimiter` 和 `field` 参数需要是常量, 不支持变量。
|
||||
|
||||
### example
|
||||
|
||||
```
|
||||
mysql> select substring_index("hello world", " ", 1);
|
||||
+----------------------------------------+
|
||||
| substring_index("hello world", " ", 1) |
|
||||
+----------------------------------------+
|
||||
| hello |
|
||||
+----------------------------------------+
|
||||
mysql> select substring_index("hello world", " ", 2);
|
||||
+----------------------------------------+
|
||||
| substring_index("hello world", " ", 2) |
|
||||
+----------------------------------------+
|
||||
| hello world |
|
||||
+----------------------------------------+
|
||||
mysql> select substring_index("hello world", " ", -1);
|
||||
+-----------------------------------------+
|
||||
| substring_index("hello world", " ", -1) |
|
||||
+-----------------------------------------+
|
||||
| world |
|
||||
+-----------------------------------------+
|
||||
mysql> select substring_index("hello world", " ", -2);
|
||||
+-----------------------------------------+
|
||||
| substring_index("hello world", " ", -2) |
|
||||
+-----------------------------------------+
|
||||
| hello world |
|
||||
+-----------------------------------------+
|
||||
mysql> select substring_index("hello world", " ", -3);
|
||||
+-----------------------------------------+
|
||||
| substring_index("hello world", " ", -3) |
|
||||
+-----------------------------------------+
|
||||
| hello world |
|
||||
+-----------------------------------------+
|
||||
mysql> select substring_index("hello world", " ", 0);
|
||||
+----------------------------------------+
|
||||
| substring_index("hello world", " ", 0) |
|
||||
+----------------------------------------+
|
||||
| |
|
||||
+----------------------------------------+
|
||||
```
|
||||
### keywords
|
||||
|
||||
SUBSTRING_INDEX, SUBSTRING
|
||||
@ -2382,6 +2382,9 @@ visible_functions = [
|
||||
[['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
|
||||
'_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
|
||||
'', '', 'vec', 'ALWAYS_NULLABLE'],
|
||||
[['substring_index'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
|
||||
'_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
|
||||
'', '', 'vec', 'ALWAYS_NULLABLE'],
|
||||
[['extract_url_parameter'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],'','', '', 'vec', ''],
|
||||
|
||||
[['sub_replace'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],'','', '', 'vec', 'ALWAYS_NULLABLE'],
|
||||
@ -2539,6 +2542,9 @@ visible_functions = [
|
||||
[['split_part'], 'STRING', ['STRING', 'STRING', 'INT'],
|
||||
'_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
|
||||
'', '', 'vec', 'ALWAYS_NULLABLE'],
|
||||
[['substring_index'], 'STRING', ['STRING', 'STRING', 'INT'],
|
||||
'_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
|
||||
'', '', 'vec', 'ALWAYS_NULLABLE'],
|
||||
|
||||
# Utility functions
|
||||
[['convert_to'], 'VARCHAR', ['VARCHAR','VARCHAR'], '','', '', 'vec', ''],
|
||||
|
||||
@ -311,6 +311,45 @@ tNEW-STRorigin str
|
||||
-- !sql --
|
||||
d***is
|
||||
|
||||
-- !sql --
|
||||
hello
|
||||
|
||||
-- !sql --
|
||||
hello world
|
||||
|
||||
-- !sql --
|
||||
hello world
|
||||
|
||||
-- !sql --
|
||||
world
|
||||
|
||||
-- !sql --
|
||||
hello world
|
||||
|
||||
-- !sql --
|
||||
hello world
|
||||
|
||||
-- !sql --
|
||||
prefix__string2
|
||||
|
||||
-- !sql --
|
||||
prefix_
|
||||
|
||||
-- !sql --
|
||||
prefix_string2
|
||||
|
||||
-- !sql --
|
||||
\N
|
||||
|
||||
-- !sql --
|
||||
\N
|
||||
|
||||
-- !sql --
|
||||
\N
|
||||
|
||||
-- !sql --
|
||||
prefix_string
|
||||
|
||||
-- !sql --
|
||||
\N
|
||||
|
||||
@ -328,4 +367,3 @@ tNEW-STRorigin str
|
||||
|
||||
-- !sql --
|
||||
d***is
|
||||
|
||||
|
||||
@ -154,6 +154,20 @@ suite("test_string_function") {
|
||||
qt_sql "select sub_replace(\"this is origin str\",\"NEW-STR\",1);"
|
||||
qt_sql "select sub_replace(\"doris\",\"***\",1,2);"
|
||||
|
||||
qt_sql "select substring_index(\"hello world\", \" \", 1);"
|
||||
qt_sql "select substring_index(\"hello world\", \" \", 2);"
|
||||
qt_sql "select substring_index(\"hello world\", \" \", 3);"
|
||||
qt_sql "select substring_index(\"hello world\", \" \", -1);"
|
||||
qt_sql "select substring_index(\"hello world\", \" \", -2);"
|
||||
qt_sql "select substring_index(\"hello world\", \" \", -3);"
|
||||
qt_sql "select substring_index(\"prefix__string2\", \"__\", 2);"
|
||||
qt_sql "select substring_index(\"prefix__string2\", \"_\", 2);"
|
||||
qt_sql "select substring_index(\"prefix_string2\", \"__\", 1);"
|
||||
qt_sql "select substring_index(null, \"__\", 1);"
|
||||
qt_sql "select substring_index(\"prefix_string\", null, 1);"
|
||||
qt_sql "select substring_index(\"prefix_string\", \"_\", null);"
|
||||
qt_sql "select substring_index(\"prefix_string\", \"__\", -1);"
|
||||
|
||||
sql 'set enable_nereids_planner=true'
|
||||
sql 'set enable_fallback_to_original_planner=false'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user