[improvment](function) add new function substring_index (#15024)

This commit is contained in:
Yulei-Yang
2022-12-15 09:54:34 +08:00
committed by GitHub
parent 46030d786a
commit 21c2e485ae
8 changed files with 417 additions and 1 deletions

View File

@ -685,6 +685,7 @@ void register_function_string(SimpleFunctionFactory& factory) {
factory.register_function<FunctionSplitPart>();
factory.register_function<FunctionSplitByString>();
factory.register_function<FunctionStringMd5AndSM3<MD5Sum>>();
factory.register_function<FunctionSubstringIndex>();
factory.register_function<FunctionExtractURLParameter>();
factory.register_function<FunctionStringParseUrl>();
factory.register_function<FunctionMoneyFormat<MoneyFormatDoubleImpl>>();

View File

@ -1407,6 +1407,179 @@ public:
}
};
class FunctionSubstringIndex : public IFunction {
public:
static constexpr auto name = "substring_index";
static FunctionPtr create() { return std::make_shared<FunctionSubstringIndex>(); }
String get_name() const override { return name; }
size_t get_number_of_arguments() const override { return 3; }
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
return make_nullable(std::make_shared<DataTypeString>());
}
bool use_default_implementation_for_nulls() const override { return true; }
bool use_default_implementation_for_constants() const override { return false; }
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
size_t result, size_t input_rows_count) override {
DCHECK_EQ(arguments.size(), 3);
auto null_map = ColumnUInt8::create(input_rows_count, 0);
// Create a zero column to simply implement
auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
auto res = ColumnString::create();
auto& res_offsets = res->get_offsets();
auto& res_chars = res->get_chars();
res_offsets.resize(input_rows_count);
ColumnPtr content_column =
block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
if (auto* nullable = check_and_get_column<const ColumnNullable>(*content_column)) {
// Danger: Here must dispose the null map data first! Because
// argument_columns[0]=nullable->get_nested_column_ptr(); will release the mem
// of column nullable mem of null map
VectorizedUtils::update_null_map(null_map->get_data(), nullable->get_null_map_data());
content_column = nullable->get_nested_column_ptr();
}
for (size_t i = 1; i <= 2; i++) {
ColumnPtr columnPtr = remove_nullable(block.get_by_position(arguments[i]).column);
if (!is_column_const(*columnPtr)) {
return Status::RuntimeError("Argument at index {} for function {} must be constant",
i + 1, get_name());
}
}
auto str_col = assert_cast<const ColumnString*>(content_column.get());
const IColumn& delimiter_col = *block.get_by_position(arguments[1]).column;
const auto* delimiter_const = typeid_cast<const ColumnConst*>(&delimiter_col);
auto delimiter = delimiter_const->get_field().get<String>();
int32_t delimiter_size = delimiter.size();
const IColumn& part_num_col = *block.get_by_position(arguments[2]).column;
const auto* part_num_col_const = typeid_cast<const ColumnConst*>(&part_num_col);
auto part_number = part_num_col_const->get_field().get<Int32>();
if (part_number == 0 || delimiter_size == 0) {
for (size_t i = 0; i < input_rows_count; ++i) {
StringOP::push_empty_string(i, res_chars, res_offsets);
}
} else if (part_number > 0) {
if (delimiter_size == 1) {
// If delimiter is a char, use memchr to split
for (size_t i = 0; i < input_rows_count; ++i) {
auto str = str_col->get_data_at(i);
int32_t offset = -1;
int32_t num = 0;
while (num < part_number) {
size_t n = str.size - offset - 1;
const char* pos = reinterpret_cast<const char*>(
memchr(str.data + offset + 1, delimiter[0], n));
if (pos != nullptr) {
offset = pos - str.data;
num++;
} else {
offset = str.size;
num = (num == 0) ? 0 : num + 1;
break;
}
}
if (num == part_number) {
StringOP::push_value_string(
std::string_view {reinterpret_cast<const char*>(str.data),
(size_t)offset},
i, res_chars, res_offsets);
} else {
StringOP::push_value_string(std::string_view(str.data, str.size), i,
res_chars, res_offsets);
}
}
} else {
// If delimiter is a string, use memmem to split
for (size_t i = 0; i < input_rows_count; ++i) {
auto str = str_col->get_data_at(i);
int32_t offset = -delimiter_size;
int32_t num = 0;
while (num < part_number) {
size_t n = str.size - offset - delimiter_size;
char* pos = reinterpret_cast<char*>(
memmem(str.data + offset + delimiter_size, n, delimiter.c_str(),
delimiter_size));
if (pos != nullptr) {
offset = pos - str.data;
num++;
} else {
offset = str.size;
num = (num == 0) ? 0 : num + 1;
break;
}
}
if (num == part_number) {
StringOP::push_value_string(
std::string_view {reinterpret_cast<const char*>(str.data),
(size_t)offset},
i, res_chars, res_offsets);
} else {
StringOP::push_value_string(std::string_view(str.data, str.size), i,
res_chars, res_offsets);
}
}
}
} else {
// if part_number is negative
part_number = -part_number;
for (size_t i = 0; i < input_rows_count; ++i) {
auto str = str_col->get_data_at(i);
auto str_str = str.to_string();
int32_t offset = str.size;
int32_t pre_offset = offset;
int32_t num = 0;
auto substr = str_str;
while (num <= part_number && offset >= 0) {
offset = (int)substr.rfind(delimiter, offset);
if (offset != -1) {
if (++num == part_number) {
break;
}
pre_offset = offset;
offset = offset - 1;
substr = str_str.substr(0, pre_offset);
} else {
break;
}
}
num = (offset == -1 && num != 0) ? num + 1 : num;
if (num == part_number) {
if (offset == -1) {
StringOP::push_value_string(std::string_view(str.data, str.size), i,
res_chars, res_offsets);
} else {
StringOP::push_value_string(
std::string_view {str.data + offset + delimiter_size,
str.size - offset - delimiter_size},
i, res_chars, res_offsets);
}
} else {
StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars,
res_offsets);
}
}
}
block.get_by_position(result).column =
ColumnNullable::create(std::move(res), std::move(null_map));
return Status::OK();
}
};
class FunctionSplitByString : public IFunction {
public:
static constexpr auto name = "split_by_string";

View File

@ -0,0 +1,92 @@
---
{
"title": "substring_index",
"language": "en"
}
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
## substring_index
### Name
<version since="1.2">
SUBSTRING_INDEX
</version>
### description
#### Syntax
`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)`
Split `content` to two parts at position where the `field`s of `delimiter` stays, return one of them according to below rules:
if `field` is positive, return the left part;
else if `field` is negative, return the right part;
if `field` is zero, return an empty string when `content` is not null, else will return null.
- `delimiter` is case sensitive and multi-byte safe.
- `delimiter` and `field` parameter should be constant.
### example
```
mysql> select substring_index("hello world", " ", 1);
+----------------------------------------+
| substring_index("hello world", " ", 1) |
+----------------------------------------+
| hello |
+----------------------------------------+
mysql> select substring_index("hello world", " ", 2);
+----------------------------------------+
| substring_index("hello world", " ", 2) |
+----------------------------------------+
| hello world |
+----------------------------------------+
mysql> select substring_index("hello world", " ", -1);
+-----------------------------------------+
| substring_index("hello world", " ", -1) |
+-----------------------------------------+
| world |
+-----------------------------------------+
mysql> select substring_index("hello world", " ", -2);
+-----------------------------------------+
| substring_index("hello world", " ", -2) |
+-----------------------------------------+
| hello world |
+-----------------------------------------+
mysql> select substring_index("hello world", " ", -3);
+-----------------------------------------+
| substring_index("hello world", " ", -3) |
+-----------------------------------------+
| hello world |
+-----------------------------------------+
mysql> select substring_index("hello world", " ", 0);
+----------------------------------------+
| substring_index("hello world", " ", 0) |
+----------------------------------------+
| |
+----------------------------------------+
```
### keywords
SUBSTRING_INDEX, SUBSTRING

View File

@ -410,6 +410,7 @@
"sql-manual/sql-functions/string-functions/strleft",
"sql-manual/sql-functions/string-functions/strright",
"sql-manual/sql-functions/string-functions/split_part",
"sql-manual/sql-functions/string-functions/substring_index",
"sql-manual/sql-functions/string-functions/money_format",
"sql-manual/sql-functions/string-functions/parse_url",
"sql-manual/sql-functions/string-functions/convert_to",

View File

@ -0,0 +1,91 @@
---
{
"title": "substring_index",
"language": "zh-CN"
}
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
## substring_index
### Name
<version since="1.2">
SUBSTRING_INDEX
</version>
### description
#### Syntax
`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)`
返回 content 的子字符串,在 delimiter 出现 field 次的位置按如下规则截取:
如果 field > 0,则从左边算起,返回截取位置前的子串;
如果 field < 0则从右边算起返回截取位置后的子串
如果 field = 0,返回一个空串(`content` 不为null), 或者Null `content` = null)。
- delimiter 大小写敏感且是多字节安全的
- `delimiter` `field` 参数需要是常量, 不支持变量
### example
```
mysql> select substring_index("hello world", " ", 1);
+----------------------------------------+
| substring_index("hello world", " ", 1) |
+----------------------------------------+
| hello |
+----------------------------------------+
mysql> select substring_index("hello world", " ", 2);
+----------------------------------------+
| substring_index("hello world", " ", 2) |
+----------------------------------------+
| hello world |
+----------------------------------------+
mysql> select substring_index("hello world", " ", -1);
+-----------------------------------------+
| substring_index("hello world", " ", -1) |
+-----------------------------------------+
| world |
+-----------------------------------------+
mysql> select substring_index("hello world", " ", -2);
+-----------------------------------------+
| substring_index("hello world", " ", -2) |
+-----------------------------------------+
| hello world |
+-----------------------------------------+
mysql> select substring_index("hello world", " ", -3);
+-----------------------------------------+
| substring_index("hello world", " ", -3) |
+-----------------------------------------+
| hello world |
+-----------------------------------------+
mysql> select substring_index("hello world", " ", 0);
+----------------------------------------+
| substring_index("hello world", " ", 0) |
+----------------------------------------+
| |
+----------------------------------------+
```
### keywords
SUBSTRING_INDEX, SUBSTRING

View File

@ -2382,6 +2382,9 @@ visible_functions = [
[['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
'_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
'', '', 'vec', 'ALWAYS_NULLABLE'],
[['substring_index'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
'_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
'', '', 'vec', 'ALWAYS_NULLABLE'],
[['extract_url_parameter'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],'','', '', 'vec', ''],
[['sub_replace'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],'','', '', 'vec', 'ALWAYS_NULLABLE'],
@ -2539,6 +2542,9 @@ visible_functions = [
[['split_part'], 'STRING', ['STRING', 'STRING', 'INT'],
'_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
'', '', 'vec', 'ALWAYS_NULLABLE'],
[['substring_index'], 'STRING', ['STRING', 'STRING', 'INT'],
'_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
'', '', 'vec', 'ALWAYS_NULLABLE'],
# Utility functions
[['convert_to'], 'VARCHAR', ['VARCHAR','VARCHAR'], '','', '', 'vec', ''],

View File

@ -311,6 +311,45 @@ tNEW-STRorigin str
-- !sql --
d***is
-- !sql --
hello
-- !sql --
hello world
-- !sql --
hello world
-- !sql --
world
-- !sql --
hello world
-- !sql --
hello world
-- !sql --
prefix__string2
-- !sql --
prefix_
-- !sql --
prefix_string2
-- !sql --
\N
-- !sql --
\N
-- !sql --
\N
-- !sql --
prefix_string
-- !sql --
\N
@ -328,4 +367,3 @@ tNEW-STRorigin str
-- !sql --
d***is

View File

@ -154,6 +154,20 @@ suite("test_string_function") {
qt_sql "select sub_replace(\"this is origin str\",\"NEW-STR\",1);"
qt_sql "select sub_replace(\"doris\",\"***\",1,2);"
qt_sql "select substring_index(\"hello world\", \" \", 1);"
qt_sql "select substring_index(\"hello world\", \" \", 2);"
qt_sql "select substring_index(\"hello world\", \" \", 3);"
qt_sql "select substring_index(\"hello world\", \" \", -1);"
qt_sql "select substring_index(\"hello world\", \" \", -2);"
qt_sql "select substring_index(\"hello world\", \" \", -3);"
qt_sql "select substring_index(\"prefix__string2\", \"__\", 2);"
qt_sql "select substring_index(\"prefix__string2\", \"_\", 2);"
qt_sql "select substring_index(\"prefix_string2\", \"__\", 1);"
qt_sql "select substring_index(null, \"__\", 1);"
qt_sql "select substring_index(\"prefix_string\", null, 1);"
qt_sql "select substring_index(\"prefix_string\", \"_\", null);"
qt_sql "select substring_index(\"prefix_string\", \"__\", -1);"
sql 'set enable_nereids_planner=true'
sql 'set enable_fallback_to_original_planner=false'