diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index 5d5d45409b..2a6a8af38c 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -683,6 +683,7 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_function(); factory.register_function(); factory.register_function(); + factory.register_function(); factory.register_function>(); factory.register_function(); factory.register_function(); diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 94fd467d81..c86b3f0f45 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -55,6 +55,7 @@ #include "vec/columns/columns_number.h" #include "vec/common/assert_cast.h" #include "vec/common/string_ref.h" +#include "vec/data_types/data_type_array.h" #include "vec/data_types/data_type_decimal.h" #include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_number.h" @@ -1405,6 +1406,127 @@ public: } }; +class FunctionSplitByString : public IFunction { +public: + static constexpr auto name = "split_by_string"; + + static FunctionPtr create() { return std::make_shared(); } + using NullMapType = PaddedPODArray; + + String get_name() const override { return name; } + + bool is_variadic() const override { return false; } + + size_t get_number_of_arguments() const override { return 2; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + DCHECK(is_string(arguments[0])) + << "first argument for function: " << name << " should be string" + << " and arguments[0] is " << arguments[0]->get_name(); + DCHECK(is_string(arguments[1])) + << "second argument for function: " << name << " should be string" + << " and arguments[1] is " << arguments[1]->get_name(); + return std::make_shared(make_nullable(arguments[0])); + } + + Status execute_impl(FunctionContext* /*context*/, Block& 
block, const ColumnNumbers& arguments, + size_t result, size_t /*input_rows_count*/) override { + DCHECK_EQ(arguments.size(), 2); + + ColumnPtr src_column = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + ColumnPtr delimiter_column = + block.get_by_position(arguments[1]).column->convert_to_full_column_if_const(); + + DataTypePtr src_column_type = block.get_by_position(arguments[0]).type; + auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(), + ColumnArray::ColumnOffsets::create()); + + IColumn* dest_nested_column = &dest_column_ptr->get_data(); + auto& dest_offsets = dest_column_ptr->get_offsets(); + DCHECK(dest_nested_column != nullptr); + dest_nested_column->reserve(0); + dest_offsets.reserve(0); + + NullMapType* dest_nested_null_map = nullptr; + ColumnNullable* dest_nullable_col = reinterpret_cast(dest_nested_column); + dest_nested_column = dest_nullable_col->get_nested_column_ptr(); + dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data(); + + _execute(*src_column, *delimiter_column, *dest_nested_column, dest_offsets, + dest_nested_null_map); + block.replace_by_position(result, std::move(dest_column_ptr)); + return Status::OK(); + } + +private: + void _execute(const IColumn& src_column, const IColumn& delimiter_column, + IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map) { + ColumnString& dest_column_string = reinterpret_cast(dest_nested_column); + ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); + ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); + column_string_chars.reserve(0); + + ColumnArray::Offset64 string_pos = 0; + ColumnArray::Offset64 dest_pos = 0; + const ColumnString* src_column_string = reinterpret_cast(&src_column); + ColumnArray::Offset64 src_offsets_size = src_column_string->get_offsets().size(); + + for (size_t i = 0; i < src_offsets_size; 
i++) { + const StringRef delimiter_ref = delimiter_column.get_data_at(i); + const StringRef str_ref = src_column_string->get_data_at(i); + + if (str_ref.size == 0) { + dest_offsets.push_back(dest_pos); + continue; + } + if (delimiter_ref.size == 0) { + for (size_t str_pos = 0; str_pos < str_ref.size;) { + const size_t str_offset = str_pos; + const size_t old_size = column_string_chars.size(); + str_pos++; + const size_t new_size = old_size + 1; + column_string_chars.resize(new_size); + memcpy(column_string_chars.data() + old_size, str_ref.data + str_offset, 1); + (*dest_nested_null_map).push_back(false); + string_pos++; + dest_pos++; + column_string_offsets.push_back(string_pos); + } + } else { + for (size_t str_pos = 0; str_pos <= str_ref.size;) { + const size_t str_offset = str_pos; + const size_t old_size = column_string_chars.size(); + const size_t split_part_size = split_str(str_pos, str_ref, delimiter_ref); + str_pos += delimiter_ref.size; + const size_t new_size = old_size + split_part_size; + column_string_chars.resize(new_size); + if (split_part_size > 0) { + memcpy(column_string_chars.data() + old_size, str_ref.data + str_offset, + split_part_size); + } + (*dest_nested_null_map).push_back(false); + string_pos += split_part_size; + dest_pos++; + column_string_offsets.push_back(string_pos); + } + } + dest_offsets.push_back(dest_pos); + } + } + + size_t split_str(size_t& pos, const StringRef str_ref, StringRef delimiter_ref) { + size_t old_size = pos; + size_t str_size = str_ref.size; + while (pos < str_size && + memcmp(str_ref.data + pos, delimiter_ref.data, delimiter_ref.size)) { + pos++; + } + return pos - old_size; + } +}; + struct SM3Sum { static constexpr auto name = "sm3sum"; using ObjectData = SM3Digest; diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index c499718ff1..0ce215a650 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ 
-23,6 +23,7 @@ #include "function_test_util.h" #include "util/encryption_util.h" #include "vec/core/types.h" +#include "vec/data_types/data_type_array.h" #include "vec/data_types/data_type_string.h" namespace doris::vectorized { diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/split_by_string.md b/docs/en/docs/sql-manual/sql-functions/string-functions/split_by_string.md new file mode 100644 index 0000000000..9c473c2dc9 --- /dev/null +++ b/docs/en/docs/sql-manual/sql-functions/string-functions/split_by_string.md @@ -0,0 +1,112 @@ +--- +{ + "title": "split_by_string", + "language": "en" +} +--- + + + +## split_by_string + +### description + +#### Syntax + +``` +split_by_string(s, separator) +``` +Splits a string into substrings separated by a string. The separator is a string that may consist of multiple characters. If the string separator is empty, it will split the string s into an array of single characters. + +#### Arguments +`separator` — The separator. Type: `String` + +`s` — The string to split. Type: `String` + +#### Returned value(s) + +Returns an array of selected substrings. Empty substrings may be selected when: + +A non-empty separator occurs at the beginning or end of the string; + +There are multiple consecutive separators. + +If the original string s is empty, an empty array is returned. 
+ +Type: `Array(String)` + +### notice + +`Only supported in vectorized engine` + +### example + +``` +SELECT split_by_string('1, 2 3, 4,5, abcde', ', '); +select split_by_string('a1b1c1d','1'); ++---------------------------------+ +| split_by_string('a1b1c1d', '1') | ++---------------------------------+ +| ['a', 'b', 'c', 'd'] | ++---------------------------------+ + +select split_by_string(',,a,b,c,',','); ++----------------------------------+ +| split_by_string(',,a,b,c,', ',') | ++----------------------------------+ +| ['', '', 'a', 'b', 'c', ''] | ++----------------------------------+ + +SELECT split_by_string(NULL,','); ++----------------------------+ +| split_by_string(NULL, ',') | ++----------------------------+ +| NULL | ++----------------------------+ + +select split_by_string('a,b,c,abcde',','); ++-------------------------------------+ +| split_by_string('a,b,c,abcde', ',') | ++-------------------------------------+ +| ['a', 'b', 'c', 'abcde'] | ++-------------------------------------+ + +select split_by_string('1,,2,3,,4,5,,abcde', ',,'); ++---------------------------------------------+ +| split_by_string('1,,2,3,,4,5,,abcde', ',,') | ++---------------------------------------------+ +| ['1', '2,3', '4,5', 'abcde'] | ++---------------------------------------------+ + +select split_by_string(',,,,',',,'); ++-------------------------------+ +| split_by_string(',,,,', ',,') | ++-------------------------------+ +| ['', '', ''] | ++-------------------------------+ + +select split_by_string(',,a,,b,,c,,',',,'); ++--------------------------------------+ +| split_by_string(',,a,,b,,c,,', ',,') | ++--------------------------------------+ +| ['', 'a', 'b', 'c', ''] | ++--------------------------------------+ +``` +### keywords + +SPLIT_BY_STRING,SPLIT \ No newline at end of file diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/split_by_string.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/split_by_string.md new file mode 
100644 index 0000000000..033388160b --- /dev/null +++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/split_by_string.md @@ -0,0 +1,112 @@ +--- +{ + "title": "split_by_string", + "language": "zh-CN" +} +--- + + + +## split_by_string + +### description + +#### Syntax + +``` +split_by_string(s, separator) +``` +将字符串拆分为由字符串分隔的子字符串。它使用多个字符的常量字符串分隔符作为分隔符。如果字符串分隔符为空,它将字符串拆分为单个字符数组。 + +#### Arguments + +`separator` — 分隔符是一个字符串,是用来分割的标志字符. 类型: `String` + +`s` — 需要分割的字符串. 类型: `String` + +#### Returned value(s) + +返回一个包含子字符串的数组. 以下情况会返回空的子字符串: + +需要分割的字符串的首尾是分隔符; + +多个分隔符连续出现; + +需要分割的字符串为空,而分隔符不为空. + +Type: `Array(String)` + +### notice + +`Only supported in vectorized engine` + +### example + +``` +select split_by_string('a1b1c1d','1'); ++---------------------------------+ +| split_by_string('a1b1c1d', '1') | ++---------------------------------+ +| ['a', 'b', 'c', 'd'] | ++---------------------------------+ + +select split_by_string(',,a,b,c,',','); ++----------------------------------+ +| split_by_string(',,a,b,c,', ',') | ++----------------------------------+ +| ['', '', 'a', 'b', 'c', ''] | ++----------------------------------+ + +SELECT split_by_string(NULL,','); ++----------------------------+ +| split_by_string(NULL, ',') | ++----------------------------+ +| NULL | ++----------------------------+ + +select split_by_string('a,b,c,abcde',','); ++-------------------------------------+ +| split_by_string('a,b,c,abcde', ',') | ++-------------------------------------+ +| ['a', 'b', 'c', 'abcde'] | ++-------------------------------------+ + +select split_by_string('1,,2,3,,4,5,,abcde', ',,'); ++---------------------------------------------+ +| split_by_string('1,,2,3,,4,5,,abcde', ',,') | ++---------------------------------------------+ +| ['1', '2,3', '4,5', 'abcde'] | ++---------------------------------------------+ + +select split_by_string(',,,,',',,'); ++-------------------------------+ +| split_by_string(',,,,', ',,') | ++-------------------------------+ 
+| ['', '', ''] | ++-------------------------------+ + +select split_by_string(',,a,,b,,c,,',',,'); ++--------------------------------------+ +| split_by_string(',,a,,b,,c,,', ',,') | ++--------------------------------------+ +| ['', 'a', 'b', 'c', ''] | ++--------------------------------------+ +``` +### keywords + +SPLIT_BY_STRING,SPLIT \ No newline at end of file diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index d34b0ba34b..be7d70a6b9 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -2321,6 +2321,10 @@ visible_functions = [ [['money_format'], 'VARCHAR', ['DECIMAL128'], '_ZN5doris15StringFunctions12money_formatEPN9doris_udf15FunctionContextERKNS1_12DecimalV2ValE', '', '', 'vec', ''], + [['split_by_char'],'ARRAY_VARCHAR',['STRING','STRING'], + '', '', '', 'vec', ''], + [['split_by_string'],'ARRAY_VARCHAR',['STRING','STRING'], + '', '', '', 'vec', ''], [['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'], '_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE', '', '', 'vec', 'ALWAYS_NULLABLE'], diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out new file mode 100644 index 0000000000..fd69d943f3 --- /dev/null +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out @@ -0,0 +1,89 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !sql -- +['a', 'b', 'c', 'd', 'e'] + +-- !sql -- +['1', '2', '5', '5', '3'] + +-- !sql -- +[] + +-- !sql -- +[] + +-- !sql -- +[] + +-- !sql -- +['a', 'b', 'c', 'd'] + +-- !sql -- +['', '', '', ''] + +-- !sql -- +['a', 'b', 'c', 'abcde'] + +-- !sql -- +['', '', 'a', 'b', 'c', ''] + +-- !sql -- +['null'] + +-- !sql -- +['1', '2,3', '4,5', 'abcde'] + +-- !sql -- +['a', 'b', 'c', 'd', 'e'] + +-- !sql -- +[] + +-- !sql -- +[] + +-- !sql -- +[] + +-- !sql -- +['1', '2,3', '', '', '4,5, abcde'] + +-- !sql -- +['', '', ''] + +-- !sql -- +['a', 'b', 'c'] + +-- !sql -- +['a', 'b', 'c', ''] + +-- !sql -- +['', 'a', 'b', 'c', ''] + +-- !sql -- +['null'] + +-- !sql -- +1 abcde ['a', 'b', 'c', 'd', 'e'] +2 12553 ['1', '2', '5', '5', '3'] +3 [] +4 , [] +5 a [] +6 a1b1c1d 1 ['a', 'b', 'c', 'd'] +7 ,,, , ['', '', '', ''] +8 a,b,c , ['a', 'b', 'c'] +9 a,b,c, , ['a', 'b', 'c', ''] +10 \N , \N +11 a,b,c,12345, , ['a', 'b', 'c', '12345', ''] + +-- !sql -- +1 1,,2,3,,4,5,,abcde ,, ['1', '2,3', '4,5', 'abcde'] +2 abcde ['a', 'b', 'c', 'd', 'e'] +3 [] +4 , [] +5 a [] +6 1,,2,3,,,,,,4,5,,abcde ,, ['1', '2,3', '', '', '4,5', 'abcde'] +7 ,,, , ['', '', '', ''] +8 a,b,c , ['a', 'b', 'c'] +9 a,b,c, , ['a', 'b', 'c', ''] +10 \N , \N + diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy new file mode 100644 index 0000000000..343ebb0634 --- /dev/null +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_split_by_string") { + sql "set enable_vectorized_engine = true;" + + // split by char + qt_sql "select split_by_string('abcde','');" + qt_sql "select split_by_string('12553','');" + qt_sql "select split_by_string('','');" + qt_sql "select split_by_string('',',');" + qt_sql "select split_by_string('','a');" + + qt_sql "select split_by_string('a1b1c1d','1');" + qt_sql "select split_by_string(',,,',',');" + qt_sql "select split_by_string('a,b,c,abcde',',');" + qt_sql "select split_by_string(',,a,b,c,',',');" + qt_sql "select split_by_string('null',',');" + + // split by string + qt_sql "select split_by_string('1,,2,3,,4,5,,abcde', ',,');" + qt_sql "select split_by_string('abcde','');" + qt_sql "select split_by_string('','');" + qt_sql "select split_by_string('',',');" + qt_sql "select split_by_string('','a');" + + qt_sql "select split_by_string('1,,2,3,,,,,,4,5, abcde', ',,');" + qt_sql "select split_by_string(',,,,',',,');" + qt_sql "select split_by_string('a,,b,,c',',,');" + qt_sql "select split_by_string('a,,b,,c,,',',,');" + qt_sql "select split_by_string(',,a,,b,,c,,',',,');" + qt_sql "select split_by_string('null',',');" + + def tableName1 = "test_split_by_char" + + sql """DROP TABLE IF EXISTS ${tableName1}""" + sql """ + CREATE TABLE IF NOT EXISTS ${tableName1} ( + `k1` int(11) NULL COMMENT "", + `v1` varchar(20) NULL COMMENT "", + `v2` varchar(1) NOT NULL COMMENT "" 
+ ) ENGINE=OLAP + DUPLICATE KEY(`k1`) + DISTRIBUTED BY HASH(`k1`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ) + """ + sql """ INSERT INTO ${tableName1} VALUES(1, 'abcde', '') """ + sql """ INSERT INTO ${tableName1} VALUES(2, '12553', '') """ + sql """ INSERT INTO ${tableName1} VALUES(3, '', '') """ + sql """ INSERT INTO ${tableName1} VALUES(4, '', ',') """ + sql """ INSERT INTO ${tableName1} VALUES(5, '', 'a') """ + sql """ INSERT INTO ${tableName1} VALUES(6, 'a1b1c1d', '1') """ + sql """ INSERT INTO ${tableName1} VALUES(7, ',,,', ',') """ + sql """ INSERT INTO ${tableName1} VALUES(8, 'a,b,c', ',') """ + sql """ INSERT INTO ${tableName1} VALUES(9, 'a,b,c,', ',') """ + sql """ INSERT INTO ${tableName1} VALUES(10, null, ',') """ + sql """ INSERT INTO ${tableName1} VALUES(11, 'a,b,c,12345,', ',') """ + + qt_sql "SELECT *, split_by_string(v1, v2) FROM ${tableName1} ORDER BY k1" + + def tableName2 = "test_split_by_string" + + sql """DROP TABLE IF EXISTS ${tableName2}""" + sql """ + CREATE TABLE IF NOT EXISTS ${tableName2} ( + `k1` int(11) NULL COMMENT "", + `v1` varchar(50) NULL COMMENT "", + `v2` varchar(10) NOT NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`k1`) + DISTRIBUTED BY HASH(`k1`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ) + """ + sql """ INSERT INTO ${tableName2} VALUES(1, '1,,2,3,,4,5,,abcde', ',,') """ + sql """ INSERT INTO ${tableName2} VALUES(2, 'abcde','') """ + sql """ INSERT INTO ${tableName2} VALUES(3, '', '') """ + sql """ INSERT INTO ${tableName2} VALUES(4, '', ',') """ + sql """ INSERT INTO ${tableName2} VALUES(5, '', 'a') """ + sql """ INSERT INTO ${tableName2} VALUES(6, '1,,2,3,,,,,,4,5,,abcde', ',,') """ + sql """ INSERT INTO ${tableName2} VALUES(7, ',,,', ',') """ + sql """ INSERT INTO ${tableName2} VALUES(8, 'a,b,c', ',') """ + sql """ INSERT INTO ${tableName2} VALUES(9, 'a,b,c,', ',') """ + sql """ INSERT 
INTO ${tableName2} VALUES(10, null, ',') """ + + + qt_sql "SELECT *, split_by_string(v1, v2) FROM ${tableName2} ORDER BY k1" +} \ No newline at end of file