diff --git a/be/src/exprs/string_functions.cpp b/be/src/exprs/string_functions.cpp
index 7214b0bd81..b9feb4c424 100644
--- a/be/src/exprs/string_functions.cpp
+++ b/be/src/exprs/string_functions.cpp
@@ -761,4 +761,82 @@ StringVal StringFunctions::money_format(FunctionContext *context, const LargeInt
     return do_money_format(context, ss.str());
 }
 
+// Returns the offset, relative to source_offset, of the first occurrence of the
+// target_count bytes at target + target_offset within the source_count bytes at
+// source + source_offset, searching from from_index onwards; -1 if absent.
+// A negative from_index is treated as 0. An empty target matches at from_index,
+// or at source_count when from_index is past the end.
+static int index_of(const uint8_t* source, int source_offset, int source_count,
+                    const uint8_t* target, int target_offset, int target_count,
+                    int from_index) {
+    if (from_index >= source_count) {
+        return (target_count == 0 ? source_count : -1);
+    }
+    if (from_index < 0) {
+        from_index = 0;
+    }
+    if (target_count == 0) {
+        return from_index;
+    }
+    const uint8_t first = target[target_offset];
+    // Last offset at which a complete match could still begin.
+    int max = source_offset + (source_count - target_count);
+    for (int i = source_offset + from_index; i <= max; i++) {
+        while (i <= max && source[i] != first) i++; // Look for first character
+        if (i <= max) { // Found first character, now look at the rest of target
+            int j = i + 1;
+            int end = j + target_count - 1;
+            for (int k = target_offset + 1; j < end && source[j] == target[k]; j++, k++);
+            if (j == end) {
+                return i - source_offset; // Found whole string.
+            }
+        }
+    }
+    return -1;
+}
+
+
+// Splits content on delimiter and returns the field-th piece, counting from 1.
+// Returns NULL if any argument is NULL, if field <= 0, if delimiter does not
+// occur in content at all, or if content has fewer than field pieces.
+// The returned StringVal points into content's buffer; no bytes are copied.
+StringVal StringFunctions::split_part(FunctionContext* context, const StringVal& content,
+                                      const StringVal& delimiter, const IntVal& field) {
+    if (content.is_null || delimiter.is_null || field.is_null || field.val <= 0) {
+        return StringVal::null();
+    }
+    // Resume each search after the whole previous delimiter: resuming only one
+    // byte later lets matches of a multi-byte delimiter overlap, which yields a
+    // negative length. An empty delimiter still advances one byte at a time.
+    const int step = delimiter.len > 0 ? delimiter.len : 1;
+    // Only the last two delimiter offsets are needed, so track them directly
+    // instead of allocating field.val slots (field comes from the query).
+    int found = 0;      // delimiters located so far, at most field.val
+    int prev_pos = -1;  // offset of the delimiter located before last_pos
+    int last_pos = -1;  // offset of the most recently located delimiter
+    int from = 0;
+    while (found < field.val) {
+        int pos = index_of(content.ptr, 0, content.len, delimiter.ptr, 0, delimiter.len, from);
+        if (pos == -1) {
+            break;
+        }
+        prev_pos = last_pos;
+        last_pos = pos;
+        from = pos + step;
+        ++found;
+    }
+    if (found == 0 || found < field.val - 1) {
+        // field not found, return null
+        return StringVal::null();
+    }
+    // The piece starts right after delimiter number (field - 1), or at offset 0.
+    int start_pos = 0;
+    if (field.val > 1) {
+        start_pos = (found == field.val ? prev_pos : last_pos) + delimiter.len;
+    }
+    // It ends at delimiter number field, or at the end of content if there is none.
+    int end_pos = (found == field.val ? last_pos : content.len);
+    return StringVal(content.ptr + start_pos, end_pos - start_pos);
+}
+
 }
diff --git a/be/src/exprs/string_functions.h b/be/src/exprs/string_functions.h
index a4ca3a769d..fc7087d6af 100644
--- a/be/src/exprs/string_functions.h
+++ b/be/src/exprs/string_functions.h
@@ -174,6 +174,11 @@ public:
         ss << std::put_money(v);
         return AnyValUtil::from_string_temp(context, ss.str());
     };
+
+    // Returns the field-th (1-based) piece of content split on delimiter, or NULL
+    // when an argument is NULL, field <= 0, or there is no such piece.
+    static StringVal split_part(FunctionContext* context, const StringVal& content,
+                                const StringVal& delimiter, const IntVal& field);
 };
 
 }
diff --git a/be/test/exprs/string_functions_test.cpp b/be/test/exprs/string_functions_test.cpp
index e0e355505d..fe7380a67e 100644
--- a/be/test/exprs/string_functions_test.cpp
+++ b/be/test/exprs/string_functions_test.cpp
@@ -123,6 +123,50 @@ TEST_F(StringFunctionsTest, money_format_decimal_v2) {
     ASSERT_EQ(expected, result);
 }
 
+TEST_F(StringFunctionsTest, split_part) {
+    doris_udf::FunctionContext* context = new doris_udf::FunctionContext();
+
+    ASSERT_EQ(AnyValUtil::from_string_temp(context,std::string("hello")),
+            StringFunctions::split_part(context, StringVal("hello word"), StringVal(" "), 1));
+
+    ASSERT_EQ(AnyValUtil::from_string_temp(context,std::string("word")),
+            StringFunctions::split_part(context, StringVal("hello word"), StringVal(" "), 2));
+
+    ASSERT_EQ(StringVal::null(),
+            StringFunctions::split_part(context, StringVal("hello word"), StringVal(" "), 3));
+
+    ASSERT_EQ(AnyValUtil::from_string_temp(context,std::string("")),
+            StringFunctions::split_part(context, StringVal("hello word"), StringVal("hello"), 1));
+
+    ASSERT_EQ(AnyValUtil::from_string_temp(context,std::string(" word")),
+            StringFunctions::split_part(context, StringVal("hello word"), StringVal("hello"), 2));
+
+    ASSERT_EQ(AnyValUtil::from_string_temp(context,std::string("2019年9")),
+            StringFunctions::split_part(context, StringVal("2019年9月8日"), StringVal("月"), 1));
+
+    ASSERT_EQ(AnyValUtil::from_string_temp(context,std::string("")),
+            StringFunctions::split_part(context, StringVal("abcdabda"), StringVal("a"), 1));
+
+    ASSERT_EQ(AnyValUtil::from_string_temp(context,std::string("bcd")),
+            StringFunctions::split_part(context, StringVal("abcdabda"), StringVal("a"), 2));
+
+    ASSERT_EQ(AnyValUtil::from_string_temp(context,std::string("bd")),
+            StringFunctions::split_part(context, StringVal("abcdabda"), StringVal("a"), 3));
+
+    ASSERT_EQ(AnyValUtil::from_string_temp(context,std::string("")),
+            StringFunctions::split_part(context, StringVal("abcdabda"), StringVal("a"), 4));
+
+    // A multi-byte delimiter must not be matched at overlapping offsets.
+    ASSERT_EQ(AnyValUtil::from_string_temp(context, std::string("a")),
+            StringFunctions::split_part(context, StringVal("a---b"), StringVal("--"), 1));
+
+    ASSERT_EQ(AnyValUtil::from_string_temp(context, std::string("-b")),
+            StringFunctions::split_part(context, StringVal("a---b"), StringVal("--"), 2));
+
+    ASSERT_EQ(StringVal::null(),
+            StringFunctions::split_part(context, StringVal("a---b"), StringVal("--"), 3));
+}
+
 }
 
 int main(int argc, char** argv) {
diff --git a/docs/documentation/cn/sql-reference/sql-functions/string-functions/split_part.md b/docs/documentation/cn/sql-reference/sql-functions/string-functions/split_part.md
new file mode 100644
index 0000000000..2564a57855
--- /dev/null
+++ b/docs/documentation/cn/sql-reference/sql-functions/string-functions/split_part.md
@@ -0,0 +1,42 @@
+# split_part
+
+## Syntax
+
+`VARCHAR split_part(VARCHAR content, VARCHAR delimiter, INT field)`
+
+## Description
+
+根据分割符拆分字符串, 返回指定的分割部分(从一开始计数)。
+
+## Examples
+
+```
+mysql> select split_part("hello word", " ", 1);
++----------------------------------+
+| split_part('hello word', ' ', 1) |
++----------------------------------+
+| hello                            |
++----------------------------------+
+
+
+mysql> select split_part("hello word", " ", 2);
++----------------------------------+
+| split_part('hello word', ' ', 2) |
++----------------------------------+
+| word                             |
++----------------------------------+
+
+mysql> select split_part("2019年7月8号", "月", 1);
++-----------------------------------------+
+| split_part('2019年7月8号', '月', 1)     |
++-----------------------------------------+
+| 2019年7                                 |
++-----------------------------------------+
+
+mysql> select split_part("abca", "a", 1);
++----------------------------+
+| split_part('abca', 'a', 1) |
++----------------------------+
+|                            |
++----------------------------+
+```
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index b899011777..9db110b2de 100755
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -554,6 +554,8 @@ visible_functions = [
         '_ZN5doris15StringFunctions12money_formatEPN9doris_udf15FunctionContextERKNS1_10DecimalValE'],
     [['money_format'], 'VARCHAR', ['DECIMALV2'],
         '_ZN5doris15StringFunctions12money_formatEPN9doris_udf15FunctionContextERKNS1_12DecimalV2ValE'],
+    [['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
+        '_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE'],
 
     # Utility functions
     [['sleep'], 'BOOLEAN', ['INT'],