diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h
index 3f1506071b..b164eb70eb 100644
--- a/be/src/util/simd/vstring_function.h
+++ b/be/src/util/simd/vstring_function.h
@@ -122,6 +122,112 @@ public:
         return rtrim(ltrim(str));
     }
 
+    /// Strips every trailing occurrence of `rhs` from `str` and returns the remaining range.
+    /// `rhs` is matched as a whole string, not as a set of characters; `str` is returned
+    /// unchanged when either argument is empty.
+    static StringRef rtrim(const StringRef& str, const StringRef& rhs) {
+        if (str.size == 0 || rhs.size == 0) {
+            return str;
+        }
+        if (rhs.size == 1) {
+            auto begin = 0;
+            int64_t end = str.size - 1;
+            const char blank = rhs.data[0];
+#if defined(__SSE2__) || defined(__aarch64__)
+            const auto pattern = _mm_set1_epi8(blank);
+            while (end - begin + 1 >= REGISTER_SIZE) {
+                const auto v_haystack = _mm_loadu_si128(
+                        reinterpret_cast<const __m128i*>(str.data + end + 1 - REGISTER_SIZE));
+                const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern);
+                const auto mask = _mm_movemask_epi8(v_against_pattern);
+                /// clz of the inverted, up-shifted mask counts the matching bytes at the window's tail
+                int offset = __builtin_clz(~(mask << REGISTER_SIZE));
+                /// means not found
+                if (offset == 0) {
+                    return StringRef(str.data + begin, end - begin + 1);
+                } else {
+                    end -= offset;
+                }
+            }
+#endif
+            while (end >= begin && str.data[end] == blank) {
+                --end;
+            }
+            if (end < 0) {
+                return StringRef("");
+            }
+            return StringRef(str.data + begin, end - begin + 1);
+        }
+        auto begin = 0;
+        auto end = str.size - 1;
+        const auto rhs_size = rhs.size;
+        while (end - begin + 1 >= rhs_size) {
+            if (memcmp(str.data + end - rhs_size + 1, rhs.data, rhs_size) == 0) {
+                end -= rhs.size;
+            } else {
+                break;
+            }
+        }
+        return StringRef(str.data + begin, end - begin + 1);
+    }
+
+    /// Strips every leading occurrence of `rhs` from `str` and returns the remaining range.
+    /// `rhs` is matched as a whole string, not as a set of characters; `str` is returned
+    /// unchanged when either argument is empty.
+    static StringRef ltrim(const StringRef& str, const StringRef& rhs) {
+        if (str.size == 0 || rhs.size == 0) {
+            return str;
+        }
+        /// Fast path for a one-byte pattern. Keyed on rhs.size (as in rtrim): only then is
+        /// comparing against rhs.data[0] alone equivalent to matching the whole pattern.
+        if (rhs.size == 1) {
+            auto begin = 0;
+            auto end = str.size - 1;
+            const char blank = rhs.data[0];
+#if defined(__SSE2__) || defined(__aarch64__)
+            const auto pattern = _mm_set1_epi8(blank);
+            while (end - begin + 1 >= REGISTER_SIZE) {
+                const auto v_haystack =
+                        _mm_loadu_si128(reinterpret_cast<const __m128i*>(str.data + begin));
+                const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern);
+                const auto mask = _mm_movemask_epi8(v_against_pattern) ^ 0xffff;
+                /// zero means not found
+                if (mask == 0) {
+                    begin += REGISTER_SIZE;
+                } else {
+                    const auto offset = __builtin_ctz(mask);
+                    begin += offset;
+                    return StringRef(str.data + begin, end - begin + 1);
+                }
+            }
+#endif
+            while (begin <= end && str.data[begin] == blank) {
+                ++begin;
+            }
+            return StringRef(str.data + begin, end - begin + 1);
+        }
+        auto begin = 0;
+        auto end = str.size - 1;
+        const auto rhs_size = rhs.size;
+        while (end - begin + 1 >= rhs_size) {
+            if (memcmp(str.data + begin, rhs.data, rhs_size) == 0) {
+                begin += rhs.size;
+            } else {
+                break;
+            }
+        }
+        return StringRef(str.data + begin, end - begin + 1);
+    }
+
+    /// Strips every leading and trailing occurrence of `rhs` from `str`
+    /// by applying ltrim and then rtrim with the same pattern.
+    static StringRef trim(const StringRef& str, const StringRef& rhs) {
+        if (str.size == 0 || rhs.size == 0) {
+            return str;
+        }
+        return rtrim(ltrim(str, rhs), rhs);
+    }
+
     // Gcc will do auto simd in this function
     static bool is_ascii(const StringRef& str) {
         char or_code = 0;
diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp
index 5579019452..c6599b2e3e 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -338,31 +338,28 @@ struct InitcapImpl {
 struct NameTrim {
     static constexpr auto name = "trim";
 };
-
 struct NameLTrim {
     static constexpr auto name = "ltrim";
 };
-
 struct NameRTrim {
     static constexpr auto name = "rtrim";
 };
-
 template
-struct TrimImpl {
-    static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
+struct TrimUtil {
+    static Status vector(const ColumnString::Chars& str_data,
+                         const ColumnString::Offsets& str_offsets, const StringRef& rhs,
                          ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
-        size_t offset_size = offsets.size();
-        res_offsets.resize(offsets.size());
-
+        size_t offset_size = str_offsets.size();
+        res_offsets.resize(str_offsets.size());
         for (size_t i
= 0; i < offset_size; ++i) {
-            const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]);
-            ColumnString::Offset size = offsets[i] - offsets[i - 1];
+            const char* raw_str = reinterpret_cast(&str_data[str_offsets[i - 1]]); // NOTE(review): reads str_offsets[i - 1] at i == 0; presumably Offsets tolerates index -1 and yields 0 - confirm
+            ColumnString::Offset size = str_offsets[i] - str_offsets[i - 1]; // byte length of row i
             StringRef str(raw_str, size);
             if constexpr (is_ltrim) {
-                str = simd::VStringFunctions::ltrim(str);
+                str = simd::VStringFunctions::ltrim(str, rhs); // strip leading rhs
             }
             if constexpr (is_rtrim) {
-                str = simd::VStringFunctions::rtrim(str);
+                str = simd::VStringFunctions::rtrim(str, rhs); // strip trailing rhs
             }
             StringOP::push_value_string(std::string_view((char*)str.data, str.size), i, res_data,
                                         res_offsets);
@@ -370,6 +367,105 @@ struct TrimImpl {
         return Status::OK();
     }
 };
+// One-argument form of trim/ltrim/rtrim: the pattern is the single space held in `blank`.
+template
+struct Trim1Impl {
+    static constexpr auto name = Name::name;
+
+    static DataTypes get_variadic_argument_types() { return {std::make_shared()}; } // one argument type
+
+    static Status execute(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                          size_t result, size_t input_rows_count) {
+        const ColumnPtr column = block.get_by_position(arguments[0]).column;
+        if (auto col = assert_cast(column.get())) { // NOTE(review): the cast result is the branch condition; confirm assert_cast can yield null, otherwise the else branch is dead
+            auto col_res = ColumnString::create();
+            char blank[] = " ";
+            StringRef rhs(blank, 1); // length 1: just the space, without the terminating NUL
+            TrimUtil::vector(col->get_chars(), col->get_offsets(), rhs, // the Status returned by vector() is discarded here
+                             col_res->get_chars(), col_res->get_offsets());
+            block.replace_by_position(result, std::move(col_res));
+        } else {
+            return Status::RuntimeError("Illegal column {} of argument of function {}",
+                                        block.get_by_position(arguments[0]).column->get_name(),
+                                        name);
+        }
+        return Status::OK();
+    }
+};
+
+// Two-argument form of trim/ltrim/rtrim: the pattern is read from the second argument's column.
+template
+struct Trim2Impl {
+    static constexpr auto name = Name::name;
+
+    static DataTypes get_variadic_argument_types() {
+        return {std::make_shared(), std::make_shared()}; // two argument types
+    }
+
+    static Status execute(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                          size_t result, size_t input_rows_count) {
+        const ColumnPtr column = block.get_by_position(arguments[0]).column;
+        const auto& rcol =
+                assert_cast(block.get_by_position(arguments[1]).column.get())
+                        ->get_data_column_ptr(); // NOTE(review): presumably unwraps a constant column to its nested data column - confirm
+        if (auto col = assert_cast(column.get())) { // NOTE(review): same dead-else question as in Trim1Impl
+            if (auto col_right = assert_cast(rcol.get())) {
+                auto col_res = ColumnString::create();
+                const char* raw_rhs = reinterpret_cast(&(col_right->get_chars()[0])); // first byte of the pattern column's chars buffer
+                ColumnString::Offset rhs_size = col_right->get_offsets()[0]; // presumably the end offset of row 0, i.e. the pattern length - verify
+                StringRef rhs(raw_rhs, rhs_size);
+                TrimUtil::vector(col->get_chars(), col->get_offsets(), rhs, // the Status returned by vector() is discarded here
+                                 col_res->get_chars(), col_res->get_offsets());
+                block.replace_by_position(result, std::move(col_res));
+            } else {
+                return Status::RuntimeError("Illegal column {} of argument of function {}",
+                                            block.get_by_position(arguments[1]).column->get_name(),
+                                            name);
+            }
+
+        } else {
+            return Status::RuntimeError("Illegal column {} of argument of function {}",
+                                        block.get_by_position(arguments[0]).column->get_name(),
+                                        name);
+        }
+        return Status::OK();
+    }
+};
+
+template
+class FunctionTrim : public IFunction { // forwards name, argument types and execution to `impl`
+public:
+    static constexpr auto name = impl::name;
+    static FunctionPtr create() { return std::make_shared>(); }
+    String get_name() const override { return impl::name; }
+
+    size_t get_number_of_arguments() const override {
+        return get_variadic_argument_types_impl().size(); // arity = length of impl's argument type list
+    }
+
+    bool get_is_injective(const Block&) override { return false; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        if (!is_string_or_fixed_string(arguments[0])) {
+            LOG(FATAL) << fmt::format("Illegal type {} of argument of function {}", // NOTE(review): presumably aborts the process on a non-string argument - confirm this is intended
+                                      arguments[0]->get_name(), get_name());
+        }
+        return arguments[0]; // result type is the type of the first argument
+    }
+    // The second parameter of "trim" is a constant.
+    ColumnNumbers get_arguments_that_are_always_constant() const override { return {1}; } // index 1 is returned for every impl, including the one-argument form
+
+    bool use_default_implementation_for_constants() const override { return true; }
+
+    DataTypes get_variadic_argument_types_impl() const override {
+        return impl::get_variadic_argument_types();
+    }
+
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        return impl::execute(context, block, arguments, result, input_rows_count);
+    }
+};
 
 struct UnHexImpl {
     static constexpr auto name = "unhex";
@@ -718,12 +814,6 @@ using FunctionToUpper = FunctionStringToString, NameTo
 
 using FunctionToInitcap = FunctionStringToString;
 
-using FunctionLTrim = FunctionStringToString, NameLTrim>;
-
-using FunctionRTrim = FunctionStringToString, NameRTrim>;
-
-using FunctionTrim = FunctionStringToString, NameTrim>;
-
 using FunctionUnHex = FunctionStringOperateToNullType;
 using FunctionToBase64 = FunctionStringOperateToNullType;
 using FunctionFromBase64 = FunctionStringOperateToNullType;
@@ -750,9 +840,12 @@ void register_function_string(SimpleFunctionFactory& factory) {
     factory.register_function();
     factory.register_function();
     factory.register_function();
-    factory.register_function();
-    factory.register_function();
-    factory.register_function();
+    factory.register_function>>(); // NOTE(review): template arguments look stripped from this copy of the patch; presumably six FunctionTrim instantiations over Trim1Impl/Trim2Impl - verify against the real change
+    factory.register_function>>();
+    factory.register_function>>();
+    factory.register_function>>();
+    factory.register_function>>();
+    factory.register_function>>();
     factory.register_function();
     factory.register_function>();
     factory.register_function>();
diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/ltrim.md b/docs/en/docs/sql-manual/sql-functions/string-functions/ltrim.md
index a29c9814b9..f534d15d47 100644
--- a/docs/en/docs/sql-manual/sql-functions/string-functions/ltrim.md
+++ b/docs/en/docs/sql-manual/sql-functions/string-functions/ltrim.md
@@ -28,10 +28,10 @@ under the License. ### Description #### Syntax -`VARCHAR ltrim (VARCHAR str)` +`VARCHAR ltrim(VARCHAR str[, VARCHAR rhs])` -Remove the space that appears continuously from the beginning of the parameter str +When the 'rhs' parameter is omitted, remove the consecutive spaces at the beginning of 'str'. Otherwise, remove the consecutive occurrences of the string 'rhs' at the beginning of 'str'. ### example @@ -42,6 +42,13 @@ mysql> SELECT ltrim(' ab d'); +------------------+ | ab d | +------------------+ + +mysql> SELECT ltrim('ababccaab','ab') str; ++-------+ +| str | ++-------+ +| ccaab | ++-------+ ``` ### keywords LTRIM diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/rtrim.md b/docs/en/docs/sql-manual/sql-functions/string-functions/rtrim.md index 7cade6ab52..b81d4cfbf9 100644 --- a/docs/en/docs/sql-manual/sql-functions/string-functions/rtrim.md +++ b/docs/en/docs/sql-manual/sql-functions/string-functions/rtrim.md @@ -28,10 +28,10 @@ under the License. ### description #### Syntax -`VARCHAR rtrim(VARCHAR str)` +`VARCHAR rtrim(VARCHAR str[, VARCHAR rhs])` -Remove the space that appears continuously from the ending of the parameter str +When the 'rhs' parameter is omitted, remove the consecutive spaces at the end of 'str'. Otherwise, remove the consecutive occurrences of the string 'rhs' at the end of 'str'. ### example @@ -42,6 +42,13 @@ mysql> SELECT rtrim('ab d ') str; +------+ | ab d | +------+ + +mysql> SELECT rtrim('ababccaab','ab') str; ++---------+ +| str | ++---------+ +| ababcca | ++---------+ ``` ### keywords RTRIM diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/trim.md b/docs/en/docs/sql-manual/sql-functions/string-functions/trim.md index e7e26526cb..8363c57a1e 100644 --- a/docs/en/docs/sql-manual/sql-functions/string-functions/trim.md +++ b/docs/en/docs/sql-manual/sql-functions/string-functions/trim.md @@ -28,10 +28,10 @@ under the License. 
### description #### Syntax -`VARCHAR trim(VARCHAR str)` +`VARCHAR trim(VARCHAR str[, VARCHAR rhs])` -Remove the space that appears continuously from the starring and ending of the parameter str +When the 'rhs' parameter is omitted, remove the consecutive spaces at the beginning and end of 'str'. Otherwise, remove the consecutive occurrences of the string 'rhs' at the beginning and end of 'str'. ### example @@ -42,6 +42,13 @@ mysql> SELECT trim(' ab d ') str; +------+ | ab d | +------+ + +mysql> SELECT trim('ababccaab','ab') str; ++------+ +| str | ++------+ +| cca | ++------+ ``` ### keywords TRIM diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/ltrim.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/ltrim.md index ae7e415b3e..26c8367120 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/ltrim.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/ltrim.md @@ -27,11 +27,11 @@ under the License. ## ltrim ### description #### Syntax - -`VARCHAR ltrim(VARCHAR str)` + +`VARCHAR ltrim(VARCHAR str[, VARCHAR rhs])` -将参数 str 中从左侧部分开始部分连续出现的空格去掉 +当没有rhs参数时,将参数 str 中从左侧部分开始部分连续出现的空格去掉,否则去掉rhs ### example @@ -42,6 +42,13 @@ mysql> SELECT ltrim(' ab d') str; +------+ | ab d | +------+ + +mysql> SELECT ltrim('ababccaab','ab') str; ++-------+ +| str | ++-------+ +| ccaab | ++-------+ ``` ### keywords LTRIM diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/rtrim.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/rtrim.md index f1d73bf487..aed5b7668d 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/rtrim.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/rtrim.md @@ -28,10 +28,10 @@ under the License. 
### description #### Syntax -`VARCHAR rtrim(VARCHAR str)` +`VARCHAR rtrim(VARCHAR str[, VARCHAR rhs])` -将参数 str 中从右侧部分开始部分连续出现的空格去掉 +当没有rhs参数时,将参数 str 中从右侧部分开始部分连续出现的空格去掉,否则去掉rhs ### example @@ -42,6 +42,13 @@ mysql> SELECT rtrim('ab d ') str; +------+ | ab d | +------+ + +mysql> SELECT rtrim('ababccaab','ab') str; ++---------+ +| str | ++---------+ +| ababcca | ++---------+ ``` ### keywords RTRIM diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/trim.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/trim.md index a4575d6526..61f7148135 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/trim.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/trim.md @@ -28,10 +28,10 @@ under the License. ### description #### Syntax -`VARCHAR trim(VARCHAR str)` +`VARCHAR trim(VARCHAR str[, VARCHAR rhs])` -将参数 str 中右侧和左侧开始部分连续出现的空格去掉 +当没有rhs参数时,将参数 str 中右侧和左侧开始部分连续出现的空格去掉,否则去掉rhs ### example @@ -42,6 +42,13 @@ mysql> SELECT trim(' ab d ') str; +------+ | ab d | +------+ + +mysql> SELECT trim('ababccaab','ab') str; ++------+ +| str | ++------+ +| cca | ++------+ ``` ### keywords TRIM diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index 39fd36f54e..4e33f3090d 100644 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -1471,9 +1471,12 @@ visible_functions = [ [['lower', 'lcase'], 'VARCHAR', ['VARCHAR'], ''], [['upper', 'ucase'], 'VARCHAR', ['VARCHAR'], ''], [['initcap'], 'VARCHAR', ['VARCHAR'], ''], + [['trim'], 'VARCHAR', ['VARCHAR','VARCHAR'], ''], [['trim'], 'VARCHAR', ['VARCHAR'], ''], [['ltrim'], 'VARCHAR', ['VARCHAR'], ''], + [['ltrim'], 'VARCHAR', ['VARCHAR','VARCHAR'], ''], [['rtrim'], 'VARCHAR', ['VARCHAR'], ''], + [['rtrim'], 'VARCHAR', ['VARCHAR','VARCHAR'], ''], [['ascii'], 'INT', ['VARCHAR'], ''], [['instr'], 'INT', ['VARCHAR', 'VARCHAR'], ''], [['locate'], 'INT', ['VARCHAR', 'VARCHAR'], ''], @@ -1557,8 
+1560,11 @@ visible_functions = [ [['lower', 'lcase'], 'STRING', ['STRING'], ''], [['upper', 'ucase'], 'STRING', ['STRING'], ''], [['trim'], 'STRING', ['STRING'], ''], + [['trim'], 'STRING', ['STRING','STRING'], ''], [['ltrim'], 'STRING', ['STRING'], ''], + [['ltrim'], 'STRING', ['STRING','STRING'], ''], [['rtrim'], 'STRING', ['STRING'], ''], + [['rtrim'], 'STRING', ['STRING','STRING'], ''], [['ascii'], 'INT', ['STRING'], ''], [['instr'], 'INT', ['STRING', 'STRING'], ''], [['locate'], 'INT', ['STRING', 'STRING'], ''], diff --git a/regression-test/suites/correctness/test_trim_new_parameters.groovy b/regression-test/suites/correctness/test_trim_new_parameters.groovy new file mode 100644 index 0000000000..3209eb7aae --- /dev/null +++ b/regression-test/suites/correctness/test_trim_new_parameters.groovy @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+suite("test_trim_new_parameters") { // regression suite for the optional second argument of trim/ltrim/rtrim
+    sql """ DROP TABLE IF EXISTS tbl_trim_new_parameters """
+    sql """
+        CREATE TABLE tbl_trim_new_parameters (
+            id INT DEFAULT '10',
+            username VARCHAR(32) DEFAULT ''
+        ) ENGINE=OLAP
+        AGGREGATE KEY(id,username)
+        DISTRIBUTED BY HASH(id) BUCKETS 10
+        PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1",
+            "in_memory" = "false",
+            "storage_format" = "V2"
+        );
+    """
+    sql """
+        insert into tbl_trim_new_parameters values(1,'abcabccccabc')
+    """
+    sql """
+        insert into tbl_trim_new_parameters values(2,'abcabcabc')
+    """
+    sql """
+        insert into tbl_trim_new_parameters values(3,'')
+    """
+
+    List> results = sql "select id,trim(username,'abc') from tbl_trim_new_parameters order by id"
+
+    assertEquals(results.size(), 3) // one result row per inserted row
+    assertEquals(results[0][0], 1)
+    assertEquals(results[1][0], 2)
+    assertEquals(results[2][0], 3)
+    assertEquals(results[0][1], 'ccc') // 'abcabccccabc': two leading and one trailing 'abc' removed
+    assertEquals(results[1][1], '') // 'abcabcabc' consists only of the pattern
+    assertEquals(results[2][1], '') // empty input stays empty
+
+    List> trim = sql "select trim(' abc ')" // one-argument forms: spaces are trimmed
+    assertEquals(trim[0][0], 'abc')
+
+    List> ltrim = sql "select ltrim(' abc ')"
+    assertEquals(ltrim[0][0], 'abc ') // only the leading side is trimmed
+
+    List> rtrim = sql "select rtrim(' abc ')"
+    assertEquals(rtrim[0][0], ' abc') // only the trailing side is trimmed
+
+    trim = sql "select trim('abcabcTTTabcabc','abc')" // two-argument forms with constant arguments
+    assertEquals(trim[0][0], 'TTT')
+
+    ltrim = sql "select ltrim('abcabcTTTbc','abc')"
+    assertEquals(ltrim[0][0], 'TTTbc') // trailing 'bc' is not a full 'abc' and ltrim does not touch the tail
+
+    rtrim = sql "select rtrim('bcTTTabcabc','abc')"
+    assertEquals(rtrim[0][0], 'bcTTT') // leading 'bc' is kept; only trailing 'abc' repeats are removed
+}