/// Returns the length in bytes of the UTF-8 sequence introduced by `byte`.
///
/// Only the lead byte is inspected. Anything below 0xC0 (ASCII as well as stray
/// continuation bytes) is reported as a 1-byte character; the legacy 5- and
/// 6-byte lead bytes (0xF8.. and 0xFC..) are recognised too.
inline size_t get_utf8_byte_length(unsigned char byte) {
    if (byte < 0xC0) {
        return 1;
    }
    if (byte < 0xE0) {
        return 2;
    }
    if (byte < 0xF0) {
        return 3;
    }
    if (byte < 0xF8) {
        return 4;
    }
    if (byte < 0xFC) {
        return 5;
    }
    return 6;
}
0; for (size_t i = 0, char_size = 0; i < str.length(); i += char_size) { char_size = get_utf8_byte_length(str[i]); str_index->push_back(i); ++char_len; } return char_len; } inline size_t get_char_len(const StringVal& str, std::vector* str_index) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); str_index->push_back(i); ++char_len; } return char_len; } inline size_t get_char_len(const StringValue& str, size_t end_pos) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < std::min(str.len, end_pos); i += char_size) { char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); ++char_len; } return char_len; } struct StringOP { static void push_empty_string(int index, ColumnString::Chars& chars, ColumnString::Offsets& offsets) { chars.push_back('\0'); offsets[index] = chars.size(); } static void push_null_string(int index, ColumnString::Chars& chars, ColumnString::Offsets& offsets, NullMap& null_map) { null_map[index] = 1; push_empty_string(index, chars, offsets); } static void push_value_string(const std::string_view& string_value, int index, ColumnString::Chars& chars, ColumnString::Offsets& offsets) { chars.insert(string_value.data(), string_value.data() + string_value.size()); chars.push_back('\0'); offsets[index] = chars.size(); } }; struct SubstringUtil { static constexpr auto name = "substring"; static void substring_execute(Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) { DCHECK_EQ(arguments.size(), 3); auto null_map = ColumnUInt8::create(input_rows_count, 0); ColumnPtr argument_columns[3]; for (int i = 0; i < 3; ++i) { argument_columns[i] = block.get_by_position(arguments[i]).column->convert_to_full_column_if_const(); if (auto* nullable = check_and_get_column(*argument_columns[i])) { // Danger: Here must dispose the null map data first! 
Because // argument_columns[i]=nullable->get_nested_column_ptr(); will release the mem // of column nullable mem of null map VectorizedUtils::update_null_map(null_map->get_data(), nullable->get_null_map_data()); argument_columns[i] = nullable->get_nested_column_ptr(); } } auto res = ColumnString::create(); auto specific_str_column = assert_cast(argument_columns[0].get()); auto specific_start_column = assert_cast*>(argument_columns[1].get()); auto specific_len_column = assert_cast*>(argument_columns[2].get()); vector(specific_str_column->get_chars(), specific_str_column->get_offsets(), specific_start_column->get_data(), specific_len_column->get_data(), null_map->get_data(), res->get_chars(), res->get_offsets()); block.get_by_position(result).column = ColumnNullable::create(std::move(res), std::move(null_map)); } private: static void vector(const ColumnString::Chars& chars, const ColumnString::Offsets& offsets, const PaddedPODArray& start, const PaddedPODArray& len, NullMap& null_map, ColumnString::Chars& res_chars, ColumnString::Offsets& res_offsets) { int size = offsets.size(); res_offsets.resize(size); res_chars.reserve(chars.size()); std::vector index; for (int i = 0; i < size; ++i) { auto* raw_str = reinterpret_cast(&chars[offsets[i - 1]]); int str_size = offsets[i] - offsets[i - 1] - 1; // return null if start > src.length if (start[i] > str_size) { StringOP::push_null_string(i, res_chars, res_offsets, null_map); continue; } // return "" if len < 0 or str == 0 or start == 0 if (len[i] <= 0 || str_size == 0 || start[i] == 0) { StringOP::push_empty_string(i, res_chars, res_offsets); continue; } // reference to string_function.cpp: substring size_t byte_pos = 0; index.clear(); for (size_t j = 0, char_size = 0; j < str_size; j += char_size) { char_size = get_utf8_byte_length((unsigned)(raw_str)[j]); index.push_back(j); if (start[i] > 0 && index.size() > start[i] + len[i]) { break; } } int fixed_pos = start[i]; if (fixed_pos < 0) { fixed_pos = index.size() + 
fixed_pos + 1; } if (fixed_pos > index.size()) { StringOP::push_null_string(i, res_chars, res_offsets, null_map); continue; } byte_pos = index[fixed_pos - 1]; int fixed_len = str_size - byte_pos; if (fixed_pos + len[i] <= index.size()) { fixed_len = index[fixed_pos + len[i] - 1] - byte_pos; } if (byte_pos <= str_size && fixed_len > 0) { // return StringVal(str.ptr + byte_pos, fixed_len); StringOP::push_value_string( std::string_view {reinterpret_cast(raw_str + byte_pos), (size_t)fixed_len}, i, res_chars, res_offsets); } else { StringOP::push_empty_string(i, res_chars, res_offsets); } } } }; template class FunctionSubstring : public IFunction { public: static constexpr auto name = SubstringUtil::name; String get_name() const override { return name; } static FunctionPtr create() { return std::make_shared>(); } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return make_nullable(std::make_shared()); } DataTypes get_variadic_argument_types_impl() const override { return Impl::get_variadic_argument_types(); } size_t get_number_of_arguments() const override { return get_variadic_argument_types_impl().size(); } bool use_default_implementation_for_nulls() const override { return false; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { return Impl::execute_impl(context, block, arguments, result, input_rows_count); } }; struct Substr3Impl { static DataTypes get_variadic_argument_types() { return {std::make_shared(), std::make_shared(), std::make_shared()}; } static Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) { SubstringUtil::substring_execute(block, arguments, result, input_rows_count); return Status::OK(); } }; struct Substr2Impl { static DataTypes get_variadic_argument_types() { return 
{std::make_shared(), std::make_shared()}; } static Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) { auto params = ColumnInt32::create(input_rows_count); auto& strlen_data = params->get_data(); auto str_col = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); if (auto* nullable = check_and_get_column(*str_col)) { str_col = nullable->get_nested_column_ptr(); } auto& str_offset = assert_cast(str_col.get())->get_offsets(); for (int i = 0; i < input_rows_count; ++i) { strlen_data[i] = str_offset[i] - str_offset[i - 1]; } block.insert({std::move(params), std::make_shared(), "strlen"}); ColumnNumbers temp_arguments = {arguments[0], arguments[1], block.columns() - 1}; SubstringUtil::substring_execute(block, temp_arguments, result, input_rows_count); return Status::OK(); } }; class FunctionLeft : public IFunction { public: static constexpr auto name = "left"; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 2; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return make_nullable(std::make_shared()); } bool use_default_implementation_for_nulls() const override { return false; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { auto int_type = std::make_shared(); size_t num_columns_without_result = block.columns(); block.insert({int_type->create_column_const(input_rows_count, to_field(1)) ->convert_to_full_column_if_const(), int_type, "const 1"}); ColumnNumbers temp_arguments(3); temp_arguments[0] = arguments[0]; temp_arguments[1] = num_columns_without_result; temp_arguments[2] = arguments[1]; SubstringUtil::substring_execute(block, temp_arguments, 
result, input_rows_count); return Status::OK(); } }; class FunctionRight : public IFunction { public: static constexpr auto name = "right"; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 2; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return make_nullable(std::make_shared()); } bool use_default_implementation_for_nulls() const override { return false; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { auto int_type = std::make_shared(); auto params1 = ColumnInt32::create(input_rows_count); auto params2 = ColumnInt32::create(input_rows_count); size_t num_columns_without_result = block.columns(); // params1 = max(arg[1], -len(arg)) auto& index_data = params1->get_data(); auto& strlen_data = params2->get_data(); // we don't have to update null_map because FunctionSubstring will // update it // getNestedColumnIfNull arg[0] auto str_col = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); if (auto* nullable = check_and_get_column(*str_col)) { str_col = nullable->get_nested_column_ptr(); } auto& str_offset = assert_cast(str_col.get())->get_offsets(); // getNestedColumnIfNull arg[1] auto pos_col = block.get_by_position(arguments[1]).column->convert_to_full_column_if_const(); if (auto* nullable = check_and_get_column(*pos_col)) { pos_col = nullable->get_nested_column_ptr(); } auto& pos_data = assert_cast(pos_col.get())->get_data(); for (int i = 0; i < input_rows_count; ++i) { strlen_data[i] = str_offset[i] - str_offset[i - 1] - 1; } for (int i = 0; i < input_rows_count; ++i) { index_data[i] = std::max(-pos_data[i], -strlen_data[i]); } block.insert({std::move(params1), int_type, "index"}); block.insert({std::move(params2), 
int_type, "strlen"}); ColumnNumbers temp_arguments(3); temp_arguments[0] = arguments[0]; temp_arguments[1] = num_columns_without_result; temp_arguments[2] = num_columns_without_result + 1; SubstringUtil::substring_execute(block, temp_arguments, result, input_rows_count); return Status::OK(); } }; class FunctionNullOrEmpty : public IFunction { public: static constexpr auto name = "null_or_empty"; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 1; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return std::make_shared(); } bool use_default_implementation_for_nulls() const override { return false; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { auto res_map = ColumnUInt8::create(input_rows_count, 0); auto column = block.get_by_position(arguments[0]).column; if (auto* nullable = check_and_get_column(*column)) { column = nullable->get_nested_column_ptr(); VectorizedUtils::update_null_map(res_map->get_data(), nullable->get_null_map_data()); } auto str_col = assert_cast(column.get()); const auto& offsets = str_col->get_offsets(); auto& res_map_data = res_map->get_data(); for (int i = 0; i < input_rows_count; ++i) { int size = offsets[i] - offsets[i - 1] - 1; res_map_data[i] |= (size == 0); } block.replace_by_position(result, std::move(res_map)); return Status::OK(); } }; class FunctionStringConcat : public IFunction { public: static constexpr auto name = "concat"; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 0; } bool is_variadic() const override { return true; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const 
override { return std::make_shared(); } bool use_default_implementation_for_nulls() const override { return true; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { DCHECK_GE(arguments.size(), 1); if (arguments.size() == 1) { block.get_by_position(result).column = block.get_by_position(arguments[0]).column; return Status::OK(); } int argument_size = arguments.size(); ColumnPtr argument_columns[argument_size]; std::vector offsets_list(argument_size); std::vector chars_list(argument_size); for (int i = 0; i < argument_size; ++i) { argument_columns[i] = block.get_by_position(arguments[i]).column->convert_to_full_column_if_const(); auto col_str = assert_cast(argument_columns[i].get()); offsets_list[i] = &col_str->get_offsets(); chars_list[i] = &col_str->get_chars(); } auto res = ColumnString::create(); auto& res_data = res->get_chars(); auto& res_offset = res->get_offsets(); res_offset.resize(input_rows_count); int res_reserve_size = 0; // we could ignore null string column // but it's not necessary to ignore it for (size_t i = 0; i < offsets_list.size(); ++i) { for (size_t j = 0; j < input_rows_count; ++j) { res_reserve_size += (*offsets_list[i])[j] - (*offsets_list[i])[j - 1] - 1; } } // for each terminal zero res_reserve_size += input_rows_count; res_data.resize(res_reserve_size); for (size_t i = 0; i < input_rows_count; ++i) { int current_length = 0; for (size_t j = 0; j < offsets_list.size(); ++j) { auto& current_offsets = *offsets_list[j]; auto& current_chars = *chars_list[j]; int size = current_offsets[i] - current_offsets[i - 1] - 1; memcpy(&res_data[res_offset[i - 1]] + current_length, ¤t_chars[current_offsets[i - 1]], size); current_length += size; } // add terminal zero *(&res_data[res_offset[i - 1]] + current_length) = '\0'; current_length++; res_offset[i] = res_offset[i - 1] + 
current_length; } block.get_by_position(result).column = std::move(res); return Status::OK(); } }; // concat_ws (string,string....) // TODO: avoid use fmtlib class FunctionStringConcatWs : public IFunction { public: static constexpr auto name = "concat_ws"; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 0; } bool is_variadic() const override { return true; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { const IDataType* first_type = arguments[0].get(); if (first_type->is_nullable()) return make_nullable(std::make_shared()); else return std::make_shared(); } bool use_default_implementation_for_nulls() const override { return false; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { DCHECK_GE(arguments.size(), 2); auto null_map = ColumnUInt8::create(input_rows_count, 0); // we create a zero column to simply implement auto const_null_map = ColumnUInt8::create(input_rows_count, 0); auto res = ColumnString::create(); bool is_null_type = block.get_by_position(arguments[0]).type.get()->is_nullable(); size_t argument_size = arguments.size(); std::vector offsets_list(argument_size); std::vector chars_list(argument_size); std::vector null_list(argument_size); ColumnPtr argument_columns[argument_size]; ColumnPtr argument_null_columns[argument_size]; for (size_t i = 0; i < argument_size; ++i) { argument_columns[i] = block.get_by_position(arguments[i]).column->convert_to_full_column_if_const(); if (auto* nullable = check_and_get_column(*argument_columns[i])) { // Danger: Here must dispose the null map data first! 
Because // argument_columns[i]=nullable->get_nested_column_ptr(); will release the mem // of column nullable mem of null map null_list[i] = &nullable->get_null_map_data(); argument_null_columns[i] = nullable->get_null_map_column_ptr(); argument_columns[i] = nullable->get_nested_column_ptr(); } else { null_list[i] = &const_null_map->get_data(); } auto col_str = assert_cast(argument_columns[i].get()); offsets_list[i] = &col_str->get_offsets(); chars_list[i] = &col_str->get_chars(); } auto& res_data = res->get_chars(); auto& res_offset = res->get_offsets(); res_offset.resize(input_rows_count); VectorizedUtils::update_null_map(null_map->get_data(), *null_list[0]); fmt::memory_buffer buffer; std::vector views; for (size_t i = 0; i < input_rows_count; ++i) { auto& seq_offsets = *offsets_list[0]; auto& seq_chars = *chars_list[0]; auto& seq_nullmap = *null_list[0]; if (seq_nullmap[i]) { res_data.push_back('\0'); res_offset[i] = res_data.size(); continue; } int seq_size = seq_offsets[i] - seq_offsets[i - 1] - 1; const char* seq_data = reinterpret_cast(&seq_chars[seq_offsets[i - 1]]); std::string_view seq(seq_data, seq_size); buffer.clear(); views.clear(); for (size_t j = 1; j < argument_size; ++j) { auto& current_offsets = *offsets_list[j]; auto& current_chars = *chars_list[j]; auto& current_nullmap = *null_list[j]; int size = current_offsets[i] - current_offsets[i - 1] - 1; const char* ptr = reinterpret_cast(¤t_chars[current_offsets[i - 1]]); if (!current_nullmap[i]) { views.emplace_back(ptr, size); } } fmt::format_to(buffer, "{}", fmt::join(views, seq)); StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_data, res_offset); } if (is_null_type) { block.get_by_position(result).column = ColumnNullable::create(std::move(res), std::move(null_map)); } else { block.get_by_position(result).column = std::move(res); } return Status::OK(); } }; class FunctionStringRepeat : public IFunction { public: static constexpr auto name = "repeat"; static 
FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 2; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return std::make_shared(); } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { DCHECK_EQ(arguments.size(), 2); auto res = ColumnString::create(); ColumnPtr argument_ptr[2]; argument_ptr[0] = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); argument_ptr[1] = block.get_by_position(arguments[1]).column->convert_to_full_column_if_const(); if (auto* col1 = check_and_get_column(*argument_ptr[0])) { if (auto* col2 = check_and_get_column(*argument_ptr[1])) { vector_vector(col1->get_chars(), col1->get_offsets(), col2->get_data(), res->get_chars(), res->get_offsets()); block.replace_by_position(result, std::move(res)); return Status::OK(); } } return Status::RuntimeError(fmt::format("not support {}", get_name())); } void vector_vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, const ColumnInt32::Container& repeats, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { size_t input_row_size = offsets.size(); // fmt::memory_buffer buffer; res_offsets.resize(input_row_size); for (ssize_t i = 0; i < input_row_size; ++i) { buffer.clear(); const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); int size = offsets[i] - offsets[i - 1] - 1; int repeat = repeats[i]; // assert size * repeat won't exceed DCHECK_LE(static_cast(size) * repeat, std::numeric_limits::max()); for (int i = 0; i < repeat; ++i) { buffer.append(raw_str, raw_str + size); } StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_data, res_offsets); } } }; template class FunctionStringPad : public 
IFunction { public: static constexpr auto name = Impl::name; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 3; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return std::make_shared(); } bool use_default_implementation_for_nulls() const override { return true; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { DCHECK_GE(arguments.size(), 3); auto null_map = ColumnUInt8::create(input_rows_count, 0); // we create a zero column to simply implement auto const_null_map = ColumnUInt8::create(input_rows_count, 0); auto res = ColumnString::create(); size_t argument_size = arguments.size(); ColumnPtr argument_columns[argument_size]; for (size_t i = 0; i < argument_size; ++i) { argument_columns[i] = block.get_by_position(arguments[i]).column->convert_to_full_column_if_const(); if (auto* nullable = check_and_get_column(*argument_columns[i])) { // Danger: Here must dispose the null map data first! 
Because // argument_columns[i]=nullable->get_nested_column_ptr(); will release the mem // of column nullable mem of null map VectorizedUtils::update_null_map(null_map->get_data(), nullable->get_null_map_data()); argument_columns[i] = nullable->get_nested_column_ptr(); } } auto& null_map_data = null_map->get_data(); auto& res_offsets = res->get_offsets(); auto& res_chars = res->get_chars(); res_offsets.resize(input_rows_count); auto strcol = assert_cast(argument_columns[0].get()); auto& strcol_offsets = strcol->get_offsets(); auto& strcol_chars = strcol->get_chars(); auto col_len = assert_cast(argument_columns[1].get()); auto& col_len_data = col_len->get_data(); auto padcol = assert_cast(argument_columns[2].get()); auto& padcol_offsets = padcol->get_offsets(); auto& padcol_chars = padcol->get_chars(); std::vector str_index; std::vector pad_index; fmt::memory_buffer buffer; for (size_t i = 0; i < input_rows_count; ++i) { str_index.clear(); pad_index.clear(); buffer.clear(); if (null_map_data[i] || col_len_data[i] < 0) { // return NULL when input string is NULL or input length is invalid number null_map_data[i] = true; StringOP::push_empty_string(i, res_chars, res_offsets); } else { int str_len = strcol_offsets[i] - strcol_offsets[i - 1] - 1; const char* str_data = reinterpret_cast(&strcol_chars[strcol_offsets[i - 1]]); int pad_len = padcol_offsets[i] - padcol_offsets[i - 1] - 1; const char* pad_data = reinterpret_cast(&padcol_chars[padcol_offsets[i - 1]]); size_t str_char_size = get_char_len(std::string_view(str_data, str_len), &str_index); size_t pad_char_size = get_char_len(std::string_view(pad_data, pad_len), &pad_index); if (col_len_data[i] <= str_char_size) { // truncate the input string if (col_len_data[i] < str_char_size) { buffer.append(str_data, str_data + str_index[col_len_data[i]]); } else { buffer.append(str_data, str_data + str_len); } StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_chars, res_offsets); continue; } if 
(pad_char_size == 0) { // return NULL when the string to be paded is missing null_map_data[i] = true; StringOP::push_empty_string(i, res_chars, res_offsets); continue; } int32_t pad_byte_len = 0; int32_t pad_times = (col_len_data[i] - str_char_size) / pad_char_size; int32_t pad_remainder = (col_len_data[i] - str_char_size) % pad_char_size; pad_byte_len = pad_times * pad_len; pad_byte_len += pad_index[pad_remainder]; int32_t byte_len = str_len + pad_byte_len; // StringVal result(context, byte_len); if constexpr (Impl::is_lpad) { int pad_idx = 0; int result_index = 0; // Prepend chars of pad. while (result_index++ < pad_byte_len) { buffer.push_back(pad_data[pad_idx++]); pad_idx = pad_idx % pad_len; } // Append given string. buffer.append(str_data, str_data + str_len); StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_chars, res_offsets); } else { // is rpad buffer.append(str_data, str_data + str_len); // Append chars of pad until desired length int pad_idx = 0; int result_len = str_len; while (result_len++ < byte_len) { buffer.push_back(pad_data[pad_idx++]); pad_idx = pad_idx % pad_len; } StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_chars, res_offsets); } } } block.get_by_position(result).column = ColumnNullable::create(std::move(res), std::move(null_map)); return Status::OK(); } }; class FunctionSplitPart : public IFunction { public: static constexpr auto name = "split_part"; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 3; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return make_nullable(std::make_shared()); } bool use_default_implementation_for_nulls() const override { return false; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& 
arguments, size_t result, size_t input_rows_count) override { DCHECK_EQ(arguments.size(), 3); auto null_map = ColumnUInt8::create(input_rows_count, 0); // Create a zero column to simply implement auto const_null_map = ColumnUInt8::create(input_rows_count, 0); auto res = ColumnString::create(); auto& null_map_data = null_map->get_data(); auto& res_offsets = res->get_offsets(); auto& res_chars = res->get_chars(); res_offsets.resize(input_rows_count); size_t argument_size = arguments.size(); ColumnPtr argument_columns[argument_size]; for (size_t i = 0; i < argument_size; ++i) { argument_columns[i] = block.get_by_position(arguments[i]).column->convert_to_full_column_if_const(); if (auto* nullable = check_and_get_column(*argument_columns[i])) { // Danger: Here must dispose the null map data first! Because // argument_columns[i]=nullable->get_nested_column_ptr(); will release the mem // of column nullable mem of null map VectorizedUtils::update_null_map(null_map->get_data(), nullable->get_null_map_data()); argument_columns[i] = nullable->get_nested_column_ptr(); } } auto str_col = assert_cast(argument_columns[0].get()); auto delimiter_col = assert_cast(argument_columns[1].get()); auto part_num_col = assert_cast(argument_columns[2].get()); auto& part_num_col_data = part_num_col->get_data(); for (size_t i = 0; i < input_rows_count; ++i) { if (part_num_col_data[i] <= 0) { StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); continue; } auto delimiter = delimiter_col->get_data_at(i); auto delimiter_str = delimiter_col->get_data_at(i).to_string(); auto part_number = part_num_col_data[i]; auto str = str_col->get_data_at(i); if (delimiter.size == 0) { StringOP::push_empty_string(i, res_chars, res_offsets); } else if (delimiter.size == 1) { // If delimiter is a char, use memchr to split int32_t pre_offset = -1; int32_t offset = -1; int32_t num = 0; while (num < part_number) { pre_offset = offset; size_t n = str.size - offset - 1; const char* pos = 
reinterpret_cast( memchr(str.data + offset + 1, delimiter_str[0], n)); if (pos != nullptr) { offset = pos - str.data; num++; } else { offset = str.size; num = (num == 0) ? 0 : num + 1; break; } } if (num == part_number) { StringOP::push_value_string( std::string_view { reinterpret_cast(str.data + pre_offset + 1), (size_t)offset - pre_offset - 1}, i, res_chars, res_offsets); } else { StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); } } else { // If delimiter is a string, use memmem to split int32_t pre_offset = -delimiter.size; int32_t offset = -delimiter.size; int32_t num = 0; while (num < part_number) { pre_offset = offset; size_t n = str.size - offset - delimiter.size; char* pos = reinterpret_cast(memmem(str.data + offset + delimiter.size, n, delimiter.data, delimiter.size)); if (pos != nullptr) { offset = pos - str.data; num++; } else { offset = str.size; num = (num == 0) ? 0 : num + 1; break; } } if (num == part_number) { StringOP::push_value_string( std::string_view {reinterpret_cast(str.data + pre_offset + delimiter.size), (size_t)offset - pre_offset - delimiter.size}, i, res_chars, res_offsets); } else { StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); } } } block.get_by_position(result).column = ColumnNullable::create(std::move(res), std::move(null_map)); return Status::OK(); } }; struct SM3Sum { static constexpr auto name = "sm3sum"; using ObjectData = SM3Digest; }; struct MD5Sum { static constexpr auto name = "md5sum"; using ObjectData = Md5Digest; }; template class FunctionStringMd5AndSM3 : public IFunction { public: static constexpr auto name = Impl::name; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 0; } bool is_variadic() const override { return true; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return std::make_shared(); } bool 
use_default_implementation_for_nulls() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { DCHECK_GE(arguments.size(), 1); int argument_size = arguments.size(); ColumnPtr argument_columns[argument_size]; std::vector offsets_list(argument_size); std::vector chars_list(argument_size); for (int i = 0; i < argument_size; ++i) { argument_columns[i] = block.get_by_position(arguments[i]).column->convert_to_full_column_if_const(); if (auto col_str = assert_cast(argument_columns[i].get())) { offsets_list[i] = &col_str->get_offsets(); chars_list[i] = &col_str->get_chars(); } else { return Status::RuntimeError(fmt::format( "Illegal column {} of argument of function {}", block.get_by_position(arguments[0]).column->get_name(), get_name())); } } auto res = ColumnString::create(); auto& res_data = res->get_chars(); auto& res_offset = res->get_offsets(); res_offset.resize(input_rows_count); for (size_t i = 0; i < input_rows_count; ++i) { using ObjectData = typename Impl::ObjectData; ObjectData digest; for (size_t j = 0; j < offsets_list.size(); ++j) { auto& current_offsets = *offsets_list[j]; auto& current_chars = *chars_list[j]; int size = current_offsets[i] - current_offsets[i - 1] - 1; if (size < 1) { continue; } digest.update(¤t_chars[current_offsets[i - 1]], size); } digest.digest(); StringOP::push_value_string(std::string_view(digest.hex().c_str(), digest.hex().size()), i, res_data, res_offset); } block.replace_by_position(result, std::move(res)); return Status::OK(); } }; class FunctionStringParseUrl : public IFunction { public: static constexpr auto name = "parse_url"; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 0; } bool is_variadic() const override { return true; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const 
override { return make_nullable(std::make_shared()); } bool use_default_implementation_for_nulls() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { auto null_map = ColumnUInt8::create(input_rows_count, 0); auto& null_map_data = null_map->get_data(); auto res = ColumnString::create(); auto& res_offsets = res->get_offsets(); auto& res_chars = res->get_chars(); res_offsets.resize(input_rows_count); size_t argument_size = arguments.size(); bool has_key = argument_size >= 3; ColumnPtr argument_columns[argument_size]; for (size_t i = 0; i < argument_size; ++i) { argument_columns[i] = block.get_by_position(arguments[i]).column->convert_to_full_column_if_const(); } const auto* url_col = check_and_get_column(argument_columns[0].get()); const auto* part_col = check_and_get_column(argument_columns[1].get()); const ColumnString* key_col = nullptr; if (has_key) { key_col = check_and_get_column(argument_columns[2].get()); } if (!url_col || !part_col || (has_key && !key_col)) { return Status::InternalError("Not supported input arguments types"); } for (size_t i = 0; i < input_rows_count; ++i) { if (null_map_data[i]) { StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); continue; } auto part = part_col->get_data_at(i); StringValue p(const_cast(part.data), part.size); UrlParser::UrlPart url_part = UrlParser::get_url_part(p); StringValue url_key; if (has_key) { auto key = key_col->get_data_at(i); url_key = StringValue(const_cast(key.data), key.size); } auto source = url_col->get_data_at(i); StringValue url_val(const_cast(source.data), source.size); StringValue parse_res; bool success = false; if (has_key) { success = UrlParser::parse_url_key(url_val, url_part, url_key, &parse_res); } else { success = UrlParser::parse_url(url_val, url_part, &parse_res); } if (!success) { // url is malformed, or url_part is invalid. 
if (url_part == UrlParser::INVALID) { return Status::RuntimeError(fmt::format( "Invalid URL part: {}\n{}", std::string(part.data, part.size), "(Valid URL parts are 'PROTOCOL', 'HOST', 'PATH', 'REF', 'AUTHORITY', " "'FILE', 'USERINFO', 'PORT' and 'QUERY')")); } else { StringOP::push_null_string(i, res_chars, res_offsets, null_map_data); continue; } } StringOP::push_value_string(std::string_view(parse_res.ptr, parse_res.len), i, res_chars, res_offsets); } block.get_by_position(result).column = ColumnNullable::create(std::move(res), std::move(null_map)); return Status::OK(); } }; template class FunctionMoneyFormat : public IFunction { public: static constexpr auto name = "money_format"; static FunctionPtr create() { return std::make_shared>(); } String get_name() const override { return name; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return std::make_shared(); } DataTypes get_variadic_argument_types_impl() const override { return Impl::get_variadic_argument_types(); } size_t get_number_of_arguments() const override { return 1; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { auto res_column = ColumnString::create(); ColumnPtr argument_column = block.get_by_position(arguments[0]).column; auto result_column = assert_cast(res_column.get()); auto data_column = assert_cast(argument_column.get()); Impl::execute(context, result_column, data_column, input_rows_count); block.replace_by_position(result, std::move(res_column)); return Status::OK(); } }; struct MoneyFormatDoubleImpl { using ColumnType = ColumnVector; static DataTypes get_variadic_argument_types() { return {std::make_shared()}; } static void execute(FunctionContext* context, ColumnString* result_column, const ColumnType* data_column, size_t input_rows_count) { for (size_t i = 0; i < input_rows_count; 
i++) { double value = MathFunctions::my_double_round(data_column->get_element(i), 2, false, false); StringVal str = StringFunctions::do_money_format(context, fmt::format("{:.2f}", value)); result_column->insert_data(reinterpret_cast(str.ptr), str.len); } } }; struct MoneyFormatInt64Impl { using ColumnType = ColumnVector; static DataTypes get_variadic_argument_types() { return {std::make_shared()}; } static void execute(FunctionContext* context, ColumnString* result_column, const ColumnType* data_column, size_t input_rows_count) { for (size_t i = 0; i < input_rows_count; i++) { Int64 value = data_column->get_element(i); StringVal str = StringFunctions::do_money_format(context, value); result_column->insert_data(reinterpret_cast(str.ptr), str.len); } } }; struct MoneyFormatInt128Impl { using ColumnType = ColumnVector; static DataTypes get_variadic_argument_types() { return {std::make_shared()}; } static void execute(FunctionContext* context, ColumnString* result_column, const ColumnType* data_column, size_t input_rows_count) { for (size_t i = 0; i < input_rows_count; i++) { Int128 value = data_column->get_element(i); StringVal str = StringFunctions::do_money_format(context, value); result_column->insert_data(reinterpret_cast(str.ptr), str.len); } } }; struct MoneyFormatDecimalImpl { using ColumnType = ColumnDecimal; static DataTypes get_variadic_argument_types() { return {std::make_shared>(27, 9)}; } static void execute(FunctionContext* context, ColumnString* result_column, const ColumnType* data_column, size_t input_rows_count) { for (size_t i = 0; i < input_rows_count; i++) { DecimalV2Val value = DecimalV2Val(data_column->get_element(i)); DecimalV2Value rounded(0); DecimalV2Value::from_decimal_val(value).round(&rounded, 2, HALF_UP); StringVal str = StringFunctions::do_money_format( context, rounded.int_value(), abs(rounded.frac_value() / 10000000)); result_column->insert_data(reinterpret_cast(str.ptr), str.len); } } }; class FunctionStringLocatePos : public 
IFunction { public: static constexpr auto name = "locate"; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 3; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return std::make_shared(); } DataTypes get_variadic_argument_types_impl() const override { return {std::make_shared(), std::make_shared(), std::make_shared()}; } bool is_variadic() const override { return true; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { auto col_substr = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); auto col_str = block.get_by_position(arguments[1]).column->convert_to_full_column_if_const(); auto col_pos = block.get_by_position(arguments[2]).column->convert_to_full_column_if_const(); ColumnInt32::MutablePtr col_res = ColumnInt32::create(); auto& vec_pos = reinterpret_cast(col_pos.get())->get_data(); auto& vec_res = col_res->get_data(); vec_res.resize(input_rows_count); for (int i = 0; i < input_rows_count; ++i) { vec_res[i] = locate_pos(col_substr->get_data_at(i).to_string_val(), col_str->get_data_at(i).to_string_val(), vec_pos[i]); } block.replace_by_position(result, std::move(col_res)); return Status::OK(); } private: int locate_pos(StringVal substr, StringVal str, int start_pos) { if (substr.len == 0) { if (start_pos <= 0) { return 0; } else if (start_pos == 1) { return 1; } else if (start_pos > str.len) { return 0; } else { return start_pos; } } // Hive returns 0 for *start_pos <= 0, // but throws an exception for *start_pos > str->len. // Since returning 0 seems to be Hive's error condition, return 0. 
std::vector index; size_t char_len = get_char_len(str, &index); if (start_pos <= 0 || start_pos > str.len || start_pos > char_len) { return 0; } StringValue substr_sv = StringValue::from_string_val(substr); StringSearch search(&substr_sv); // Input start_pos starts from 1. StringValue adjusted_str(reinterpret_cast(str.ptr) + index[start_pos - 1], str.len - index[start_pos - 1]); int32_t match_pos = search.search(&adjusted_str); if (match_pos >= 0) { // Hive returns the position in the original string starting from 1. return start_pos + get_char_len(adjusted_str, match_pos); } else { return 0; } } }; class FunctionReplace : public IFunction { public: static constexpr auto name = "replace"; static FunctionPtr create() { return std::make_shared(); } String get_name() const override { return name; } size_t get_number_of_arguments() const override { return 3; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return std::make_shared(); } DataTypes get_variadic_argument_types_impl() const override { return {std::make_shared(), std::make_shared(), std::make_shared()}; } bool use_default_implementation_for_constants() const override { return true; } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { auto col_origin = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); auto col_old = block.get_by_position(arguments[1]).column->convert_to_full_column_if_const(); auto col_new = block.get_by_position(arguments[2]).column->convert_to_full_column_if_const(); ColumnString::MutablePtr col_res = ColumnString::create(); for (int i = 0; i < input_rows_count; ++i) { StringRef origin_str = assert_cast(col_origin.get())->get_data_at(i); StringRef old_str = assert_cast(col_old.get())->get_data_at(i); StringRef new_str = assert_cast(col_new.get())->get_data_at(i); std::string result = replace(origin_str.to_string(), old_str.to_string_view(), 
new_str.to_string_view()); col_res->insert_data(result.data(), result.length()); } block.replace_by_position(result, std::move(col_res)); return Status::OK(); } private: std::string replace(std::string str, std::string_view old_str, std::string_view new_str) { std::string::size_type pos = 0; std::string::size_type oldLen = old_str.size(); std::string::size_type newLen = new_str.size(); while ((pos = str.find(old_str, pos)) != std::string::npos) { str.replace(pos, oldLen, new_str); pos += newLen; } return str; } }; } // namespace doris::vectorized