From e1654bc6ef8c2a5f1e061ee26c1b5fb2d3fa275a Mon Sep 17 00:00:00 2001 From: Zhengguo Yang Date: Tue, 8 Nov 2022 09:15:26 +0800 Subject: [PATCH] [Enhancement](function) add to_bitmap() function with int type (#13973) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, to_bitmap only supported a string parameter. Add a to_bitmap() overload that takes an int type, which avoids converting the int to a string and then parsing the string back to an int. --- be/src/exprs/bitmap_function.cpp | 30 +++ be/src/exprs/bitmap_function.h | 2 + .../functions/function_always_not_nullable.h | 88 ++++--- be/src/vec/functions/function_bitmap.cpp | 231 +++++++++++------- be/src/vec/functions/hll_hash.cpp | 59 +++-- gensrc/script/doris_builtins_functions.py | 18 +- 6 files changed, 276 insertions(+), 152 deletions(-) diff --git a/be/src/exprs/bitmap_function.cpp b/be/src/exprs/bitmap_function.cpp index 1d4fc4df47..2b40bd7322 100644 --- a/be/src/exprs/bitmap_function.cpp +++ b/be/src/exprs/bitmap_function.cpp @@ -197,6 +197,36 @@ StringVal BitmapFunctions::to_bitmap_with_check(doris_udf::FunctionContext* ctx, return serialize(ctx, &bitmap); } +StringVal BitmapFunctions::to_bitmap(doris_udf::FunctionContext* ctx, + const doris_udf::BigIntVal& src) { + BitmapValue bitmap; + if (LIKELY(!src.is_null && src.val >= 0)) { + bitmap.add(src.val); + } + return serialize(ctx, &bitmap); +} + +StringVal BitmapFunctions::to_bitmap_with_check(doris_udf::FunctionContext* ctx, + const doris_udf::BigIntVal& src) { + BitmapValue bitmap; + + if (!src.is_null) { + if (LIKELY(src.val >= 0)) { + bitmap.add(src.val); + } else { + std::stringstream ss; + ss << "The input: " << src.val + << " is not valid, to_bitmap only support bigint value from 0 to " + "18446744073709551615 currently, cannot load negative values to column with" + " to_bitmap MV on it."; + ctx->set_error(ss.str().c_str()); + return StringVal::null(); + } + } + + return serialize(ctx, &bitmap); +} + StringVal 
BitmapFunctions::bitmap_hash(doris_udf::FunctionContext* ctx, const doris_udf::StringVal& src) { BitmapValue bitmap; diff --git a/be/src/exprs/bitmap_function.h b/be/src/exprs/bitmap_function.h index 7de2363d3a..a09c4b641d 100644 --- a/be/src/exprs/bitmap_function.h +++ b/be/src/exprs/bitmap_function.h @@ -69,6 +69,8 @@ public: static StringVal bitmap_serialize(FunctionContext* ctx, const StringVal& src); static StringVal to_bitmap(FunctionContext* ctx, const StringVal& src); static StringVal to_bitmap_with_check(FunctionContext* ctx, const StringVal& src); + static StringVal to_bitmap(FunctionContext* ctx, const BigIntVal& src); + static StringVal to_bitmap_with_check(FunctionContext* ctx, const BigIntVal& src); static StringVal bitmap_hash(FunctionContext* ctx, const StringVal& src); static StringVal bitmap_hash64(FunctionContext* ctx, const StringVal& src); static StringVal bitmap_or(FunctionContext* ctx, const StringVal& src, const StringVal& dst); diff --git a/be/src/vec/functions/function_always_not_nullable.h b/be/src/vec/functions/function_always_not_nullable.h index d7f437bcdd..a86fe4da95 100644 --- a/be/src/vec/functions/function_always_not_nullable.h +++ b/be/src/vec/functions/function_always_not_nullable.h @@ -41,9 +41,46 @@ public: bool use_default_implementation_for_constants() const override { return true; } bool use_default_implementation_for_nulls() const override { return false; } + template + Status execute_internal(const ColumnPtr& column, const DataTypePtr& data_type, + MutableColumnPtr& column_result) { + auto type_error = [&]() { + return Status::RuntimeError("Illegal column {} of argument of function {}", + column->get_name(), get_name()); + }; + if constexpr (is_nullable) { + const ColumnNullable* col_nullable = check_and_get_column(column.get()); + const ColumnType* col = + check_and_get_column(col_nullable->get_nested_column_ptr().get()); + const ColumnUInt8* col_nullmap = check_and_get_column( + 
col_nullable->get_null_map_column_ptr().get()); + + if (col != nullptr && col_nullmap != nullptr) { + if constexpr (WithReturn) { + RETURN_IF_ERROR( + Function::vector_nullable(col, col_nullmap->get_data(), column_result)); + } else { + Function::vector_nullable(col, col_nullmap->get_data(), column_result); + } + } else { + return type_error(); + } + } else { + const ColumnType* col = check_and_get_column(column.get()); + if constexpr (WithReturn) { + RETURN_IF_ERROR(Function::vector(col, column_result)); + } else { + Function::vector(col, column_result); + } + } + return Status::OK(); + } + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { - auto column = block.get_by_position(arguments[0]).column; + const ColumnPtr& column = block.get_by_position(arguments[0]).column; + const DataTypePtr& data_type = block.get_by_position(arguments[0]).type; + WhichDataType which(data_type); MutableColumnPtr column_result = get_return_type_impl({})->create_column(); column_result->resize(input_rows_count); @@ -53,44 +90,29 @@ public: block.get_by_position(arguments[0]).column->get_name(), get_name()); }; - - if (const ColumnNullable* col_nullable = - check_and_get_column(column.get())) { - const ColumnString* col = - check_and_get_column(col_nullable->get_nested_column_ptr().get()); - const ColumnUInt8* col_nullmap = check_and_get_column( - col_nullable->get_null_map_column_ptr().get()); - - if (col != nullptr && col_nullmap != nullptr) { - if constexpr (WithReturn) { - RETURN_IF_ERROR(Function::vector_nullable(col->get_chars(), col->get_offsets(), - col_nullmap->get_data(), - column_result)); - } else { - Function::vector_nullable(col->get_chars(), col->get_offsets(), - col_nullmap->get_data(), column_result); - } - - block.replace_by_position(result, std::move(column_result)); - return Status::OK(); + Status status = Status::OK(); + if (which.is_nullable()) { + const DataTypePtr& 
nested_data_type = + static_cast(data_type.get())->get_nested_type(); + WhichDataType nested_which(nested_data_type); + if (nested_which.is_string_or_fixed_string()) { + status = execute_internal(column, data_type, column_result); + } else if (nested_which.is_int64()) { + status = execute_internal(column, data_type, column_result); } else { return type_error(); } - } else if (const ColumnString* col = check_and_get_column(column.get())) { - if constexpr (WithReturn) { - RETURN_IF_ERROR( - Function::vector(col->get_chars(), col->get_offsets(), column_result)); - } else { - Function::vector(col->get_chars(), col->get_offsets(), column_result); - } - block.replace_by_position(result, std::move(column_result)); - return Status::OK(); + } else if (which.is_string_or_fixed_string()) { + status = execute_internal(column, data_type, column_result); + } else if (which.is_int64()) { + status = execute_internal(column, data_type, column_result); } else { return type_error(); } - - block.replace_by_position(result, std::move(column_result)); - return Status::OK(); + if (status.ok()) { + block.replace_by_position(result, std::move(column_result)); + } + return status; } }; diff --git a/be/src/vec/functions/function_bitmap.cpp b/be/src/vec/functions/function_bitmap.cpp index b4e6382d15..5f4483253b 100644 --- a/be/src/vec/functions/function_bitmap.cpp +++ b/be/src/vec/functions/function_bitmap.cpp @@ -43,34 +43,52 @@ struct ToBitmap { static constexpr auto name = "to_bitmap"; using ReturnType = DataTypeBitMap; - static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - MutableColumnPtr& col_res) { - execute(data, offsets, nullptr, col_res); + template + static void vector(const ColumnType* col, MutableColumnPtr& col_res) { + execute(col, nullptr, col_res); } - - static void vector_nullable(const ColumnString::Chars& data, - const ColumnString::Offsets& offsets, const NullMap& nullmap, + template + static void vector_nullable(const ColumnType* 
col, const NullMap& nullmap, MutableColumnPtr& col_res) { - execute(data, offsets, &nullmap, col_res); + execute(col, &nullmap, col_res); } - template - static void execute(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - const NullMap* nullmap, MutableColumnPtr& col_res) { - auto* res_column = reinterpret_cast(col_res.get()); - auto& res_data = res_column->get_data(); - size_t size = offsets.size(); + template + static void execute(const ColumnType* col, const NullMap* nullmap, MutableColumnPtr& col_res) { + if constexpr (std::is_same_v) { + const ColumnString::Chars& data = col->get_chars(); + const ColumnString::Offsets& offsets = col->get_offsets(); - for (size_t i = 0; i < size; ++i) { - if (arg_is_nullable && ((*nullmap)[i])) { - continue; - } else { - const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1]; - StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; - uint64_t int_value = StringParser::string_to_unsigned_int( - raw_str, str_size, &parse_result); - if (LIKELY(parse_result == StringParser::PARSE_SUCCESS)) { - res_data[i].add(int_value); + auto* res_column = reinterpret_cast(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = offsets.size(); + + for (size_t i = 0; i < size; ++i) { + if (arg_is_nullable && ((*nullmap)[i])) { + continue; + } else { + const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); + size_t str_size = offsets[i] - offsets[i - 1]; + StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; + uint64_t int_value = StringParser::string_to_unsigned_int( + raw_str, str_size, &parse_result); + if (LIKELY(parse_result == StringParser::PARSE_SUCCESS)) { + res_data[i].add(int_value); + } + } + } + } else if constexpr (std::is_same_v) { + auto* res_column = reinterpret_cast(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = col->size(); + + for (size_t i = 0; i < size; ++i) { + 
if (arg_is_nullable && ((*nullmap)[i])) { + continue; + } else { + int64_t int_value = col->get_data()[i]; + if (LIKELY(int_value >= 0)) { + res_data[i].add(int_value); + } } } } @@ -81,45 +99,76 @@ struct ToBitmapWithCheck { static constexpr auto name = "to_bitmap_with_check"; using ReturnType = DataTypeBitMap; - static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - MutableColumnPtr& col_res) { - return execute(data, offsets, nullptr, col_res); + template + static Status vector(const ColumnType* col, MutableColumnPtr& col_res) { + return execute(col, nullptr, col_res); } - - static Status vector_nullable(const ColumnString::Chars& data, - const ColumnString::Offsets& offsets, const NullMap& nullmap, + template + static Status vector_nullable(const ColumnType* col, const NullMap& nullmap, MutableColumnPtr& col_res) { - return execute(data, offsets, &nullmap, col_res); + return execute(col, &nullmap, col_res); } - template - static Status execute(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - const NullMap* nullmap, MutableColumnPtr& col_res) { - auto* res_column = reinterpret_cast(col_res.get()); - auto& res_data = res_column->get_data(); - size_t size = offsets.size(); + template + static Status execute(const ColumnType* col, const NullMap* nullmap, + MutableColumnPtr& col_res) { + if constexpr (std::is_same_v) { + const ColumnString::Chars& data = col->get_chars(); + const ColumnString::Offsets& offsets = col->get_offsets(); + auto* res_column = reinterpret_cast(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = offsets.size(); - for (size_t i = 0; i < size; ++i) { - if (arg_is_nullable && ((*nullmap)[i])) { - continue; - } else { - const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1]; - StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; - uint64_t int_value = StringParser::string_to_unsigned_int( - 
raw_str, str_size, &parse_result); - if (LIKELY(parse_result == StringParser::PARSE_SUCCESS)) { - res_data[i].add(int_value); + for (size_t i = 0; i < size; ++i) { + if (arg_is_nullable && ((*nullmap)[i])) { + continue; } else { - std::stringstream ss; - ss << "The input: " << std::string(raw_str, str_size) - << " is not valid, to_bitmap only support bigint value from 0 to " - "18446744073709551615 currently, cannot create MV with to_bitmap on " - "column with negative values or cannot load negative values to column " - "with to_bitmap MV on it."; - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); + const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); + size_t str_size = offsets[i] - offsets[i - 1]; + StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; + uint64_t int_value = StringParser::string_to_unsigned_int( + raw_str, str_size, &parse_result); + if (LIKELY(parse_result == StringParser::PARSE_SUCCESS)) { + res_data[i].add(int_value); + } else { + std::stringstream ss; + ss << "The input: " << std::string(raw_str, str_size) + << " is not valid, to_bitmap only support bigint value from 0 to " + "18446744073709551615 currently, cannot create MV with to_bitmap on " + "column with negative values or cannot load negative values to " + "column " + "with to_bitmap MV on it."; + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } } } + } else if constexpr (std::is_same_v) { + auto* res_column = reinterpret_cast(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = col->size(); + + for (size_t i = 0; i < size; ++i) { + if (arg_is_nullable && ((*nullmap)[i])) { + continue; + } else { + int64_t int_value = col->get_data()[i]; + if (LIKELY(int_value >= 0)) { + res_data[i].add(int_value); + } else { + std::stringstream ss; + ss << "The input: " << int_value + << " is not valid, to_bitmap only support bigint value from 0 to " + "18446744073709551615 currently, cannot create MV with 
to_bitmap on " + "column with negative values or cannot load negative values to " + "column " + "with to_bitmap MV on it."; + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + } + } + } else { + return Status::InternalError("not support type"); } return Status::OK(); } @@ -209,38 +258,16 @@ struct BitmapHash { using ReturnType = DataTypeBitMap; - static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - MutableColumnPtr& col_res) { - auto* res_column = reinterpret_cast(col_res.get()); - auto& res_data = res_column->get_data(); - size_t size = offsets.size(); + template + static void vector(const ColumnType* col, MutableColumnPtr& col_res) { + if constexpr (std::is_same_v) { + const ColumnString::Chars& data = col->get_chars(); + const ColumnString::Offsets& offsets = col->get_offsets(); + auto* res_column = reinterpret_cast(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = offsets.size(); - for (size_t i = 0; i < size; ++i) { - const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1]; - if constexpr (HashBits == 32) { - uint32_t hash_value = - HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); - res_data[i].add(hash_value); - } else { - uint64_t hash_value = 0; - murmur_hash3_x64_64(raw_str, str_size, 0, &hash_value); - res_data[i].add(hash_value); - } - } - } - - static void vector_nullable(const ColumnString::Chars& data, - const ColumnString::Offsets& offsets, const NullMap& nullmap, - MutableColumnPtr& col_res) { - auto* res_column = reinterpret_cast(col_res.get()); - auto& res_data = res_column->get_data(); - size_t size = offsets.size(); - - for (size_t i = 0; i < size; ++i) { - if (nullmap[i]) { - continue; - } else { + for (size_t i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); size_t str_size = offsets[i] - offsets[i - 1]; if constexpr (HashBits == 32) { @@ 
-255,6 +282,36 @@ struct BitmapHash { } } } + + template + static void vector_nullable(const ColumnType* col, const NullMap& nullmap, + MutableColumnPtr& col_res) { + if constexpr (std::is_same_v) { + const ColumnString::Chars& data = col->get_chars(); + const ColumnString::Offsets& offsets = col->get_offsets(); + auto* res_column = reinterpret_cast(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = offsets.size(); + + for (size_t i = 0; i < size; ++i) { + if (nullmap[i]) { + continue; + } else { + const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); + size_t str_size = offsets[i] - offsets[i - 1]; + if constexpr (HashBits == 32) { + uint32_t hash_value = HashUtil::murmur_hash3_32(raw_str, str_size, + HashUtil::MURMUR3_32_SEED); + res_data[i].add(hash_value); + } else { + uint64_t hash_value = 0; + murmur_hash3_x64_64(raw_str, str_size, 0, &hash_value); + res_data[i].add(hash_value); + } + } + } + } + } }; class FunctionBitmapCount : public IFunction { diff --git a/be/src/vec/functions/hll_hash.cpp b/be/src/vec/functions/hll_hash.cpp index 66e2cf089b..715014dd5d 100644 --- a/be/src/vec/functions/hll_hash.cpp +++ b/be/src/vec/functions/hll_hash.cpp @@ -26,33 +26,16 @@ struct HLLHash { static constexpr auto name = "hll_hash"; using ReturnType = DataTypeHLL; + template + static void vector(const ColumnType* col, MutableColumnPtr& col_res) { + if constexpr (std::is_same_v) { + const ColumnString::Chars& data = col->get_chars(); + const ColumnString::Offsets& offsets = col->get_offsets(); + auto* res_column = reinterpret_cast(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = offsets.size(); - static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - MutableColumnPtr& col_res) { - auto* res_column = reinterpret_cast(col_res.get()); - auto& res_data = res_column->get_data(); - size_t size = offsets.size(); - - for (size_t i = 0; i < size; ++i) { - const char* raw_str = 
reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1]; - uint64_t hash_value = - HashUtil::murmur_hash64A(raw_str, str_size, HashUtil::MURMUR_SEED); - res_data[i].update(hash_value); - } - } - - static void vector_nullable(const ColumnString::Chars& data, - const ColumnString::Offsets& offsets, const NullMap& nullmap, - MutableColumnPtr& col_res) { - auto* res_column = reinterpret_cast(col_res.get()); - auto& res_data = res_column->get_data(); - size_t size = offsets.size(); - - for (size_t i = 0; i < size; ++i) { - if (nullmap[i]) { - continue; - } else { + for (size_t i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); size_t str_size = offsets[i] - offsets[i - 1]; uint64_t hash_value = @@ -61,6 +44,30 @@ struct HLLHash { } } } + + template + static void vector_nullable(const ColumnType* col, const NullMap& nullmap, + MutableColumnPtr& col_res) { + if constexpr (std::is_same_v) { + const ColumnString::Chars& data = col->get_chars(); + const ColumnString::Offsets& offsets = col->get_offsets(); + auto* res_column = reinterpret_cast(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = offsets.size(); + + for (size_t i = 0; i < size; ++i) { + if (nullmap[i]) { + continue; + } else { + const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); + size_t str_size = offsets[i] - offsets[i - 1]; + uint64_t hash_value = + HashUtil::murmur_hash64A(raw_str, str_size, HashUtil::MURMUR_SEED); + res_data[i].update(hash_value); + } + } + } + } }; using FunctionHLLHash = FunctionAlwaysNotNullable; diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index 2b19b9c182..cc14485d08 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -2509,18 +2509,24 @@ visible_functions = [ [['to_bitmap_with_check'], 'BITMAP', ['VARCHAR'], 
'_ZN5doris15BitmapFunctions20to_bitmap_with_checkEPN9doris_udf15FunctionContextERKNS1_9StringValE', '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], - [['bitmap_hash'], 'BITMAP', ['VARCHAR'], - '_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], - [['bitmap_hash64'], 'BITMAP', ['VARCHAR'], - '_ZN5doris15BitmapFunctions11bitmap_hash64EPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['to_bitmap'], 'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions9to_bitmapEPN9doris_udf15FunctionContextERKNS1_9StringValE', '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['to_bitmap_with_check'], 'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions20to_bitmap_with_checkEPN9doris_udf15FunctionContextERKNS1_9StringValE', '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], + [['to_bitmap'], 'BITMAP', ['BIGINT'], + '_ZN5doris15BitmapFunctions9to_bitmapEPN9doris_udf15FunctionContextERKNS1_9BigIntValE', + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], + [['to_bitmap_with_check'], 'BITMAP', ['BIGINT'], + '_ZN5doris15BitmapFunctions20to_bitmap_with_checkEPN9doris_udf15FunctionContextERKNS1_9BigIntValE', + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], + [['bitmap_hash'], 'BITMAP', ['VARCHAR'], + '_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE', + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], + [['bitmap_hash64'], 'BITMAP', ['VARCHAR'], + '_ZN5doris15BitmapFunctions11bitmap_hash64EPN9doris_udf15FunctionContextERKNS1_9StringValE', + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['bitmap_hash'], 'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions11bitmap_hash64EPN9doris_udf15FunctionContextERKNS1_9StringValE', '', '', 'vec', 'ALWAYS_NOT_NULLABLE'],