diff --git a/be/src/exprs/bitmap_function.cpp b/be/src/exprs/bitmap_function.cpp index aa6888091f..7e405f3f6a 100644 --- a/be/src/exprs/bitmap_function.cpp +++ b/be/src/exprs/bitmap_function.cpp @@ -306,7 +306,8 @@ void BitmapFunctions::bitmap_union(FunctionContext* ctx, const StringVal& src, S // the dst value could be null void BitmapFunctions::nullable_bitmap_init(FunctionContext* ctx, StringVal* dst) { - dst->is_null = true; + dst->ptr = nullptr; + dst->len = 0; } void BitmapFunctions::bitmap_intersect(FunctionContext* ctx, const StringVal& src, StringVal* dst) { @@ -314,7 +315,7 @@ void BitmapFunctions::bitmap_intersect(FunctionContext* ctx, const StringVal& sr return; } // if dst is null, the src input is the first value - if (dst->is_null) { + if (UNLIKELY(dst->ptr == nullptr)) { dst->is_null = false; dst->len = sizeof(BitmapValue); dst->ptr = (uint8_t*)new BitmapValue((char*)src.ptr); @@ -358,21 +359,17 @@ BigIntVal BitmapFunctions::bitmap_min(FunctionContext* ctx, const StringVal& src StringVal BitmapFunctions::to_bitmap(doris_udf::FunctionContext* ctx, const doris_udf::StringVal& src) { - BitmapValue bitmap; - if (!src.is_null) { - StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; - uint64_t int_value = StringParser::string_to_unsigned_int( - reinterpret_cast(src.ptr), src.len, &parse_result); - if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) { - std::stringstream error_msg; - error_msg << "The input: " << std::string(reinterpret_cast(src.ptr), src.len) - << " is not valid, to_bitmap only support bigint value from 0 to " - "18446744073709551615 currently"; - ctx->set_error(error_msg.str().c_str()); - return StringVal::null(); - } - bitmap.add(int_value); + if (src.is_null) { + return StringVal::null(); } + StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; + uint64_t int_value = StringParser::string_to_unsigned_int( + reinterpret_cast(src.ptr), src.len, &parse_result); + if (UNLIKELY(parse_result != 
StringParser::PARSE_SUCCESS)) { + return StringVal::null(); + } + BitmapValue bitmap; + bitmap.add(int_value); return serialize(ctx, &bitmap); } @@ -473,8 +470,8 @@ StringVal BitmapFunctions::bitmap_or(FunctionContext* ctx, const StringVal& lhs, return serialize(ctx, &bitmap); } -StringVal BitmapFunctions::bitmap_or(FunctionContext* ctx, const StringVal& lhs, - int num_args, const StringVal* bitmap_strs) { +StringVal BitmapFunctions::bitmap_or(FunctionContext* ctx, const StringVal& lhs, int num_args, + const StringVal* bitmap_strs) { DCHECK_GE(num_args, 1); if (lhs.is_null || bitmap_strs->is_null) { return StringVal::null(); @@ -518,8 +515,8 @@ StringVal BitmapFunctions::bitmap_and(FunctionContext* ctx, const StringVal& lhs return serialize(ctx, &bitmap); } -StringVal BitmapFunctions::bitmap_and(FunctionContext* ctx, const StringVal& lhs, - int num_args, const StringVal* bitmap_strs) { +StringVal BitmapFunctions::bitmap_and(FunctionContext* ctx, const StringVal& lhs, int num_args, + const StringVal* bitmap_strs) { DCHECK_GE(num_args, 1); if (lhs.is_null || bitmap_strs->is_null) { return StringVal::null(); @@ -562,8 +559,8 @@ BigIntVal BitmapFunctions::bitmap_and_count(FunctionContext* ctx, const StringVa } } -BigIntVal BitmapFunctions::bitmap_and_count(FunctionContext* ctx, const StringVal& lhs, int num_args, - const StringVal* bitmap_strs) { +BigIntVal BitmapFunctions::bitmap_and_count(FunctionContext* ctx, const StringVal& lhs, + int num_args, const StringVal* bitmap_strs) { DCHECK_GE(num_args, 1); if (lhs.is_null || bitmap_strs->is_null) { return BigIntVal::null(); @@ -653,8 +650,8 @@ StringVal BitmapFunctions::bitmap_xor(FunctionContext* ctx, const StringVal& lhs return serialize(ctx, &bitmap); } -StringVal BitmapFunctions::bitmap_xor(FunctionContext* ctx, const StringVal& lhs, - int num_args, const StringVal* bitmap_strs) { +StringVal BitmapFunctions::bitmap_xor(FunctionContext* ctx, const StringVal& lhs, int num_args, + const StringVal* bitmap_strs) { 
DCHECK_GE(num_args, 1); if (lhs.is_null || bitmap_strs->is_null) { return StringVal::null(); @@ -697,8 +694,8 @@ BigIntVal BitmapFunctions::bitmap_xor_count(FunctionContext* ctx, const StringVa } } -BigIntVal BitmapFunctions::bitmap_xor_count(FunctionContext* ctx, const StringVal& lhs, int num_args, - const StringVal* bitmap_strs) { +BigIntVal BitmapFunctions::bitmap_xor_count(FunctionContext* ctx, const StringVal& lhs, + int num_args, const StringVal* bitmap_strs) { DCHECK_GE(num_args, 1); if (lhs.is_null || bitmap_strs->is_null) { return BigIntVal::null(); diff --git a/be/src/exprs/hll_function.cpp b/be/src/exprs/hll_function.cpp index af8b3e162a..f363bd878a 100644 --- a/be/src/exprs/hll_function.cpp +++ b/be/src/exprs/hll_function.cpp @@ -99,7 +99,7 @@ BigIntVal HllFunctions::hll_get_value(FunctionContext*, const StringVal& src) { BigIntVal HllFunctions::hll_cardinality(FunctionContext* ctx, const StringVal& input) { if (input.is_null) { - return BigIntVal::null(); + return BigIntVal(); } StringVal dst; hll_init(ctx, &dst); diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h index 4d72f070bb..939421656e 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h @@ -32,18 +32,26 @@ struct AggregateFunctionBitmapUnionOp { static constexpr auto name = "bitmap_union"; template - static void add(BitmapValue& res, const T& data) { + static void add(BitmapValue& res, const T& data, bool& is_first) { res.add(data); } - static void add(BitmapValue& res, const BitmapValue& data) { res |= data; } + static void add(BitmapValue& res, const BitmapValue& data, bool& is_first) { res |= data; } static void merge(BitmapValue& res, const BitmapValue& data) { res |= data; } }; struct AggregateFunctionBitmapIntersectOp { static constexpr auto name = "bitmap_intersect"; - static void add(BitmapValue& res, const BitmapValue& 
data) { res &= data; } + + static void add(BitmapValue& res, const BitmapValue& data, bool& is_first) { + if (UNLIKELY(is_first)) { + res = data; + is_first = false; + } else { + res &= data; + } + } static void merge(BitmapValue& res, const BitmapValue& data) { res &= data; } }; @@ -51,10 +59,11 @@ struct AggregateFunctionBitmapIntersectOp { template struct AggregateFunctionBitmapData { BitmapValue value; + bool is_first = true; template void add(const T& data) { - Op::add(value, data); + Op::add(value, data, is_first); } void merge(const BitmapValue& data) { Op::merge(value, data); } diff --git a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.cpp b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.cpp index 3b2aba0552..1d9219c704 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.cpp @@ -22,6 +22,7 @@ namespace doris::vectorized { +template AggregateFunctionPtr create_aggregate_function_HLL_union_agg(const std::string& name, const DataTypes& argument_types, const Array& parameters, @@ -29,9 +30,12 @@ AggregateFunctionPtr create_aggregate_function_HLL_union_agg(const std::string& assert_no_parameters(name, parameters); assert_arity_at_most<1>(name, argument_types); - return std::make_shared(argument_types); + return std::make_shared>>>( + argument_types); } +template AggregateFunctionPtr create_aggregate_function_HLL_union(const std::string& name, const DataTypes& argument_types, const Array& parameters, @@ -39,13 +43,17 @@ AggregateFunctionPtr create_aggregate_function_HLL_union(const std::string& name assert_no_parameters(name, parameters); assert_arity_at_most<1>(name, argument_types); - return std::make_shared(argument_types); + return std::make_shared>>>(argument_types); } void register_aggregate_function_HLL_union_agg(AggregateFunctionSimpleFactory& factory) { - factory.register_function("hll_union_agg", 
create_aggregate_function_HLL_union_agg); - factory.register_function("hll_union", create_aggregate_function_HLL_union); - factory.register_function("hll_raw_agg", create_aggregate_function_HLL_union); + factory.register_function("hll_union_agg", create_aggregate_function_HLL_union_agg); + factory.register_function("hll_union_agg", create_aggregate_function_HLL_union_agg, true); + + factory.register_function("hll_union", create_aggregate_function_HLL_union); + factory.register_function("hll_union", create_aggregate_function_HLL_union, true); + factory.register_alias("hll_union", "hll_raw_agg"); } } // namespace doris::vectorized diff --git a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h index 72652c9623..fe335b0509 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h @@ -23,18 +23,17 @@ #include "vec/aggregate_functions/aggregate_function.h" #include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_hll.h" #include "vec/data_types/data_type_number.h" #include "vec/data_types/data_type_string.h" #include "vec/io/io_helper.h" -#include "vec/data_types/data_type_hll.h" namespace doris::vectorized { +template struct AggregateFunctionHLLData { HyperLogLog dst_hll {}; - void add(const HyperLogLog& src) { dst_hll.merge(src); } - void merge(const AggregateFunctionHLLData& rhs) { dst_hll.merge(rhs.dst_hll); } void write(BufferWritable& buf) const { @@ -52,32 +51,66 @@ struct AggregateFunctionHLLData { Int64 get_cardinality() const { return dst_hll.estimate_cardinality(); } - HyperLogLog get() const { - return dst_hll; - } + HyperLogLog get() const { return dst_hll; } + void add(const IColumn* column, size_t row_num) { + if constexpr (is_nullable) { + auto* nullable_column = check_and_get_column(*column); + if 
(nullable_column->is_null_at(row_num)) { + return; + } + const auto& sources = + static_cast((nullable_column->get_nested_column())); + dst_hll.merge(sources.get_element(row_num)); + + } else { + const auto& sources = static_cast(*column); + dst_hll.merge(sources.get_element(row_num)); + } + } }; -class AggregateFunctionHLLUnionAgg - : public IAggregateFunctionDataHelper { +template +struct AggregateFunctionHLLUnionImpl : Data { + void insert_result_into(IColumn& to) const { + assert_cast(to).get_data().emplace_back(this->get()); + } + + static DataTypePtr get_return_type() { return std::make_shared(); } + + static const char* name() { return "hll_union"; } +}; + +template +struct AggregateFunctionHLLUnionAggImpl : Data { + void insert_result_into(IColumn& to) const { + assert_cast(to).get_data().emplace_back(this->get_cardinality()); + } + + static DataTypePtr get_return_type() { return std::make_shared(); } + + static const char* name() { return "hll_union_agg"; } +}; + +template +class AggregateFunctionHLLUnion + : public IAggregateFunctionDataHelper> { public: - virtual String get_name() const override { return "hll_union_agg"; } + AggregateFunctionHLLUnion(const DataTypes& argument_types) + : IAggregateFunctionDataHelper>(argument_types, + {}) {} - AggregateFunctionHLLUnionAgg(const DataTypes& argument_types_) - : IAggregateFunctionDataHelper(argument_types_, {}) {} + String get_name() const override { return Data::name(); } - AggregateFunctionHLLUnionAgg(const IDataType& data_type, const DataTypes& argument_types_) - : IAggregateFunctionDataHelper(argument_types_, {}) {} + DataTypePtr get_return_type() const override { return Data::get_return_type(); } - virtual DataTypePtr get_return_type() const override { - return std::make_shared(); + void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { + this->data(place).insert_result_into(to); } void add(AggregateDataPtr __restrict place, const IColumn** columns, size_t row_num, 
- Arena*) const override { - const auto& column = static_cast(*columns[0]); - this->data(place).add(column.get_element(row_num)); + Arena* arena) const override { + this->data(place).add(columns[0], row_num); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, @@ -93,32 +126,9 @@ public: Arena*) const override { this->data(place).read(buf); } - - virtual void insert_result_into(ConstAggregateDataPtr __restrict place, - IColumn& to) const override { - auto& column = static_cast&>(to); - column.get_data().push_back(this->data(place).get_cardinality()); - } -}; - -class AggregateFunctionHLLUnion final : public AggregateFunctionHLLUnionAgg { -public: - String get_name() const override { return "hll_union"; } - - AggregateFunctionHLLUnion(const DataTypes& argument_types_) - : AggregateFunctionHLLUnionAgg {argument_types_} {} - - AggregateFunctionHLLUnion(const IDataType& data_type, const DataTypes& argument_types_) - : AggregateFunctionHLLUnionAgg(data_type, argument_types_) {} - - DataTypePtr get_return_type() const override { return std::make_shared(); } - - void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { - auto& column = static_cast(to); - column.get_data().emplace_back(this->data(place).get()); - } }; +template AggregateFunctionPtr create_aggregate_function_HLL_union(const std::string& name, const DataTypes& argument_types, const Array& parameters, diff --git a/be/src/vec/aggregate_functions/aggregate_function_reader.cpp b/be/src/vec/aggregate_functions/aggregate_function_reader.cpp index 3594d514d3..ce78397794 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_reader.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_reader.cpp @@ -32,7 +32,7 @@ void register_aggregate_function_reader(AggregateFunctionSimpleFactory& factory) register_function_reader("min", create_aggregate_function_min); register_function_reader("replace_if_not_null", 
create_aggregate_function_replace_if_not_null); register_function_reader("bitmap_union", create_aggregate_function_bitmap_union); - register_function_reader("hll_union", create_aggregate_function_HLL_union); + register_function_reader("hll_union", create_aggregate_function_HLL_union); } void register_aggregate_function_reader_no_spread(AggregateFunctionSimpleFactory& factory) { diff --git a/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp b/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp index 9a4d6d5957..c153d32e0b 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp @@ -54,7 +54,6 @@ AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() { register_aggregate_function_uniq(instance); register_aggregate_function_bitmap(instance); register_aggregate_function_combinator_distinct(instance); - register_aggregate_function_HLL_union_agg(instance); register_aggregate_function_reader(instance); // register aggregate function for agg reader register_aggregate_function_window_rank(instance); register_aggregate_function_stddev_variance(instance); @@ -68,6 +67,7 @@ AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() { register_aggregate_function_reader_no_spread(instance); register_aggregate_function_window_lead_lag(instance); + register_aggregate_function_HLL_union_agg(instance); }); return instance; } diff --git a/be/src/vec/functions/function_bit.cpp b/be/src/vec/functions/function_bit.cpp index 2a8db7b4a4..0f4fb87d27 100644 --- a/be/src/vec/functions/function_bit.cpp +++ b/be/src/vec/functions/function_bit.cpp @@ -22,6 +22,7 @@ #include "vec/functions/function_binary_arithmetic.h" #include "vec/functions/function_unary_arithmetic.h" #include "vec/functions/simple_function_factory.h" +#include "vec/functions/function_totype.h" namespace doris::vectorized { @@ -78,15 +79,39 @@ struct 
BitXorImpl { } }; +struct NameBitLength { + static constexpr auto name = "bit_length"; +}; + +struct BitLengthImpl { + using ReturnType = DataTypeInt32; + static constexpr auto TYPE_INDEX = TypeIndex::String; + using Type = String; + using ReturnColumnType = ColumnVector; + + static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, + PaddedPODArray& res) { + auto size = offsets.size(); + res.resize(size); + for (int i = 0; i < size; ++i) { + int str_size = offsets[i] - offsets[i - 1] - 1; + res[i] = (str_size * 8); + } + return Status::OK(); + } +}; + using FunctionBitAnd = FunctionBinaryArithmetic; using FunctionBitNot = FunctionUnaryArithmetic; using FunctionBitOr = FunctionBinaryArithmetic; using FunctionBitXor = FunctionBinaryArithmetic; +using FunctionBitLength = FunctionUnaryToType; void register_function_bit(SimpleFunctionFactory& factory) { factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); + factory.register_function(); } } // namespace doris::vectorized diff --git a/be/src/vec/functions/function_bitmap.cpp b/be/src/vec/functions/function_bitmap.cpp index 1fe114636d..614289e072 100644 --- a/be/src/vec/functions/function_bitmap.cpp +++ b/be/src/vec/functions/function_bitmap.cpp @@ -26,6 +26,7 @@ #include "vec/functions/function_string.h" #include "vec/functions/function_totype.h" #include "vec/functions/simple_function_factory.h" +#include "vec/functions/function_always_not_nullable.h" namespace doris::vectorized { @@ -36,18 +37,11 @@ struct BitmapEmpty { static auto init_value() { return BitmapValue {}; } }; -struct NameToBitmap { +struct ToBitmap { static constexpr auto name = "to_bitmap"; -}; - -struct ToBitmapImpl { - using ReturnType = DataTypeBitMap; - static constexpr auto TYPE_INDEX = TypeIndex::String; - using Type = String; - using ReturnColumnType = ColumnBitmap; static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& 
offsets, - std::vector& res) { + std::vector& res, NullMap& null_map) { auto size = offsets.size(); res.reserve(size); for (size_t i = 0; i < size; ++i) { @@ -56,15 +50,11 @@ struct ToBitmapImpl { StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; uint64_t int_value = StringParser::string_to_unsigned_int(raw_str, str_size, &parse_result); - - // TODO: which where cause problem in to_bitmap(null), rethink how to slove the problem - // of null - // if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) { - // return Status::RuntimeError( - // fmt::format("The input: {:.{}} is not valid, to_bitmap only support bigint " - // "value from 0 to 18446744073709551615 currently", - // raw_str, str_size)); - // } + if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) { + res.emplace_back(); + null_map[i] = 1; + continue; + } res.emplace_back(); res.back().add(int_value); } @@ -72,76 +62,154 @@ struct ToBitmapImpl { } }; -struct NameBitmapFromString { - static constexpr auto name = "bitmap_from_string"; -}; - struct BitmapFromString { - using ReturnType = DataTypeBitMap; - static constexpr auto TYPE_INDEX = TypeIndex::String; - using Type = String; - using ReturnColumnType = ColumnBitmap; + static constexpr auto name = "bitmap_from_string"; + static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - std::vector& res) { + std::vector& res, NullMap& null_map) { auto size = offsets.size(); res.reserve(size); std::vector bits; for (size_t i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - int str_size = offsets[i] - offsets[i - 1] - 1; - if (SplitStringAndParse({raw_str, str_size}, ",", &safe_strtou64, &bits)) { - res.emplace_back(bits); - } else { + int64_t str_size = offsets[i] - offsets[i - 1] - 1; + + if ((str_size > INT32_MAX) || + !(SplitStringAndParse({raw_str, (int)str_size}, ",", &safe_strtou64, &bits))) { res.emplace_back(); + null_map[i] = 1; + continue; } + 
res.emplace_back(bits); bits.clear(); } return Status::OK(); } }; -struct NameBitmapHash { - static constexpr auto name = "bitmap_hash"; -}; +template +class FunctionBitmapAlwaysNull : public IFunction { +public: + static constexpr auto name = Impl::name; -struct BitmapHash { - using ReturnType = DataTypeBitMap; - static constexpr auto TYPE_INDEX = TypeIndex::String; - using Type = String; - using ReturnColumnType = ColumnBitmap; - static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - std::vector& res) { - auto size = offsets.size(); - res.reserve(size); - for (size_t i = 0; i < size; ++i) { - const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1] - 1; - uint32_t hash_value = - HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); - res.emplace_back(); - res.back().add(hash_value); - } + String get_name() const override { return name; } + + static FunctionPtr create() { return std::make_shared(); } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return make_nullable(std::make_shared()); + } + + size_t get_number_of_arguments() const override { return 1; } + + bool use_default_implementation_for_nulls() const override { return true; } + + bool use_default_implementation_for_constants() const override { return true; } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) override { + auto res_null_map = ColumnUInt8::create(input_rows_count, 0); + auto res_data_column = ColumnBitmap::create(); + auto& null_map = res_null_map->get_data(); + auto& res = res_data_column->get_data(); + + ColumnPtr argument_column = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + const ColumnString* str_column = check_and_get_column(argument_column.get()); + const ColumnString::Chars& data = str_column->get_chars(); + const 
ColumnString::Offsets& offsets = str_column->get_offsets(); + + Impl::vector(data, offsets, res, null_map); + + block.get_by_position(result).column = + ColumnNullable::create(std::move(res_data_column), std::move(res_null_map)); return Status::OK(); } }; -struct NameBitmapCount { - static constexpr auto name = "bitmap_count"; +struct BitmapHash { + static constexpr auto name = "bitmap_hash"; + + using ReturnType = DataTypeBitMap; + + static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, + MutableColumnPtr& col_res) { + auto* res_column = reinterpret_cast(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = offsets.size(); + + for (size_t i = 0; i < size; ++i) { + const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); + size_t str_size = offsets[i] - offsets[i - 1] - 1; + uint32_t hash_value = + HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); + res_data[i].add(hash_value); + } + } + + static void vector_nullable(const ColumnString::Chars& data, + const ColumnString::Offsets& offsets, const NullMap& nullmap, + MutableColumnPtr& col_res) { + auto* res_column = reinterpret_cast(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = offsets.size(); + + for (size_t i = 0; i < size; ++i) { + if (nullmap[i]) { + continue; + } else { + const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); + size_t str_size = offsets[i] - offsets[i - 1] - 1; + uint32_t hash_value = + HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); + res_data[i].add(hash_value); + } + } + } }; -struct BitmapCount { - using ReturnType = DataTypeInt64; - static constexpr auto TYPE_INDEX = TypeIndex::BitMap; - using Type = DataTypeBitMap::FieldType; - using ReturnColumnType = ColumnVector; - using ReturnColumnContainer = ColumnVector::Container; +class FunctionBitmapCount : public IFunction { +public: + static constexpr auto name = "bitmap_count"; - static Status 
vector(const std::vector& data, ReturnColumnContainer& res) { - size_t size = data.size(); - res.reserve(size); - for (size_t i = 0; i < size; ++i) { - res.push_back(data[i].cardinality()); + String get_name() const override { return name; } + + static FunctionPtr create() { return std::make_shared(); } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return std::make_shared(); + } + + size_t get_number_of_arguments() const override { return 1; } + + bool use_default_implementation_for_nulls() const override { return false; } + + bool use_default_implementation_for_constants() const override { return true; } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) override { + auto res_data_column = ColumnInt64::create(); + auto& res = res_data_column->get_data(); + auto data_null_map = ColumnUInt8::create(input_rows_count, 0); + auto& null_map = data_null_map->get_data(); + + auto column = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + if (auto* nullable = check_and_get_column(*column)) { + VectorizedUtils::update_null_map(null_map, nullable->get_null_map_data()); + column = nullable->get_nested_column_ptr(); } + auto str_col = assert_cast(column.get()); + const auto& col_data = str_col->get_data(); + + res.reserve(input_rows_count); + for (size_t i = 0; i < input_rows_count; ++i) { + if (null_map[i]) { + res.push_back(0); + continue; + } + res.push_back(col_data[i].cardinality()); + } + block.replace_by_position(result, std::move(res_data_column)); return Status::OK(); } }; @@ -428,10 +496,9 @@ public: }; using FunctionBitmapEmpty = FunctionConst; -using FunctionToBitmap = FunctionUnaryToType; -using FunctionBitmapFromString = FunctionUnaryToType; -using FunctionBitmapHash = FunctionUnaryToType; -using FunctionBitmapCount = FunctionUnaryToType; +using FunctionToBitmap = FunctionBitmapAlwaysNull; +using 
FunctionBitmapFromString = FunctionBitmapAlwaysNull; +using FunctionBitmapHash = FunctionAlwaysNotNullable; using FunctionBitmapMin = FunctionBitmapSingle; using FunctionBitmapMax = FunctionBitmapSingle; diff --git a/be/src/vec/functions/function_encryption.cpp b/be/src/vec/functions/function_encryption.cpp index 3c84d44c90..f175735d35 100644 --- a/be/src/vec/functions/function_encryption.cpp +++ b/be/src/vec/functions/function_encryption.cpp @@ -102,7 +102,7 @@ static void exectue_result(std::vector& offsets_li int key_size = (*offsets_list[1])[i] - (*offsets_list[1])[i - 1] - 1; const auto key_raw = reinterpret_cast(&(*chars_list[1])[(*offsets_list[1])[i - 1]]); - if (*src_raw == '\0' || src_size == 0) { + if (*src_raw == '\0' && src_size == 0) { StringOP::push_null_string(i, result_data, result_offset, null_map); return; } diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index 0375994d67..730a6e4742 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -462,7 +462,7 @@ struct ToBase64Impl { auto source = reinterpret_cast(&data[offsets[i - 1]]); size_t srclen = offsets[i] - offsets[i - 1] - 1; - if (*source == '\0' || srclen == 0) { + if (*source == '\0' && srclen == 0) { StringOP::push_null_string(i, dst_data, dst_offsets, null_map); continue; } @@ -502,7 +502,7 @@ struct FromBase64Impl { auto source = reinterpret_cast(&data[offsets[i - 1]]); size_t srclen = offsets[i] - offsets[i - 1] - 1; - if (*source == '\0' || srclen == 0) { + if (*source == '\0' && srclen == 0) { StringOP::push_null_string(i, dst_data, dst_offsets, null_map); continue; } diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp index d366d6498f..0ce0cf7663 100644 --- a/be/src/vec/sink/vtablet_sink.cpp +++ b/be/src/vec/sink/vtablet_sink.cpp @@ -174,7 +174,7 @@ Status VOlapTableSink::_validate_data(RuntimeState* state, vectorized::Block* bl 
block->get_by_position(i).column->convert_to_full_column_if_const(); const auto& column = block->get_by_position(i).column; - if (desc->is_nullable() && desc->type() == TYPE_OBJECT) { + if (desc->type() == TYPE_OBJECT && column->is_nullable()) { const auto& null_map = vectorized::check_and_get_column(*column) ->get_null_map_data(); diff --git a/be/test/exprs/bitmap_function_test.cpp b/be/test/exprs/bitmap_function_test.cpp index d789164294..cffc9d71b2 100644 --- a/be/test/exprs/bitmap_function_test.cpp +++ b/be/test/exprs/bitmap_function_test.cpp @@ -86,24 +86,19 @@ TEST_F(BitmapFunctionsTest, to_bitmap_null) { StringVal input = StringVal::null(); StringVal result = BitmapFunctions::to_bitmap(ctx, input); - BitmapValue bitmap; - StringVal expected = convert_bitmap_to_string(ctx, bitmap); - - ASSERT_EQ(expected, result); + ASSERT_EQ(StringVal::null(), result); } TEST_F(BitmapFunctionsTest, to_bitmap_invalid_argument) { StringVal input = AnyValUtil::from_string_temp(ctx, std::string("-1")); StringVal result = BitmapFunctions::to_bitmap(ctx, input); ASSERT_EQ(StringVal::null(), result); - ASSERT_TRUE(ctx->has_error()); } TEST_F(BitmapFunctionsTest, to_bitmap_out_of_range) { StringVal input = AnyValUtil::from_string_temp(ctx, std::string("18446744073709551616")); StringVal result = BitmapFunctions::to_bitmap(ctx, input); ASSERT_EQ(StringVal::null(), result); - ASSERT_TRUE(ctx->has_error()); } TEST_F(BitmapFunctionsTest, bitmap_union_int) { diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index 47bfea4ee1..61b1e529bc 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ -1017,6 +1017,21 @@ TEST(function_string_test, function_replace) { check_function(func_name, input_types, data_set); } +TEST(function_string_test, function_bit_length_test) { + std::string func_name = "bit_length"; + InputTypeSet input_types = {TypeIndex::String}; + DataSet data_set = 
{{{Null()}, {Null()}}, + {{std::string("@!#")}, 24}, + {{std::string("")}, 0}, + {{std::string("ò&ø")}, 40}, + {{std::string("@@")}, 16}, + {{std::string("你好")}, 48}, + {{std::string("hello你好")}, 88}, + {{std::string("313233")}, 48}, + {{std::string("EFBC9F")}, 48}}; + check_function(func_name, input_types, data_set); +} + } // namespace doris::vectorized int main(int argc, char** argv) { diff --git a/docs/.vuepress/sidebar/en.js b/docs/.vuepress/sidebar/en.js index 823ce61b46..628022ef38 100644 --- a/docs/.vuepress/sidebar/en.js +++ b/docs/.vuepress/sidebar/en.js @@ -481,7 +481,8 @@ module.exports = [ "bitand", "bitor", "bitxor", - "bitnot" + "bitnot", + "bit_length" ], }, { diff --git a/docs/.vuepress/sidebar/zh-CN.js b/docs/.vuepress/sidebar/zh-CN.js index c77c5beeb5..886abe7808 100644 --- a/docs/.vuepress/sidebar/zh-CN.js +++ b/docs/.vuepress/sidebar/zh-CN.js @@ -483,7 +483,8 @@ module.exports = [ "bitand", "bitor", "bitxor", - "bitnot" + "bitnot", + "bit_length" ], }, { diff --git a/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md b/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md index 076ca91c7d..a2ea7903d7 100644 --- a/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md +++ b/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md @@ -30,7 +30,9 @@ under the License. `BITMAP TO_BITMAP(expr)` -Convert an unsigned bigint (ranging from 0 to 18446744073709551615) to a bitmap containing that value. Mainly be used to load integer value into bitmap column, e.g., +Convert an unsigned bigint (ranging from 0 to 18446744073709551615) to a bitmap containing that value. +NULL is returned when the input value is not in this range. 
+Mainly used to load integer values into a bitmap column, e.g., ``` cat data | curl --location-trusted -u user:passwd -T - -H "columns: dt,page,user_id, user_id=to_bitmap(user_id)" http://host:8410/api/test/testDb/_stream_load @@ -45,6 +47,13 @@ mysql> select bitmap_count(to_bitmap(10)); +-----------------------------+ | 1 | +-----------------------------+ + +MySQL> select bitmap_to_string(to_bitmap(-1)); ++---------------------------------+ +| bitmap_to_string(to_bitmap(-1)) | ++---------------------------------+ +| NULL | ++---------------------------------+ ``` ## keyword diff --git a/docs/en/sql-reference/sql-functions/bitwise-functions/bit_length.md b/docs/en/sql-reference/sql-functions/bitwise-functions/bit_length.md new file mode 100644 index 0000000000..9f56a1ff04 --- /dev/null +++ b/docs/en/sql-reference/sql-functions/bitwise-functions/bit_length.md @@ -0,0 +1,55 @@ +--- +{ +"title": "bit_length", +"language": "en" +} +--- + + + +# bit_length +## description +### Syntax + +`INT bit_length(VARCHAR str)` + +Returns the length of the argument in bits. + +## example + +``` +MySQL> select bit_length("doris"); ++---------------------+ +| bit_length('doris') | ++---------------------+ +| 40 | ++---------------------+ + +MySQL [test]> select bit_length("hello world"); ++---------------------------+ +| bit_length('hello world') | ++---------------------------+ +| 88 | ++---------------------------+ +``` + +## keyword + + bit_length diff --git a/docs/zh-CN/sql-reference/sql-functions/bitmap-functions/to_bitmap.md b/docs/zh-CN/sql-reference/sql-functions/bitmap-functions/to_bitmap.md index feae5d3fe9..022e6d7599 100644 --- a/docs/zh-CN/sql-reference/sql-functions/bitmap-functions/to_bitmap.md +++ b/docs/zh-CN/sql-reference/sql-functions/bitmap-functions/to_bitmap.md @@ -31,6 +31,7 @@ under the License. 
`BITMAP TO_BITMAP(expr)` 输入为取值在 0 ~ 18446744073709551615 区间的 unsigned bigint ,输出为包含该元素的bitmap。 +当输入值不在此范围时, 会返回NULL。 该函数主要用于stream load任务将整型字段导入Doris表的bitmap字段。例如 ``` @@ -46,6 +47,13 @@ mysql> select bitmap_count(to_bitmap(10)); +-----------------------------+ | 1 | +-----------------------------+ + +MySQL> select bitmap_to_string(to_bitmap(-1)); ++---------------------------------+ +| bitmap_to_string(to_bitmap(-1)) | ++---------------------------------+ +| NULL | ++---------------------------------+ ``` ## keyword diff --git a/docs/zh-CN/sql-reference/sql-functions/bitwise-functions/bit_length.md b/docs/zh-CN/sql-reference/sql-functions/bitwise-functions/bit_length.md new file mode 100644 index 0000000000..c0005fade1 --- /dev/null +++ b/docs/zh-CN/sql-reference/sql-functions/bitwise-functions/bit_length.md @@ -0,0 +1,55 @@ +--- +{ +"title": "bit_length", +"language": "zh-CN" +} +--- + + + +# bit_length +## description +### Syntax + +`INT bit_length(VARCHAR str)` + +返回字符串的bit位数 + +## example + +``` +MySQL> select bit_length("doris"); ++---------------------+ +| bit_length('doris') | ++---------------------+ +| 40 | ++---------------------+ + +MySQL [test]> select bit_length("hello world"); ++---------------------------+ +| bit_length('hello world') | ++---------------------------+ +| 88 | ++---------------------------+ +``` + +## keyword + + bit_length diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java index 82e4035fe4..d64df26bb7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java @@ -49,7 +49,7 @@ public class AggregateFunction extends Function { private static final Logger LOG = LogManager.getLogger(AggregateFunction.class); public static ImmutableSet NOT_NULLABLE_AGGREGATE_FUNCTION_NAME_SET = - ImmutableSet.of("row_number", "rank", 
"dense_rank", FunctionSet.COUNT, "ndv", FunctionSet.BITMAP_UNION_INT, FunctionSet.BITMAP_UNION_COUNT, "ndv_no_finalize"); + ImmutableSet.of("row_number", "rank", "dense_rank", "hll_union_agg", "hll_union", "bitmap_union", "bitmap_intersect", FunctionSet.COUNT, "ndv", FunctionSet.BITMAP_UNION_INT, FunctionSet.BITMAP_UNION_COUNT, "ndv_no_finalize"); // Set if different from retType_, null otherwise. private Type intermediateType; diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index bf1a895eb5..3a5e8d1598 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -1140,19 +1140,19 @@ visible_functions = [ [['to_bitmap'], 'BITMAP', ['VARCHAR'], '_ZN5doris15BitmapFunctions9to_bitmapEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NULLABLE'], [['bitmap_hash'], 'BITMAP', ['VARCHAR'], '_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['to_bitmap'], 'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions9to_bitmapEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NULLABLE'], [['bitmap_hash'], 'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['bitmap_count'], 'BIGINT', ['BITMAP'], '_ZN5doris15BitmapFunctions12bitmap_countEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['bitmap_and_not_count'], 'BIGINT', ['BITMAP','BITMAP'], '_ZN5doris15BitmapFunctions20bitmap_and_not_countEPN9doris_udf15FunctionContextERKNS1_9StringValES6_', '', '', 'vec', ''], @@ -1194,10 +1194,10 @@ visible_functions = [ '', '', 'vec', ''], [['bitmap_from_string'], 'BITMAP', ['VARCHAR'], 
'_ZN5doris15BitmapFunctions18bitmap_from_stringEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NULLABLE'], [['bitmap_from_string'], 'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions18bitmap_from_stringEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NULLABLE'], [['bitmap_contains'], 'BOOLEAN', ['BITMAP','BIGINT'], '_ZN5doris15BitmapFunctions15bitmap_containsEPN9doris_udf15FunctionContextERKNS1_9StringValERKNS1_9BigIntValE', '', '', 'vec', ''],