diff --git a/be/src/exprs/bitmap_function.cpp b/be/src/exprs/bitmap_function.cpp index 5e38ab8f79..e45f7244b9 100644 --- a/be/src/exprs/bitmap_function.cpp +++ b/be/src/exprs/bitmap_function.cpp @@ -20,141 +20,12 @@ #include "exprs/anyval_util.h" #include "gutil/strings/numbers.h" #include "gutil/strings/split.h" +#include "util/bitmap_intersect.h" #include "util/bitmap_value.h" #include "util/string_parser.hpp" namespace doris { -namespace detail { - -const int DATETIME_PACKED_TIME_BYTE_SIZE = 8; -const int DATETIME_TYPE_BYTE_SIZE = 4; - -const int DECIMAL_BYTE_SIZE = 16; - -// get_val start -template -T get_val(const ValType& x) { - DCHECK(!x.is_null); - return x.val; -} - -template <> -StringValue get_val(const StringVal& x) { - DCHECK(!x.is_null); - return StringValue::from_string_val(x); -} - -template <> -DateTimeValue get_val(const DateTimeVal& x) { - return DateTimeValue::from_datetime_val(x); -} - -template <> -DecimalV2Value get_val(const DecimalV2Val& x) { - return DecimalV2Value::from_decimal_val(x); -} -// get_val end - -// serialize_size start -template -int32_t serialize_size(const T& v) { - return sizeof(T); -} - -template <> -int32_t serialize_size(const DateTimeValue& v) { - return DATETIME_PACKED_TIME_BYTE_SIZE + DATETIME_TYPE_BYTE_SIZE; -} - -template <> -int32_t serialize_size(const DecimalV2Value& v) { - return DECIMAL_BYTE_SIZE; -} - -template <> -int32_t serialize_size(const StringValue& v) { - return v.len + 4; -} -// serialize_size end - -// write_to start -template -char* write_to(const T& v, char* dest) { - size_t type_size = sizeof(T); - memcpy(dest, &v, type_size); - dest += type_size; - return dest; -} - -template <> -char* write_to(const DateTimeValue& v, char* dest) { - DateTimeVal value; - v.to_datetime_val(&value); - *(int64_t*)dest = value.packed_time; - dest += DATETIME_PACKED_TIME_BYTE_SIZE; - *(int*)dest = value.type; - dest += DATETIME_TYPE_BYTE_SIZE; - return dest; -} - -template <> -char* write_to(const DecimalV2Value& v, 
char* dest) { - __int128 value = v.value(); - memcpy(dest, &value, DECIMAL_BYTE_SIZE); - dest += DECIMAL_BYTE_SIZE; - return dest; -} - -template <> -char* write_to(const StringValue& v, char* dest) { - *(int32_t*)dest = v.len; - dest += 4; - memcpy(dest, v.ptr, v.len); - dest += v.len; - return dest; -} -// write_to end - -// read_from start -template -void read_from(const char** src, T* result) { - size_t type_size = sizeof(T); - memcpy(result, *src, type_size); - *src += type_size; -} - -template <> -void read_from(const char** src, DateTimeValue* result) { - DateTimeVal value; - value.is_null = false; - value.packed_time = *(int64_t*)(*src); - *src += DATETIME_PACKED_TIME_BYTE_SIZE; - value.type = *(int*)(*src); - *src += DATETIME_TYPE_BYTE_SIZE; - *result = DateTimeValue::from_datetime_val(value); - ; -} - -template <> -void read_from(const char** src, DecimalV2Value* result) { - __int128 v = 0; - memcpy(&v, *src, DECIMAL_BYTE_SIZE); - *src += DECIMAL_BYTE_SIZE; - *result = DecimalV2Value(v); -} - -template <> -void read_from(const char** src, StringValue* result) { - int32_t length = *(int32_t*)(*src); - *src += 4; - *result = StringValue((char*)*src, length); - *src += length; -} -// read_from end - -} // namespace detail - static StringVal serialize(FunctionContext* ctx, BitmapValue* value) { if (!value) { BitmapValue empty_bitmap; @@ -168,98 +39,6 @@ static StringVal serialize(FunctionContext* ctx, BitmapValue* value) { } } -// Calculate the intersection of two or more bitmaps -// Usage: intersect_count(bitmap_column_to_count, filter_column, filter_values ...) 
-// Example: intersect_count(user_id, event, 'A', 'B', 'C'), meaning find the intersect count of user_id in all A/B/C 3 bitmaps -// Todo(kks) Use Array type instead of variable arguments -template -struct BitmapIntersect { -public: - BitmapIntersect() {} - - explicit BitmapIntersect(const char* src) { deserialize(src); } - - void add_key(const T key) { - BitmapValue empty_bitmap; - _bitmaps[key] = empty_bitmap; - } - - void update(const T& key, const BitmapValue& bitmap) { - if (_bitmaps.find(key) != _bitmaps.end()) { - _bitmaps[key] |= bitmap; - } - } - - void merge(const BitmapIntersect& other) { - for (auto& kv : other._bitmaps) { - if (_bitmaps.find(kv.first) != _bitmaps.end()) { - _bitmaps[kv.first] |= kv.second; - } else { - _bitmaps[kv.first] = kv.second; - } - } - } - - // intersection - BitmapValue intersect() const { - BitmapValue result; - auto it = _bitmaps.begin(); - result |= it->second; - it++; - for (; it != _bitmaps.end(); it++) { - result &= it->second; - } - return result; - } - - // calculate the intersection for _bitmaps's bitmap values - int64_t intersect_count() const { - if (_bitmaps.empty()) { - return 0; - } - return intersect().cardinality(); - } - - // the serialize size - size_t size() { - size_t size = 4; - for (auto& kv : _bitmaps) { - size += detail::serialize_size(kv.first); - ; - size += kv.second.getSizeInBytes(); - } - return size; - } - - //must call size() first - void serialize(char* dest) { - char* writer = dest; - *(int32_t*)writer = _bitmaps.size(); - writer += 4; - for (auto& kv : _bitmaps) { - writer = detail::write_to(kv.first, writer); - kv.second.write(writer); - writer += kv.second.getSizeInBytes(); - } - } - - void deserialize(const char* src) { - const char* reader = src; - int32_t bitmaps_size = *(int32_t*)reader; - reader += 4; - for (int32_t i = 0; i < bitmaps_size; i++) { - T key; - detail::read_from(&reader, &key); - BitmapValue bitmap(reader); - reader += bitmap.getSizeInBytes(); - _bitmaps[key] = bitmap; - } 
- } - -private: - std::map _bitmaps; -}; - void BitmapFunctions::init() {} void BitmapFunctions::bitmap_init(FunctionContext* ctx, StringVal* dst) { @@ -403,7 +182,7 @@ StringVal BitmapFunctions::bitmap_serialize(FunctionContext* ctx, const StringVa return result; } -// This is a init function for intersect_count not for bitmap_intersect. +// This is a init function for intersect_count not for bitmap_intersect, not for _orthogonal_bitmap_intersect(bitmap,t,t) template void BitmapFunctions::bitmap_intersect_init(FunctionContext* ctx, StringVal* dst) { dst->is_null = false; @@ -414,12 +193,14 @@ void BitmapFunctions::bitmap_intersect_init(FunctionContext* ctx, StringVal* dst for (int i = 2; i < ctx->get_num_constant_args(); ++i) { DCHECK(ctx->is_arg_constant(i)); ValType* arg = reinterpret_cast(ctx->get_constant_arg(i)); - intersect->add_key(detail::get_val(*arg)); + intersect->add_key(detail::Helper::get_val(*arg)); } dst->ptr = (uint8_t*)intersect; } +// This is a update function for intersect_count/ORTHOGONAL_BITMAP_INTERSECT_COUNT/ORTHOGONAL_BITMAP_INTERSECT(bitmap,t,t) +// not for bitmap_intersect(Bitmap) template void BitmapFunctions::bitmap_intersect_update(FunctionContext* ctx, const StringVal& src, const ValType& key, int num_key, const ValType* keys, @@ -427,13 +208,14 @@ void BitmapFunctions::bitmap_intersect_update(FunctionContext* ctx, const String auto* dst_bitmap = reinterpret_cast*>(dst->ptr); // zero size means the src input is a agg object if (src.len == 0) { - dst_bitmap->update(detail::get_val(key), + dst_bitmap->update(detail::Helper::get_val(key), *reinterpret_cast(src.ptr)); } else { - dst_bitmap->update(detail::get_val(key), BitmapValue((char*)src.ptr)); + dst_bitmap->update(detail::Helper::get_val(key), BitmapValue((char*)src.ptr)); } } +//only for intersect_count(bitmap,t,t) template void BitmapFunctions::bitmap_intersect_merge(FunctionContext* ctx, const StringVal& src, const StringVal* dst) { @@ -441,6 +223,7 @@ void 
BitmapFunctions::bitmap_intersect_merge(FunctionContext* ctx, const StringV dst_bitmap->merge(BitmapIntersect((char*)src.ptr)); } +//only for intersect_count(bitmap,t,t) template StringVal BitmapFunctions::bitmap_intersect_serialize(FunctionContext* ctx, const StringVal& src) { auto* src_bitmap = reinterpret_cast*>(src.ptr); @@ -450,6 +233,7 @@ StringVal BitmapFunctions::bitmap_intersect_serialize(FunctionContext* ctx, cons return result; } +//only for intersect_count(bitmap,t,t) template BigIntVal BitmapFunctions::bitmap_intersect_finalize(FunctionContext* ctx, const StringVal& src) { auto* src_bitmap = reinterpret_cast*>(src.ptr); @@ -928,13 +712,15 @@ StringVal BitmapFunctions::bitmap_subset_limit(FunctionContext* ctx, const Strin return serialize(ctx, &ret_bitmap); } - +// init ORTHOGONAL_BITMAP_UNION_COUNT(bitmap) +// update bitmap_union() void BitmapFunctions::orthogonal_bitmap_union_count_init(FunctionContext* ctx, StringVal* dst) { dst->is_null = false; dst->len = sizeof(BitmapValue); dst->ptr = (uint8_t*)new BitmapValue(); } +// serialize for ORTHOGONAL_BITMAP_UNION_COUNT(bitmap) StringVal BitmapFunctions::orthogonal_bitmap_count_serialize(FunctionContext* ctx, const StringVal& src) { if (src.is_null) { @@ -950,7 +736,7 @@ StringVal BitmapFunctions::orthogonal_bitmap_count_serialize(FunctionContext* ct return result; } -// This is a init function for bitmap_intersect. +// This is a init function for orthogonal_bitmap_intersect(bitmap,t,t). 
template void BitmapFunctions::orthogonal_bitmap_intersect_init(FunctionContext* ctx, StringVal* dst) { // constant args start from index 2 @@ -961,7 +747,7 @@ void BitmapFunctions::orthogonal_bitmap_intersect_init(FunctionContext* ctx, Str for (int i = 2; i < ctx->get_num_constant_args(); ++i) { ValType* arg = reinterpret_cast(ctx->get_constant_arg(i)); - intersect->add_key(detail::get_val(*arg)); + intersect->add_key(detail::Helper::get_val(*arg)); } dst->ptr = (uint8_t*)intersect; @@ -972,7 +758,7 @@ void BitmapFunctions::orthogonal_bitmap_intersect_init(FunctionContext* ctx, Str } } -// This is a init function for intersect_count. +// This is a init function for orthogonal_bitmap_intersect_count(bitmap,t,t). template void BitmapFunctions::orthogonal_bitmap_intersect_count_init(FunctionContext* ctx, StringVal* dst) { if (ctx->get_num_constant_args() > 1) { @@ -983,7 +769,7 @@ void BitmapFunctions::orthogonal_bitmap_intersect_count_init(FunctionContext* ct // constant args start from index 2 for (int i = 2; i < ctx->get_num_constant_args(); ++i) { ValType* arg = reinterpret_cast(ctx->get_constant_arg(i)); - intersect->add_key(detail::get_val(*arg)); + intersect->add_key(detail::Helper::get_val(*arg)); } dst->ptr = (uint8_t*)intersect; @@ -995,6 +781,9 @@ void BitmapFunctions::orthogonal_bitmap_intersect_count_init(FunctionContext* ct } } +// This is a serialize function for orthogonal_bitmap_intersect(bitmap,t,t). +// merge is the simple bitmap_union() function LINE(80); +// finalize is the bitmap_serialize() function LINE(173) template StringVal BitmapFunctions::orthogonal_bitmap_intersect_serialize(FunctionContext* ctx, const StringVal& src) { @@ -1014,6 +803,8 @@ BigIntVal BitmapFunctions::orthogonal_bitmap_intersect_finalize(FunctionContext* return result; } +// This is a merge function for orthogonal_bitmap_intersect_count(bitmap,t,t). 
+// and merge for ORTHOGONAL_BITMAP_UNION_COUNT(bitmap) void BitmapFunctions::orthogonal_bitmap_count_merge(FunctionContext* context, const StringVal& src, StringVal* dst) { if (dst->len != sizeof(int64_t)) { @@ -1027,6 +818,8 @@ void BitmapFunctions::orthogonal_bitmap_count_merge(FunctionContext* context, co *(int64_t*)dst->ptr += *(int64_t*)src.ptr; } +// This is a finalize function for orthogonal_bitmap_intersect_count(bitmap,t,t). +// finalize for ORTHOGONAL_BITMAP_UNION_COUNT(bitmap) BigIntVal BitmapFunctions::orthogonal_bitmap_count_finalize(FunctionContext* context, const StringVal& src) { auto* pval = reinterpret_cast(src.ptr); @@ -1035,6 +828,7 @@ BigIntVal BitmapFunctions::orthogonal_bitmap_count_finalize(FunctionContext* con return result; } +// This is a serialize function for orthogonal_bitmap_intersect_count(bitmap,t,t). template StringVal BitmapFunctions::orthogonal_bitmap_intersect_count_serialize(FunctionContext* ctx, const StringVal& src) { diff --git a/be/src/runtime/string_value.h b/be/src/runtime/string_value.h index f15b26571e..13b3852a5d 100644 --- a/be/src/runtime/string_value.h +++ b/be/src/runtime/string_value.h @@ -90,6 +90,7 @@ struct StringValue { StringValue(char* ptr, int len) : ptr(ptr), len(len) {} StringValue(const char* ptr, int len) : ptr(const_cast(ptr)), len(len) {} StringValue() : ptr(nullptr), len(0) {} + StringValue(const StringRef& str) : ptr(const_cast(str.data)), len(str.size) {} /// Construct a StringValue from 's'. 's' must be valid for as long as /// this object is valid. diff --git a/be/src/util/bitmap_intersect.h b/be/src/util/bitmap_intersect.h new file mode 100644 index 0000000000..dcda6ae5a5 --- /dev/null +++ b/be/src/util/bitmap_intersect.h @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once +#include "runtime/string_value.h" +#include "udf/udf.h" +#include "util/bitmap_value.h" + +namespace doris { + +namespace detail { +class Helper { +public: + static const int DATETIME_PACKED_TIME_BYTE_SIZE = 8; + static const int DATETIME_TYPE_BYTE_SIZE = 4; + static const int DECIMAL_BYTE_SIZE = 16; + + // get_val start + template + static T get_val(const ValType& x) { + DCHECK(!x.is_null); + return x.val; + } + + // serialize_size start + template + static int32_t serialize_size(const T& v) { + return sizeof(T); + } + + // write_to start + template + static char* write_to(const T& v, char* dest) { + size_t type_size = sizeof(T); + memcpy(dest, &v, type_size); + dest += type_size; + return dest; + } + + // read_from start + template + static void read_from(const char** src, T* result) { + size_t type_size = sizeof(T); + memcpy(result, *src, type_size); + *src += type_size; + } +}; + +template <> +inline StringValue Helper::get_val(const StringVal& x) { + DCHECK(!x.is_null); + return StringValue::from_string_val(x); +} + +template <> +inline DateTimeValue Helper::get_val(const DateTimeVal& x) { + return DateTimeValue::from_datetime_val(x); +} + +template <> +inline DecimalV2Value Helper::get_val(const DecimalV2Val& x) { + return DecimalV2Value::from_decimal_val(x); +} +// get_val end + +template <> +inline char* Helper::write_to(const DateTimeValue& v, char* dest) { + 
DateTimeVal value; + v.to_datetime_val(&value); + *(int64_t*)dest = value.packed_time; + dest += DATETIME_PACKED_TIME_BYTE_SIZE; + *(int*)dest = value.type; + dest += DATETIME_TYPE_BYTE_SIZE; + return dest; +} + +template <> +inline char* Helper::write_to(const DecimalV2Value& v, char* dest) { + __int128 value = v.value(); + memcpy(dest, &value, DECIMAL_BYTE_SIZE); + dest += DECIMAL_BYTE_SIZE; + return dest; +} + +template <> +inline char* Helper::write_to(const StringValue& v, char* dest) { + *(int32_t*)dest = v.len; + dest += 4; + memcpy(dest, v.ptr, v.len); + dest += v.len; + return dest; +} +// write_to end + +template <> +inline int32_t Helper::serialize_size(const DateTimeValue& v) { + return Helper::DATETIME_PACKED_TIME_BYTE_SIZE + Helper::DATETIME_TYPE_BYTE_SIZE; +} + +template <> +inline int32_t Helper::serialize_size(const DecimalV2Value& v) { + return Helper::DECIMAL_BYTE_SIZE; +} + +template <> +inline int32_t Helper::serialize_size(const StringValue& v) { + return v.len + 4; +} +// serialize_size end + +template <> +inline void Helper::read_from(const char** src, DateTimeValue* result) { + DateTimeVal value; + value.is_null = false; + value.packed_time = *(int64_t*)(*src); + *src += DATETIME_PACKED_TIME_BYTE_SIZE; + value.type = *(int*)(*src); + *src += DATETIME_TYPE_BYTE_SIZE; + *result = DateTimeValue::from_datetime_val(value); +} + +template <> +inline void Helper::read_from(const char** src, DecimalV2Value* result) { + __int128 v = 0; + memcpy(&v, *src, DECIMAL_BYTE_SIZE); + *src += DECIMAL_BYTE_SIZE; + *result = DecimalV2Value(v); +} + +template <> +inline void Helper::read_from(const char** src, StringValue* result) { + int32_t length = *(int32_t*)(*src); + *src += 4; + *result = StringValue((char*)*src, length); + *src += length; +} +// read_from end + +} // namespace detail + +// Calculate the intersection of two or more bitmaps +// Usage: intersect_count(bitmap_column_to_count, filter_column, filter_values ...) 
+// Example: intersect_count(user_id, event, 'A', 'B', 'C'), meaning find the intersect count of user_id in all A/B/C 3 bitmaps +// Todo(kks) Use Array type instead of variable arguments +template +struct BitmapIntersect { +public: + BitmapIntersect() = default; + + explicit BitmapIntersect(const char* src) { deserialize(src); } + + void add_key(const T key) { + BitmapValue empty_bitmap; + _bitmaps[key] = empty_bitmap; + } + + void update(const T& key, const BitmapValue& bitmap) { + if (_bitmaps.find(key) != _bitmaps.end()) { + _bitmaps[key] |= bitmap; + } + } + + void merge(const BitmapIntersect& other) { + for (auto& kv : other._bitmaps) { + if (_bitmaps.find(kv.first) != _bitmaps.end()) { + _bitmaps[kv.first] |= kv.second; + } else { + _bitmaps[kv.first] = kv.second; + } + } + } + + // intersection + BitmapValue intersect() const { + BitmapValue result; + auto it = _bitmaps.begin(); + result |= it->second; + it++; + for (; it != _bitmaps.end(); it++) { + result &= it->second; + } + return result; + } + + // calculate the intersection for _bitmaps's bitmap values + int64_t intersect_count() const { + if (_bitmaps.empty()) { + return 0; + } + return intersect().cardinality(); + } + + // the serialize size + size_t size() { + size_t size = 4; + for (auto& kv : _bitmaps) { + size += detail::Helper::serialize_size(kv.first); + size += kv.second.getSizeInBytes(); + } + return size; + } + + //must call size() first + void serialize(char* dest) { + char* writer = dest; + *(int32_t*)writer = _bitmaps.size(); + writer += 4; + for (auto& kv : _bitmaps) { + writer = detail::Helper::write_to(kv.first, writer); + kv.second.write(writer); + writer += kv.second.getSizeInBytes(); + } + } + + void deserialize(const char* src) { + const char* reader = src; + int32_t bitmaps_size = *(int32_t*)reader; + reader += 4; + for (int32_t i = 0; i < bitmaps_size; i++) { + T key; + detail::Helper::read_from(&reader, &key); + BitmapValue bitmap(reader); + reader += bitmap.getSizeInBytes(); + 
_bitmaps[key] = bitmap; + } + } + +private: + std::map _bitmaps; +}; + +} // namespace doris diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index 5c4d5c7b36..fc50adca0a 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -40,6 +40,7 @@ set(VEC_FILES aggregate_functions/aggregate_function_percentile_approx.cpp aggregate_functions/aggregate_function_simple_factory.cpp aggregate_functions/aggregate_function_java_udaf.h + aggregate_functions/aggregate_function_orthogonal_bitmap.cpp columns/collator.cpp columns/column.cpp columns/column_array.cpp diff --git a/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.cpp b/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.cpp new file mode 100644 index 0000000000..470a6c8388 --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.cpp @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/aggregate_functions/aggregate_function_orthogonal_bitmap.h" + +#include + +#include "vec/aggregate_functions/aggregate_function_simple_factory.h" +#include "vec/aggregate_functions/helpers.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +template