[Vectorized][Feature] support some bitmap functions (#8138)

This commit is contained in:
zhangstar333
2022-02-23 11:42:16 +08:00
committed by GitHub
parent 9120de205e
commit 31ab569c1d
7 changed files with 451 additions and 169 deletions

View File

@ -18,25 +18,21 @@
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/FunctionBitmap.h
// and modified by Doris
#include "util/string_parser.hpp"
#include "vec/functions/function_totype.h"
#include "vec/functions/function_const.h"
#include "vec/functions/simple_function_factory.h"
#include "vec/functions/function_string.h"
#include "gutil/strings/numbers.h"
#include "gutil/strings/split.h"
#include "util/string_parser.hpp"
#include "vec/functions/function_const.h"
#include "vec/functions/function_string.h"
#include "vec/functions/function_totype.h"
#include "vec/functions/simple_function_factory.h"
namespace doris::vectorized {
/// Policy type behind the FunctionBitmapEmpty alias (FunctionConst<BitmapEmpty, false>).
/// Supplies the function name, the result column type, the result data type and
/// the constant value to produce (a default-constructed BitmapValue).
struct BitmapEmpty {
    static constexpr auto name = "bitmap_empty";
    using ReturnColVec = ColumnBitmap;

    /// Data type of the produced column.
    static DataTypePtr get_return_type() { return std::make_shared<DataTypeBitMap>(); }

    /// The constant emitted by the function: a default-constructed bitmap.
    static auto init_value() { return BitmapValue {}; }
};
struct NameToBitmap {
@ -62,12 +58,12 @@ struct ToBitmapImpl {
// TODO: the check below will cause a problem in to_bitmap(null); rethink how to solve the
// problem of null
// if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) {
// return Status::RuntimeError(
// fmt::format("The input: {:.{}} is not valid, to_bitmap only support bigint "
// "value from 0 to 18446744073709551615 currently",
// raw_str, str_size));
// }
// if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) {
// return Status::RuntimeError(
// fmt::format("The input: {:.{}} is not valid, to_bitmap only support bigint "
// "value from 0 to 18446744073709551615 currently",
// raw_str, str_size));
// }
res.emplace_back();
res.back().add(int_value);
}
@ -120,7 +116,7 @@ struct BitmapHash {
const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
size_t str_size = offsets[i] - offsets[i - 1] - 1;
uint32_t hash_value =
HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED);
HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED);
res.emplace_back();
res.back().add(hash_value);
}
@ -149,69 +145,6 @@ struct BitmapCount {
}
};
/// Name tag carrying the function name string "bitmap_and"; used as the name
/// parameter of the FunctionBitmapAnd alias.
struct NameBitmapAnd {
    static constexpr const char* name = "bitmap_and";
};
/// Row-wise bitmap intersection: res[i] = lvec[i] & rvec[i].
template <typename LeftDataType, typename RightDataType>
struct BitmapAnd {
    using ResultDataType = DataTypeBitMap;
    using T0 = typename LeftDataType::FieldType;
    using T1 = typename RightDataType::FieldType;
    using TData = std::vector<BitmapValue>;

    /// Writes one result per element of lvec; rvec and res are indexed with the
    /// same positions, so both must hold at least lvec.size() entries.
    static Status vector_vector(const TData& lvec, const TData& rvec, TData& res) {
        const size_t row_count = lvec.size();
        for (size_t row = 0; row != row_count; ++row) {
            BitmapValue& out = res[row];
            out = lvec[row];
            out &= rvec[row];
        }
        return Status::OK();
    }
};
/// Name tag carrying the function name string "bitmap_or"; used as the name
/// parameter of the FunctionBitmapOr alias.
struct NameBitmapOr {
    static constexpr const char* name = "bitmap_or";
};
/// Row-wise bitmap union: res[i] = lvec[i] | rvec[i].
template <typename LeftDataType, typename RightDataType>
struct BitmapOr {
    using ResultDataType = DataTypeBitMap;
    using T0 = typename LeftDataType::FieldType;
    using T1 = typename RightDataType::FieldType;
    using TData = std::vector<BitmapValue>;

    /// Writes one result per element of lvec; rvec and res are indexed with the
    /// same positions, so both must hold at least lvec.size() entries.
    static Status vector_vector(const TData& lvec, const TData& rvec, TData& res) {
        const size_t row_count = lvec.size();
        for (size_t row = 0; row != row_count; ++row) {
            BitmapValue& out = res[row];
            out = lvec[row];
            out |= rvec[row];
        }
        return Status::OK();
    }
};
/// Name tag carrying the function name string "bitmap_xor"; used as the name
/// parameter of the FunctionBitmapXor alias.
struct NameBitmapXor {
    static constexpr const char* name = "bitmap_xor";
};
/// Row-wise bitmap symmetric difference: res[i] = lvec[i] ^ rvec[i].
template <typename LeftDataType, typename RightDataType>
struct BitmapXor {
    using ResultDataType = DataTypeBitMap;
    using T0 = typename LeftDataType::FieldType;
    using T1 = typename RightDataType::FieldType;
    using TData = std::vector<BitmapValue>;

    /// Writes one result per element of lvec; rvec and res are indexed with the
    /// same positions, so both must hold at least lvec.size() entries.
    static Status vector_vector(const TData& lvec, const TData& rvec, TData& res) {
        const size_t row_count = lvec.size();
        for (size_t row = 0; row != row_count; ++row) {
            BitmapValue& out = res[row];
            out = lvec[row];
            out ^= rvec[row];
        }
        return Status::OK();
    }
};
/// Name tag carrying the function name string "bitmap_not"; used as the name
/// parameter of the FunctionBitmapNot alias.
struct NameBitmapNot {
    static constexpr const char* name = "bitmap_not";
};
@ -233,6 +166,56 @@ struct BitmapNot {
}
};
/// Name tag carrying the function name string "bitmap_and_not"; used as the name
/// parameter of the FunctionBitmapAndNot alias.
struct NameBitmapAndNot {
    static constexpr const char* name = "bitmap_and_not";
};
/// Row-wise bitmap difference: res[i] holds the elements of lvec[i] that are
/// not in rvec[i].
template <typename LeftDataType, typename RightDataType>
struct BitmapAndNot {
    using ResultDataType = DataTypeBitMap;
    using T0 = typename LeftDataType::FieldType;
    using T1 = typename RightDataType::FieldType;
    using TData = std::vector<BitmapValue>;

    /// Writes one result per element of lvec; rvec and res are indexed with the
    /// same positions, so both must hold at least lvec.size() entries.
    static Status vector_vector(const TData& lvec, const TData& rvec, TData& res) {
        const size_t size = lvec.size();
        for (size_t i = 0; i < size; ++i) {
            // l - (l & r) == l - r, so subtract rvec[i] directly instead of
            // materializing the intersection in a temporary bitmap first.
            res[i] = lvec[i];
            res[i] -= rvec[i];
        }
        return Status::OK();
    }
};
/// Name tag carrying the function name string "bitmap_and_not_count"; used as
/// the name parameter of the FunctionBitmapAndNotCount alias.
struct NameBitmapAndNotCount {
    static constexpr const char* name = "bitmap_and_not_count";
};
/// Row-wise count of the elements of lvec[i] that are not in rvec[i].
template <typename LeftDataType, typename RightDataType>
struct BitmapAndNotCount {
    using ResultDataType = DataTypeInt64;
    using T0 = typename LeftDataType::FieldType;
    using T1 = typename RightDataType::FieldType;
    using TData = std::vector<BitmapValue>;
    using ResTData = typename ColumnVector<Int64>::Container;

    /// Writes one count per element of lvec; rvec and res are indexed with the
    /// same positions, so both must hold at least lvec.size() entries.
    static Status vector_vector(const TData& lvec, const TData& rvec, ResTData& res) {
        const size_t size = lvec.size();
        for (size_t i = 0; i < size; ++i) {
            // |l - (l & r)| == |l - r|: no need to build the intersection in a
            // temporary bitmap before asking for the and-not cardinality.
            res[i] = lvec[i].andnot_cardinality(rvec[i]);
        }
        return Status::OK();
    }
};
/// Name tag carrying the function name string "bitmap_contains"; used as the
/// name parameter of the FunctionBitmapContains alias.
struct NameBitmapContains {
    static constexpr const char* name = "bitmap_contains";
};
@ -278,6 +261,30 @@ struct BitmapHasAny {
}
};
/// Name tag carrying the function name string "bitmap_has_all"; used as the
/// name parameter of the FunctionBitmapHasAll alias.
struct NameBitmapHasAll {
    static constexpr const char* name = "bitmap_has_all";
};
/// Row-wise containment test: res[i] is 1 when merging rvec[i] into lvec[i]
/// does not grow its cardinality (i.e. rvec[i] adds no new element), else 0.
template <typename LeftDataType, typename RightDataType>
struct BitmapHasAll {
    using ResultDataType = DataTypeUInt8;
    using T0 = typename LeftDataType::FieldType;
    using T1 = typename RightDataType::FieldType;
    using TData = std::vector<BitmapValue>;
    using ResTData = typename ColumnVector<UInt8>::Container;

    /// Writes one flag per element of lvec; rvec and res are indexed with the
    /// same positions, so both must hold at least lvec.size() entries.
    static Status vector_vector(const TData& lvec, const TData& rvec, ResTData& res) {
        const size_t size = lvec.size();
        for (size_t i = 0; i < size; ++i) {
            const uint64_t lhs_cardinality = lvec[i].cardinality();
            // Work on an explicit copy so the input column is never modified.
            // (The previous `auto x = const_cast<BitmapValue&>(...)` also made a
            // copy; the cast only obscured that.)
            BitmapValue merged = lvec[i];
            merged |= rvec[i];
            res[i] = merged.cardinality() == lhs_cardinality;
        }
        return Status::OK();
    }
};
/// Name tag carrying the function name string "bitmap_min"; used as the name
/// parameter of the FunctionBitmapMin alias.
struct NameBitmapMin {
    static constexpr const char* name = "bitmap_min";
};
@ -345,108 +352,148 @@ struct BitmapToString {
}
};
/// Name tag carrying the function name string "bitmap_and_count"; used as the
/// name parameter of the FunctionBitmapAndCount alias.
struct NameBitmapAndCount {
    static constexpr const char* name = "bitmap_and_count";
};
/// Row-wise cardinality of the intersection: res[i] = |lvec[i] & rvec[i]|.
template <typename LeftDataType, typename RightDataType>
struct BitmapAndCount {
    using ResultDataType = DataTypeInt64;
    using T0 = typename LeftDataType::FieldType;
    using T1 = typename RightDataType::FieldType;
    using TData = std::vector<BitmapValue>;
    using ResTData = typename ColumnVector<Int64>::Container;

    /// Writes one count per element of lvec; rvec and res are indexed with the
    /// same positions, so both must hold at least lvec.size() entries.
    static Status vector_vector(const TData& lvec, const TData& rvec, ResTData& res) {
        size_t size = lvec.size();
        // Scratch bitmap reused across rows; cleared at the end of every iteration.
        BitmapValue val;
        for (size_t i = 0; i < size; ++i) {
            val |= lvec[i];
            val &= rvec[i];
            res[i] = val.cardinality();
            val.clear();
        }
        return Status::OK();
    }
};

/// Implementation policy for "sub_bitmap", plugged into FunctionBitmapSubs.
struct SubBitmap {
    static constexpr auto name = "sub_bitmap";
    using TData1 = std::vector<BitmapValue>;
    using TData2 = typename ColumnVector<Int64>::Container;

    /// For every row that is not already null:
    ///  - a limit <= 0 marks the row null;
    ///  - otherwise offset_limit(offset, limit, &res[i]) is called and a return
    ///    value of 0 marks the row null.
    /// null_map is both an input (rows to skip) and an output (rows nulled here).
    static Status vector_vector(const TData1& bitmap_data, const TData2& offset_data,
                                const TData2& limit_data, NullMap& null_map,
                                size_t input_rows_count, TData1& res) {
        // size_t index: avoids the signed/unsigned comparison against input_rows_count.
        for (size_t i = 0; i < input_rows_count; ++i) {
            if (null_map[i]) {
                continue;
            }
            if (limit_data[i] <= 0) {
                null_map[i] = 1;
                continue;
            }
            // NOTE(review): the const_cast suggests offset_limit is not
            // const-qualified; confirm it does not modify the input bitmap.
            if (const_cast<TData1&>(bitmap_data)[i].offset_limit(offset_data[i], limit_data[i],
                                                                 &res[i]) == 0) {
                null_map[i] = 1;
            }
        }
        return Status::OK();
    }
};
/// Name tag carrying the function name string "bitmap_or_count"; used as the
/// name parameter of the FunctionBitmapOrCount alias.
struct NameBitmapOrCount {
    static constexpr const char* name = "bitmap_or_count";
};
/// Row-wise cardinality of the union: res[i] = |lvec[i] | rvec[i]|.
template <typename LeftDataType, typename RightDataType>
struct BitmapOrCount {
    using ResultDataType = DataTypeInt64;
    using T0 = typename LeftDataType::FieldType;
    using T1 = typename RightDataType::FieldType;
    using TData = std::vector<BitmapValue>;
    using ResTData = typename ColumnVector<Int64>::Container;

    /// Writes one count per element of lvec; rvec and res are indexed with the
    /// same positions, so both must hold at least lvec.size() entries.
    static Status vector_vector(const TData& lvec, const TData& rvec, ResTData& res) {
        size_t size = lvec.size();
        // Scratch bitmap reused across rows; cleared at the end of every iteration.
        BitmapValue val;
        for (size_t i = 0; i < size; ++i) {
            val |= lvec[i];
            val |= rvec[i];
            res[i] = val.cardinality();
            val.clear();
        }
        return Status::OK();
    }
};

/// Implementation policy for "bitmap_subset_limit", plugged into FunctionBitmapSubs.
struct BitmapSubsetLimit {
    static constexpr auto name = "bitmap_subset_limit";
    using TData1 = std::vector<BitmapValue>;
    using TData2 = typename ColumnVector<Int64>::Container;

    /// For every row that is not already null:
    ///  - a negative offset or a negative limit marks the row null;
    ///  - otherwise sub_limit(offset, limit, &res[i]) fills the result bitmap.
    /// null_map is both an input (rows to skip) and an output (rows nulled here).
    static Status vector_vector(const TData1& bitmap_data, const TData2& offset_data,
                                const TData2& limit_data, NullMap& null_map,
                                size_t input_rows_count, TData1& res) {
        // size_t index: avoids the signed/unsigned comparison against input_rows_count.
        for (size_t i = 0; i < input_rows_count; ++i) {
            if (null_map[i]) {
                continue;
            }
            if (offset_data[i] < 0 || limit_data[i] < 0) {
                null_map[i] = 1;
                continue;
            }
            // NOTE(review): the const_cast suggests sub_limit is not
            // const-qualified; confirm it does not modify the input bitmap.
            const_cast<TData1&>(bitmap_data)[i].sub_limit(offset_data[i], limit_data[i], &res[i]);
        }
        return Status::OK();
    }
};
/// Name tag carrying the function name string "bitmap_xor_count"; used as the
/// name parameter of the FunctionBitmapXorCount alias.
struct NameBitmapXorCount {
    static constexpr const char* name = "bitmap_xor_count";
};
/// Row-wise cardinality of the symmetric difference: res[i] = |lvec[i] ^ rvec[i]|.
template <typename LeftDataType, typename RightDataType>
struct BitmapXorCount {
    using ResultDataType = DataTypeInt64;
    using T0 = typename LeftDataType::FieldType;
    using T1 = typename RightDataType::FieldType;
    using TData = std::vector<BitmapValue>;
    using ResTData = typename ColumnVector<Int64>::Container;

    /// Writes one count per element of lvec; rvec and res are indexed with the
    /// same positions, so both must hold at least lvec.size() entries.
    static Status vector_vector(const TData& lvec, const TData& rvec, ResTData& res) {
        size_t size = lvec.size();
        // Scratch bitmap reused across rows; cleared at the end of every iteration.
        BitmapValue val;
        for (size_t i = 0; i < size; ++i) {
            val |= lvec[i];
            val ^= rvec[i];
            res[i] = val.cardinality();
            val.clear();
        }
        return Status::OK();
    }
};

/// Implementation policy for "bitmap_subset_in_range", plugged into FunctionBitmapSubs.
struct BitmapSubsetInRange {
    static constexpr auto name = "bitmap_subset_in_range";
    using TData1 = std::vector<BitmapValue>;
    using TData2 = typename ColumnVector<Int64>::Container;

    /// For every row that is not already null:
    ///  - range_start >= range_end, or either bound negative, marks the row null;
    ///  - otherwise sub_range(range_start, range_end, &res[i]) fills the result.
    /// null_map is both an input (rows to skip) and an output (rows nulled here).
    static Status vector_vector(const TData1& bitmap_data, const TData2& range_start,
                                const TData2& range_end, NullMap& null_map, size_t input_rows_count,
                                TData1& res) {
        // size_t index: avoids the signed/unsigned comparison against input_rows_count.
        for (size_t i = 0; i < input_rows_count; ++i) {
            if (null_map[i]) {
                continue;
            }
            if (range_start[i] >= range_end[i] || range_start[i] < 0 || range_end[i] < 0) {
                null_map[i] = 1;
                continue;
            }
            // NOTE(review): the const_cast suggests sub_range is not
            // const-qualified; confirm it does not modify the input bitmap.
            const_cast<TData1&>(bitmap_data)[i].sub_range(range_start[i], range_end[i], &res[i]);
        }
        return Status::OK();
    }
};
/// Generic three-argument function (bitmap, Int64, Int64) -> Nullable(Bitmap).
///
/// Impl supplies `name` and a static
/// `vector_vector(bitmaps, arg1, arg2, null_map, rows, result)` that fills the
/// result bitmaps and may flag additional rows as null.
template <typename Impl>
class FunctionBitmapSubs : public IFunction {
public:
    static constexpr auto name = Impl::name;

    String get_name() const override { return name; }

    static FunctionPtr create() { return std::make_shared<FunctionBitmapSubs>(); }

    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
        return make_nullable(std::make_shared<DataTypeBitMap>());
    }

    size_t get_number_of_arguments() const override { return 3; }

    // Nulls are handled by hand in execute_impl: the null maps of the arguments
    // are merged into the result null map.
    bool use_default_implementation_for_nulls() const override { return false; }

    bool use_default_implementation_for_constants() const override { return true; }

    /// Builds the Nullable(Bitmap) result column at position `result`.
    /// Returns the Status produced by Impl::vector_vector if it is not OK.
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                        size_t result, size_t input_rows_count) override {
        DCHECK_EQ(arguments.size(), 3);
        auto res_null_map = ColumnUInt8::create(input_rows_count, 0);
        auto res_data_column = ColumnBitmap::create(input_rows_count);

        ColumnPtr argument_columns[3];
        for (int i = 0; i < 3; ++i) {
            argument_columns[i] =
                    block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
            // Strip the nullable wrapper, folding its null flags into the result null map.
            if (auto* nullable = check_and_get_column<ColumnNullable>(*argument_columns[i])) {
                VectorizedUtils::update_null_map(res_null_map->get_data(),
                                                 nullable->get_null_map_data());
                argument_columns[i] = nullable->get_nested_column_ptr();
            }
        }

        // The second and third arguments are Int64 columns; what they mean
        // (offset/limit or range bounds) is decided by Impl.
        auto bitmap_column = assert_cast<const ColumnBitmap*>(argument_columns[0].get());
        auto offset_column = assert_cast<const ColumnVector<Int64>*>(argument_columns[1].get());
        auto limit_column = assert_cast<const ColumnVector<Int64>*>(argument_columns[2].get());

        // Propagate a failure from the implementation instead of discarding it.
        Status status = Impl::vector_vector(bitmap_column->get_data(), offset_column->get_data(),
                                            limit_column->get_data(), res_null_map->get_data(),
                                            input_rows_count, res_data_column->get_data());
        if (!status.ok()) {
            return status;
        }

        block.get_by_position(result).column =
                ColumnNullable::create(std::move(res_data_column), std::move(res_null_map));
        return Status::OK();
    }
};
// Concrete function types: each alias binds an implementation struct (and,
// where applicable, its name tag) to a generic function template.

// Constant and unary functions.
using FunctionBitmapEmpty = FunctionConst<BitmapEmpty, false>;
using FunctionToBitmap = FunctionUnaryToType<ToBitmapImpl, NameToBitmap>;
using FunctionBitmapFromString = FunctionUnaryToType<BitmapFromString, NameBitmapFromString>;
using FunctionBitmapHash = FunctionUnaryToType<BitmapHash, NameBitmapHash>;
using FunctionBitmapCount = FunctionUnaryToType<BitmapCount, NameBitmapCount>;

// Binary functions returning a count.
using FunctionBitmapAndCount =
        FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap, BitmapAndCount, NameBitmapAndCount>;
using FunctionBitmapOrCount =
        FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap, BitmapOrCount, NameBitmapOrCount>;
using FunctionBitmapXorCount =
        FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap, BitmapXorCount, NameBitmapXorCount>;

using FunctionBitmapMin = FunctionUnaryToType<BitmapMin, NameBitmapMin>;
using FunctionBitmapMax = FunctionUnaryToType<BitmapMax, NameBitmapMax>;
using FunctionBitmapToString = FunctionUnaryToType<BitmapToString, NameBitmapToString>;

// Binary bitmap/bitmap and bitmap/Int64 functions.
using FunctionBitmapAnd =
        FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap, BitmapAnd, NameBitmapAnd>;
using FunctionBitmapOr =
        FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap, BitmapOr, NameBitmapOr>;
using FunctionBitmapXor =
        FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap, BitmapXor, NameBitmapXor>;
using FunctionBitmapNot =
        FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap, BitmapNot, NameBitmapNot>;
using FunctionBitmapAndNot =
        FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap, BitmapAndNot, NameBitmapAndNot>;
using FunctionBitmapAndNotCount = FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap,
                                                       BitmapAndNotCount, NameBitmapAndNotCount>;
using FunctionBitmapContains =
        FunctionBinaryToType<DataTypeBitMap, DataTypeInt64, BitmapContains, NameBitmapContains>;
using FunctionBitmapHasAny =
        FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap, BitmapHasAny, NameBitmapHasAny>;
using FunctionBitmapHasAll =
        FunctionBinaryToType<DataTypeBitMap, DataTypeBitMap, BitmapHasAll, NameBitmapHasAll>;

// Three-argument functions (bitmap, Int64, Int64) returning a nullable bitmap.
using FunctionSubBitmap = FunctionBitmapSubs<SubBitmap>;
using FunctionBitmapSubsetLimit = FunctionBitmapSubs<BitmapSubsetLimit>;
using FunctionBitmapSubsetInRange = FunctionBitmapSubs<BitmapSubsetInRange>;
void register_function_bitmap(SimpleFunctionFactory& factory) {
factory.register_function<FunctionBitmapEmpty>();
@ -454,18 +501,18 @@ void register_function_bitmap(SimpleFunctionFactory& factory) {
factory.register_function<FunctionBitmapFromString>();
factory.register_function<FunctionBitmapHash>();
factory.register_function<FunctionBitmapCount>();
factory.register_function<FunctionBitmapAndCount>();
factory.register_function<FunctionBitmapOrCount>();
factory.register_function<FunctionBitmapXorCount>();
factory.register_function<FunctionBitmapMin>();
factory.register_function<FunctionBitmapMax>();
factory.register_function<FunctionBitmapToString>();
factory.register_function<FunctionBitmapAnd>();
factory.register_function<FunctionBitmapOr>();
factory.register_function<FunctionBitmapXor>();
factory.register_function<FunctionBitmapNot>();
factory.register_function<FunctionBitmapAndNot>();
factory.register_function<FunctionBitmapAndNotCount>();
factory.register_function<FunctionBitmapContains>();
factory.register_function<FunctionBitmapHasAny>();
factory.register_function<FunctionBitmapHasAll>();
factory.register_function<FunctionSubBitmap>();
factory.register_function<FunctionBitmapSubsetLimit>();
factory.register_function<FunctionBitmapSubsetInRange>();
}
} // namespace doris::vectorized