From d969047b50397ca20bc22e8841aa6f4abf2f50e2 Mon Sep 17 00:00:00 2001 From: Pxl Date: Tue, 28 Nov 2023 19:46:00 +0800 Subject: [PATCH] [Refactor](join) refactor of hash join (#27557) Improve the performance under the tpch data set by reconstructing the join related code and the use of hash table Co-authored-by: HappenLee Co-authored-by: BiteTheDDDDt --- be/src/exprs/bitmapfilter_predicate.h | 8 +- be/src/exprs/block_bloom_filter.hpp | 25 +- be/src/exprs/bloom_filter_func.h | 328 +++---- be/src/exprs/hybrid_set.h | 117 ++- be/src/exprs/minmax_predicate.h | 303 +++---- be/src/exprs/runtime_filter.cpp | 111 +-- be/src/exprs/runtime_filter.h | 6 +- be/src/exprs/runtime_filter_slots.h | 33 +- be/src/exprs/runtime_filter_slots_cross.h | 24 +- be/src/olap/bloom_filter_predicate.h | 65 +- be/src/pipeline/exec/hashjoin_build_sink.cpp | 187 ++-- be/src/pipeline/exec/hashjoin_build_sink.h | 16 +- .../pipeline/exec/hashjoin_probe_operator.cpp | 2 + .../pipeline/exec/hashjoin_probe_operator.h | 8 +- .../exec/join_build_sink_operator.cpp | 8 +- .../pipeline/exec/join_build_sink_operator.h | 4 +- .../exec/nested_loop_join_build_operator.h | 8 +- be/src/pipeline/exec/set_sink_operator.cpp | 13 +- be/src/pipeline/exec/set_sink_operator.h | 2 +- be/src/pipeline/exec/set_source_operator.cpp | 6 +- be/src/pipeline/pipeline_x/dependency.h | 7 +- be/src/vec/columns/column.h | 9 +- be/src/vec/columns/column_array.cpp | 11 + be/src/vec/columns/column_array.h | 3 + be/src/vec/columns/column_complex.h | 15 + be/src/vec/columns/column_const.h | 5 + be/src/vec/columns/column_decimal.h | 12 + be/src/vec/columns/column_dictionary.h | 34 +- .../vec/columns/column_fixed_length_object.h | 22 + be/src/vec/columns/column_map.cpp | 11 + be/src/vec/columns/column_map.h | 3 + be/src/vec/columns/column_nothing.h | 5 + be/src/vec/columns/column_nullable.cpp | 10 + be/src/vec/columns/column_nullable.h | 3 + be/src/vec/columns/column_object.cpp | 11 + be/src/vec/columns/column_object.h | 3 + 
be/src/vec/columns/column_string.cpp | 37 + be/src/vec/columns/column_string.h | 3 + be/src/vec/columns/column_struct.cpp | 9 + be/src/vec/columns/column_struct.h | 3 + be/src/vec/columns/column_vector.cpp | 14 + be/src/vec/columns/column_vector.h | 2 + be/src/vec/columns/predicate_column.h | 5 + be/src/vec/common/hash_table/hash_map.h | 344 ++++++++ .../vec/common/hash_table/hash_map_context.h | 129 ++- be/src/vec/common/hash_table/hash_table.h | 1 - .../common/hash_table/hash_table_set_build.h | 9 +- be/src/vec/core/block.cpp | 15 +- be/src/vec/exec/join/join_op.h | 72 +- .../vec/exec/join/process_hash_table_probe.h | 34 +- .../exec/join/process_hash_table_probe_impl.h | 816 +++--------------- be/src/vec/exec/join/vhash_join_node.cpp | 149 ++-- be/src/vec/exec/join/vhash_join_node.h | 187 +--- be/src/vec/exec/join/vjoin_node_base.cpp | 4 +- be/src/vec/exec/join/vjoin_node_base.h | 4 +- .../vec/exec/join/vnested_loop_join_node.cpp | 4 +- be/src/vec/exec/join/vnested_loop_join_node.h | 8 +- be/src/vec/exec/vset_operation_node.cpp | 72 +- be/src/vec/exec/vset_operation_node.h | 5 +- be/src/vec/exprs/vbloom_predicate.cpp | 37 +- .../runtime/shared_hash_table_controller.h | 11 +- be/test/exprs/bloom_filter_predicate_test.cpp | 3 - .../query_p0/join/mark_join/mark_join.out | 19 + .../nereids_syntax_p0/sub_query_alias.groovy | 2 +- .../sub_query_correlated.groovy | 12 +- .../suites/nereids_syntax_p0/view.groovy | 4 +- .../query_p0/join/mark_join/mark_join.groovy | 64 ++ 67 files changed, 1579 insertions(+), 1937 deletions(-) create mode 100644 regression-test/data/query_p0/join/mark_join/mark_join.out create mode 100644 regression-test/suites/query_p0/join/mark_join/mark_join.groovy diff --git a/be/src/exprs/bitmapfilter_predicate.h b/be/src/exprs/bitmapfilter_predicate.h index 743a55c4b6..8df488cf87 100644 --- a/be/src/exprs/bitmapfilter_predicate.h +++ b/be/src/exprs/bitmapfilter_predicate.h @@ -31,7 +31,7 @@ namespace doris { class BitmapFilterFuncBase : public 
FilterFuncBase { public: virtual void insert(const void* data) = 0; - virtual void insert_many(const std::vector bitmaps) = 0; + virtual void insert_many(const std::vector& bitmaps) = 0; virtual bool empty() = 0; virtual Status assign(BitmapValue* bitmap_value) = 0; virtual void light_copy(BitmapFilterFuncBase* other) { _not_in = other->_not_in; } @@ -60,7 +60,7 @@ public: void insert(const void* data) override; - void insert_many(const std::vector bitmaps) override; + void insert_many(const std::vector& bitmaps) override; uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number) override; @@ -75,7 +75,7 @@ public: return Status::OK(); } - void light_copy(BitmapFilterFuncBase* bloomfilter_func) override; + void light_copy(BitmapFilterFuncBase* bitmapfilter_func) override; size_t size() const override { return _bitmap_value->cardinality(); } @@ -108,7 +108,7 @@ void BitmapFilterFunc::insert(const void* data) { } template -void BitmapFilterFunc::insert_many(const std::vector bitmaps) { +void BitmapFilterFunc::insert_many(const std::vector& bitmaps) { if (bitmaps.empty()) { return; } diff --git a/be/src/exprs/block_bloom_filter.hpp b/be/src/exprs/block_bloom_filter.hpp index 654867d6cc..f31d7f7d4c 100644 --- a/be/src/exprs/block_bloom_filter.hpp +++ b/be/src/exprs/block_bloom_filter.hpp @@ -20,6 +20,7 @@ #pragma once +#include "vec/common/string_ref.h" #ifdef __AVX2__ #include @@ -72,14 +73,7 @@ public: // non-equal values will have the same hash value) is 0. void insert(uint32_t hash) noexcept; // Same as above with convenience of hashing the key. - void insert(const Slice& key) noexcept { - if (key.data) { - insert(HashUtil::murmur_hash3_32(key.data, key.size, _hash_seed)); - } - } - - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. 
- void insert_crc32_hash(const Slice& key) noexcept { + void insert(const StringRef& key) noexcept { if (key.data) { insert(HashUtil::crc_hash(key.data, key.size, _hash_seed)); } @@ -123,22 +117,13 @@ public: #endif } // Same as above with convenience of hashing the key. - bool find(const Slice& key) const noexcept { - if (key.data) { - return find(HashUtil::murmur_hash3_32(key.data, key.size, _hash_seed)); - } else { - return false; - } - } - - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - bool find_crc32_hash(const Slice& key) const noexcept { + bool find(const StringRef& key) const noexcept { if (key.data) { return find(HashUtil::crc_hash(key.data, key.size, _hash_seed)); - } else { - return false; } + return false; } + // Computes the logical OR of this filter with 'other' and stores the result in this // filter. // Notes: diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index dfb775cc0a..0323d44315 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -20,6 +20,7 @@ #include "exprs/block_bloom_filter.hpp" #include "exprs/runtime_filter.h" #include "olap/rowset/segment_v2/bloom_filter.h" // IWYU pragma: keep +#include "vec/common/string_ref.h" namespace doris { @@ -53,27 +54,12 @@ public: return _bloom_filter->find(data); } - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - template - bool test_new_hash(T data) const { - if constexpr (std::is_same_v) { - return _bloom_filter->find_crc32_hash(data); - } else { - return _bloom_filter->find(data); - } - } - - void add_bytes(const char* data, size_t len) { _bloom_filter->insert(Slice(data, len)); } - - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. 
- void add_bytes_new_hash(const char* data, size_t len) { - _bloom_filter->insert_crc32_hash(Slice(data, len)); - } + void add_bytes(const char* data, size_t len) { _bloom_filter->insert(StringRef(data, len)); } // test_element/find_element only used on vectorized engine template bool test_element(T element) const { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { return _bloom_filter->find(element); } else { return _bloom_filter->find(HashUtil::fixed_len_to_uint32(element)); @@ -82,7 +68,7 @@ public: template void add_element(T element) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { _bloom_filter->insert(element); } else { _bloom_filter->insert(HashUtil::fixed_len_to_uint32(element)); @@ -96,8 +82,6 @@ private: // Only Used In RuntimeFilter class BloomFilterFuncBase : public FilterFuncBase { public: - BloomFilterFuncBase() : _inited(false) {} - virtual ~BloomFilterFuncBase() = default; Status init(int64_t expect_num, double fpp) { @@ -112,9 +96,8 @@ public: Status init_with_fixed_length() { if (_build_bf_exactly) { return Status::OK(); - } else { - return init_with_fixed_length(_bloom_filter_length); } + return init_with_fixed_length(_bloom_filter_length); } Status init_with_cardinality(const size_t build_bf_cardinality) { @@ -127,10 +110,10 @@ public: // Handle case where ndv == 1 => ceil(log2(m/8)) < 0. int log_filter_size = std::max(0, (int)(std::ceil(std::log(m / 8) / std::log(2)))); - return init_with_fixed_length(((int64_t)1) << log_filter_size); - } else { - return Status::OK(); + _bloom_filter_length = std::min(((int64_t)1) << log_filter_size, _bloom_filter_length); + return init_with_fixed_length(_bloom_filter_length); } + return Status::OK(); } Status init_with_fixed_length(int64_t bloom_filter_length) { @@ -157,36 +140,35 @@ public: // allocate memory again. 
if (_inited) { DCHECK(bloomfilter_func != nullptr); - auto other_func = static_cast(bloomfilter_func); + auto* other_func = static_cast(bloomfilter_func); if (_bloom_filter_alloced != other_func->_bloom_filter_alloced) { - LOG(WARNING) << "bloom filter size not the same: already allocated bytes = " - << _bloom_filter_alloced << ", expected allocated bytes = " - << other_func->_bloom_filter_alloced; - return Status::InvalidArgument("bloom filter size invalid"); + return Status::InvalidArgument( + "bloom filter size not the same: already allocated bytes {}, expected " + "allocated bytes {}", + _bloom_filter_alloced, other_func->_bloom_filter_alloced); } return _bloom_filter->merge(other_func->_bloom_filter.get()); } { std::lock_guard l(_lock); if (!_inited) { - auto other_func = static_cast(bloomfilter_func); + auto* other_func = static_cast(bloomfilter_func); DCHECK(_bloom_filter == nullptr); DCHECK(bloomfilter_func != nullptr); _bloom_filter = bloomfilter_func->_bloom_filter; _bloom_filter_alloced = other_func->_bloom_filter_alloced; _inited = true; return Status::OK(); - } else { - DCHECK(bloomfilter_func != nullptr); - auto other_func = static_cast(bloomfilter_func); - if (_bloom_filter_alloced != other_func->_bloom_filter_alloced) { - LOG(WARNING) << "bloom filter size not the same: already allocated bytes = " - << _bloom_filter_alloced << ", expected allocated bytes = " - << other_func->_bloom_filter_alloced; - return Status::InvalidArgument("bloom filter size invalid"); - } - return _bloom_filter->merge(other_func->_bloom_filter.get()); } + DCHECK(bloomfilter_func != nullptr); + auto* other_func = static_cast(bloomfilter_func); + if (_bloom_filter_alloced != other_func->_bloom_filter_alloced) { + return Status::InvalidArgument( + "bloom filter size not the same: already allocated bytes {}, expected " + "allocated bytes {}", + _bloom_filter_alloced, other_func->_bloom_filter_alloced); + } + return _bloom_filter->merge(other_func->_bloom_filter.get()); } } @@ 
-208,7 +190,7 @@ public: size_t get_size() const { return _bloom_filter ? _bloom_filter->size() : 0; } void light_copy(BloomFilterFuncBase* bloomfilter_func) { - auto other_func = static_cast(bloomfilter_func); + auto* other_func = static_cast(bloomfilter_func); _bloom_filter_alloced = other_func->_bloom_filter_alloced; _bloom_filter = other_func->_bloom_filter; _inited = other_func->_inited; @@ -216,62 +198,47 @@ public: virtual void insert(const void* data) = 0; - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - virtual void insert_crc32_hash(const void* data) = 0; - virtual bool find(const void* data) const = 0; - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - virtual bool find_crc32_hash(const void* data) const = 0; - virtual bool find_olap_engine(const void* data) const = 0; virtual bool find_uint32_t(uint32_t data) const = 0; - virtual void insert_fixed_len(const char* data, const int* offsets, int number) = 0; + virtual void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) = 0; - virtual void insert_fixed_len(const char* data) = 0; + virtual void find_fixed_len(const vectorized::ColumnPtr& column, uint8_t* results) = 0; virtual uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number, bool is_parse_column) = 0; - virtual void find_fixed_len(const char* data, const uint8* nullmap, int number, - uint8* results) = 0; - protected: // bloom filter size int32_t _bloom_filter_alloced; std::shared_ptr _bloom_filter; - bool _inited; + bool _inited {}; std::mutex _lock; int64_t _bloom_filter_length; bool _build_bf_exactly = false; }; -template -struct CommonFindOp { - // test_batch/find_batch/find_batch_olap_engine only used on vectorized engine - void insert_batch(BloomFilterAdaptor& bloom_filter, const char* data, const int* offsets, - int number) const { - for (int i = 0; i 
< number; i++) { - bloom_filter.add_element(*((T*)data + offsets[i])); - } - } +struct BaseOp { + virtual ~BaseOp() = default; - void insert_single(BloomFilterAdaptor& bloom_filter, const char* data) const { - bloom_filter.add_element(*((T*)data)); - } + virtual bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, + const void* data) const = 0; - uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, - const uint8* nullmap, uint16_t* offsets, int number, - const bool is_parse_column) const { + uint16_t find_batch_olap_engine_with_element_size(const BloomFilterAdaptor& bloom_filter, + const char* data, const uint8* nullmap, + uint16_t* offsets, int number, + const bool is_parse_column, + size_t element_size) const { uint16_t new_size = 0; if (is_parse_column) { if (nullmap == nullptr) { for (int i = 0; i < number; i++) { uint16_t idx = offsets[i]; - if (!bloom_filter.test_element(*((T*)data + idx))) { + if (!find_olap_engine(bloom_filter, data + element_size * idx)) { continue; } offsets[new_size++] = idx; @@ -282,7 +249,7 @@ struct CommonFindOp { if (nullmap[idx]) { continue; } - if (!bloom_filter.test_element(*((T*)data + idx))) { + if (!find_olap_engine(bloom_filter, data + element_size * idx)) { continue; } offsets[new_size++] = idx; @@ -291,7 +258,7 @@ struct CommonFindOp { } else { if (nullmap == nullptr) { for (int i = 0; i < number; i++) { - if (!bloom_filter.test_element(*((T*)data + i))) { + if (!find_olap_engine(bloom_filter, data + element_size * i)) { continue; } offsets[new_size++] = i; @@ -301,7 +268,7 @@ struct CommonFindOp { if (nullmap[i]) { continue; } - if (!bloom_filter.test_element(*((T*)data + i))) { + if (!find_olap_engine(bloom_filter, data + element_size * i)) { continue; } offsets[new_size++] = i; @@ -310,28 +277,71 @@ struct CommonFindOp { } return new_size; } +}; - void find_batch(const BloomFilterAdaptor& bloom_filter, const char* data, const uint8* nullmap, - int number, uint8* results) const 
{ - for (int i = 0; i < number; i++) { - results[i] = false; - if (nullmap != nullptr && nullmap[i]) { - continue; +template +struct CommonFindOp : BaseOp { + uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, + const uint8* nullmap, uint16_t* offsets, int number, + const bool is_parse_column) { + return find_batch_olap_engine_with_element_size(bloom_filter, data, nullmap, offsets, + number, is_parse_column, sizeof(T)); + } + + void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, + size_t start) const { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = nullable->get_nested_column(); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + const T* data = (T*)col.get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + bloom_filter.add_element(*(data + i)); + } } - if (!bloom_filter.test_element(*((T*)data + i))) { - continue; + } else { + const T* data = (T*)column->get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + bloom_filter.add_element(*(data + i)); + } + } + } + + void find_batch(const BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, + uint8_t* results) const { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + const T* data = (T*)nullable->get_nested_column().get_raw_data().data; + for (size_t i = 0; i < column->size(); i++) { + if (!nullmap[i]) { + results[i] = bloom_filter.test_element(data[i]); + } else { + results[i] = false; + } + } + } else { + const T* data = (T*)column->get_raw_data().data; + for (size_t i = 0; i < column->size(); i++) { + results[i] = bloom_filter.test_element(data[i]); } - results[i] = true; } } void insert(BloomFilterAdaptor& bloom_filter, const void* data) const { - 
bloom_filter.add_bytes((char*)data, sizeof(T)); + bloom_filter.add_element(((T*)data)[0]); } bool find(const BloomFilterAdaptor& bloom_filter, const void* data) const { - return bloom_filter.test(Slice((char*)data, sizeof(T))); + return bloom_filter.test_element(((T*)data)[0]); } - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { return find(bloom_filter, data); } bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) const { @@ -339,64 +349,82 @@ struct CommonFindOp { } }; -struct StringFindOp { - void insert_batch(BloomFilterAdaptor& bloom_filter, const char* data, const int* offsets, - int number) const { - LOG(FATAL) << "StringFindOp does not support insert_batch"; - } - - void insert_single(BloomFilterAdaptor& bloom_filter, const char* data) const { - LOG(FATAL) << "StringFindOp does not support insert_single"; - } - +struct StringFindOp : public BaseOp { uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, const uint8* nullmap, uint16_t* offsets, int number, - const bool is_parse_column) const { - LOG(FATAL) << "StringFindOp does not support find_batch_olap_engine"; - return 0; + const bool is_parse_column) { + return find_batch_olap_engine_with_element_size(bloom_filter, data, nullmap, offsets, + number, is_parse_column, sizeof(StringRef)); } - void find_batch(const BloomFilterAdaptor& bloom_filter, const char* data, const uint8* nullmap, - int number, uint8* results) const { - LOG(FATAL) << "StringFindOp does not support find_batch"; + static void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, + size_t start) { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = + assert_cast(nullable->get_nested_column()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + 
.get_data(); + + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + bloom_filter.add_element(col.get_data_at(i)); + } + } + } else { + const auto& col = assert_cast(column.get()); + for (size_t i = start; i < column->size(); i++) { + bloom_filter.add_element(col->get_data_at(i)); + } + } } - void insert(BloomFilterAdaptor& bloom_filter, const void* data) const { + static void find_batch(const BloomFilterAdaptor& bloom_filter, + const vectorized::ColumnPtr& column, uint8_t* results) { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = + assert_cast(nullable->get_nested_column()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + for (size_t i = 0; i < column->size(); i++) { + if (!nullmap[i]) { + results[i] = bloom_filter.test_element(col.get_data_at(i)); + } else { + results[i] = false; + } + } + } else { + const auto& col = assert_cast(column.get()); + for (size_t i = 0; i < column->size(); i++) { + results[i] = bloom_filter.test_element(col->get_data_at(i)); + } + } + } + + static void insert(BloomFilterAdaptor& bloom_filter, const void* data) { const auto* value = reinterpret_cast(data); if (value) { bloom_filter.add_bytes(value->data, value->size); } } - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. 
- void insert_crc32_hash(BloomFilterAdaptor& bloom_filter, const void* data) const { - const auto* value = reinterpret_cast(data); - if (value) { - bloom_filter.add_bytes_new_hash(value->data, value->size); - } - } - - bool find(const BloomFilterAdaptor& bloom_filter, const void* data) const { + static bool find(const BloomFilterAdaptor& bloom_filter, const void* data) { const auto* value = reinterpret_cast(data); if (value == nullptr) { return false; } - return bloom_filter.test(Slice(value->data, value->size)); + return bloom_filter.test(*value); } - //This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - bool find_crc32_hash(const BloomFilterAdaptor& bloom_filter, const void* data) const { - const auto* value = reinterpret_cast(data); - if (value == nullptr) { - return false; - } - return bloom_filter.test_new_hash(Slice(value->data, value->size)); - } - - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { return StringFindOp::find(bloom_filter, data); } - bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) const { + + static bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) { return bloom_filter.test(data); } }; @@ -404,7 +432,8 @@ struct StringFindOp { // We do not need to judge whether data is empty, because null will not appear // when filer used by the storage engine struct FixedStringFindOp : public StringFindOp { - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* input_data) const { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, + const void* input_data) const override { const auto* value = reinterpret_cast(input_data); int64_t size = value->size; const char* data = value->data; @@ -412,7 +441,7 @@ struct FixedStringFindOp : public StringFindOp { while (size > 0 && data[size - 1] == '\0') { size--; } 
- return bloom_filter.test(Slice(value->data, size)); + return bloom_filter.test(StringRef(value->data, size)); } }; @@ -449,37 +478,13 @@ public: dummy.insert(*_bloom_filter, data); } - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - void insert_crc32_hash(const void* data) override { - if constexpr (std::is_same_v::FindOp, StringFindOp> || - std::is_same_v::FindOp, - FixedStringFindOp>) { - DCHECK(_bloom_filter != nullptr); - dummy.insert_crc32_hash(*_bloom_filter, data); - } else { - insert(data); - } - } - - void insert_fixed_len(const char* data, const int* offsets, int number) override { + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { DCHECK(_bloom_filter != nullptr); - dummy.insert_batch(*_bloom_filter, data, offsets, number); + dummy.insert_batch(*_bloom_filter, column, start); } - void insert_fixed_len(const char* data) override { - DCHECK(_bloom_filter != nullptr); - dummy.insert_single(*_bloom_filter, data); - } - - uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, - int number, const bool is_parse_column) override { - return dummy.find_batch_olap_engine(*_bloom_filter, data, nullmap, offsets, number, - is_parse_column); - } - - void find_fixed_len(const char* data, const uint8* nullmap, int number, - uint8* results) override { - dummy.find_batch(*_bloom_filter, data, nullmap, number, results); + void find_fixed_len(const vectorized::ColumnPtr& column, uint8_t* results) override { + dummy.find_batch(*_bloom_filter, column, results); } bool find(const void* data) const override { @@ -487,23 +492,18 @@ public: return dummy.find(*_bloom_filter, data); } - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. 
- bool find_crc32_hash(const void* data) const override { - if constexpr (std::is_same_v::FindOp, StringFindOp> || - std::is_same_v::FindOp, - FixedStringFindOp>) { - DCHECK(_bloom_filter != nullptr); - return dummy.find_crc32_hash(*_bloom_filter, data); - } - return find(data); - } - bool find_olap_engine(const void* data) const override { return dummy.find_olap_engine(*_bloom_filter, data); } bool find_uint32_t(uint32_t data) const override { return dummy.find(*_bloom_filter, data); } + uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, + int number, bool is_parse_column) override { + return dummy.find_batch_olap_engine(*_bloom_filter, data, nullmap, offsets, number, + is_parse_column); + } + private: typename BloomFilterTypeTraits::FindOp dummy; }; diff --git a/be/src/exprs/hybrid_set.h b/be/src/exprs/hybrid_set.h index 6a90bdd47c..9151dc7d3b 100644 --- a/be/src/exprs/hybrid_set.h +++ b/be/src/exprs/hybrid_set.h @@ -29,7 +29,7 @@ namespace doris { -#define FIXED_CONTAINER_MAX_SIZE 8 +constexpr int FIXED_CONTAINER_MAX_SIZE = 8; /** * Fix Container can use simd to improve performance. 1 <= N <= 8 can be improved performance by test. FIXED_CONTAINER_MAX_SIZE = 8. 
@@ -44,7 +44,7 @@ public: class Iterator; - FixedContainer() : _size(0) { static_assert(N >= 0 && N <= FIXED_CONTAINER_MAX_SIZE); } + FixedContainer() { static_assert(N >= 0 && N <= FIXED_CONTAINER_MAX_SIZE); } ~FixedContainer() = default; @@ -141,7 +141,7 @@ public: private: std::array _data; - size_t _size; + size_t _size {}; }; /** @@ -183,7 +183,7 @@ public: // use in vectorize execute engine virtual void insert(void* data, size_t) = 0; - virtual void insert_fixed_len(const char* data, const int* offsets, int number) = 0; + virtual void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) = 0; virtual void insert(HybridSetBase* set) { HybridSetBase::IteratorBase* iter = set->begin(); @@ -199,11 +199,6 @@ public: // use in vectorize execute engine virtual bool find(const void* data, size_t) const = 0; - virtual void find_fixed_len(const char* __restrict data, const uint8* __restrict null_map, - int number, uint8* __restrict results) { - LOG(FATAL) << "HybridSetBase not support find_fixed_len"; - } - virtual void find_batch(const doris::vectorized::IColumn& column, size_t rows, doris::vectorized::ColumnUInt8::Container& results) { LOG(FATAL) << "HybridSetBase not support find_batch"; @@ -275,21 +270,29 @@ public: if (data == nullptr) { return; } - - if constexpr (sizeof(ElementType) >= 16) { - // for large int, it will core dump with no memcpy - ElementType value; - memcpy(&value, data, sizeof(ElementType)); - _set.insert(value); - } else { - _set.insert(*reinterpret_cast(data)); - } + _set.insert(*reinterpret_cast(data)); } - void insert(void* data, size_t) override { insert(data); } + void insert(void* data, size_t /*unused*/) override { insert(data); } - void insert_fixed_len(const char* data, const int* offsets, int number) override { - for (int i = 0; i < number; i++) { - insert((void*)((ElementType*)data + offsets[i])); + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { + if (column->is_nullable()) { + const 
auto* nullable = assert_cast(column.get()); + const auto& col = nullable->get_nested_column(); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + const ElementType* data = (ElementType*)col.get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + _set.insert(*(data + i)); + } + } + } else { + const ElementType* data = (ElementType*)column->get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + _set.insert(*(data + i)); + } } } @@ -303,21 +306,7 @@ public: return _set.find(*reinterpret_cast(data)); } - bool find(const void* data, size_t) const override { return find(data); } - - void find_fixed_len(const char* __restrict data, const uint8* __restrict null_map, int number, - uint8* __restrict results) override { - ElementType* value = (ElementType*)data; - if (null_map == nullptr) { - for (int i = 0; i < number; i++) { - results[i] = _set.find(value[i]); - } - } else { - for (int i = 0; i < number; i++) { - results[i] = _set.find(value[i]) & !null_map[i]; - } - } - } + bool find(const void* data, size_t /*unused*/) const override { return find(data); } void find_batch(const doris::vectorized::IColumn& column, size_t rows, doris::vectorized::ColumnUInt8::Container& results) override { @@ -414,8 +403,26 @@ public: _set.insert(str_value); } - void insert_fixed_len(const char* data, const int* offsets, int number) override { - LOG(FATAL) << "string set not support insert_fixed_len"; + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = + assert_cast(nullable->get_nested_column()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + _set.insert(col.get_data_at(i).to_string()); + } + } + } else { + const auto& col = 
assert_cast(column.get()); + for (size_t i = start; i < column->size(); i++) { + _set.insert(col->get_data_at(i).to_string()); + } + } } int size() override { return _set.size(); } @@ -425,7 +432,7 @@ public: return false; } - auto* value = reinterpret_cast(data); + const auto* value = reinterpret_cast(data); std::string str_value(const_cast(value->data), value->size); return _set.find(str_value); } @@ -461,7 +468,7 @@ public: void _find_batch(const doris::vectorized::IColumn& column, size_t rows, const doris::vectorized::NullMap* null_map, doris::vectorized::ColumnUInt8::Container& results) { - auto& col = assert_cast(column); + const auto& col = assert_cast(column); const uint8_t* __restrict null_map_data; if constexpr (is_nullable) { null_map_data = null_map->data(); @@ -538,8 +545,26 @@ public: _set.insert(sv); } - void insert_fixed_len(const char* data, const int* offsets, int number) override { - LOG(FATAL) << "string set not support insert_fixed_len"; + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = + assert_cast(nullable->get_nested_column()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + _set.insert(col.get_data_at(i)); + } + } + } else { + const auto& col = assert_cast(column.get()); + for (size_t i = start; i < column->size(); i++) { + _set.insert(col->get_data_at(i)); + } + } } int size() override { return _set.size(); } @@ -549,7 +574,7 @@ public: return false; } - auto* value = reinterpret_cast(data); + const auto* value = reinterpret_cast(data); return _set.find(*value); } @@ -588,10 +613,10 @@ public: void _find_batch(const doris::vectorized::IColumn& column, size_t rows, const doris::vectorized::NullMap* null_map, doris::vectorized::ColumnUInt8::Container& results) { - auto& col = 
assert_cast(column); + const auto& col = assert_cast(column); const uint32_t* __restrict offset = col.get_offsets().data(); const uint8_t* __restrict data = col.get_chars().data(); - uint8_t* __restrict cursor = const_cast(data); + auto* __restrict cursor = const_cast(data); const uint8_t* __restrict null_map_data; if constexpr (is_nullable) { null_map_data = null_map->data(); diff --git a/be/src/exprs/minmax_predicate.h b/be/src/exprs/minmax_predicate.h index cdf898292f..fcf2ef44a1 100644 --- a/be/src/exprs/minmax_predicate.h +++ b/be/src/exprs/minmax_predicate.h @@ -17,17 +17,22 @@ #pragma once +#include + #include "common/object_pool.h" #include "runtime/type_limit.h" +#include "vec/columns/column.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/common/assert_cast.h" namespace doris { // only used in Runtime Filter class MinMaxFuncBase { public: virtual void insert(const void* data) = 0; - virtual void insert_fixed_len(const char* data, const int* offsets, int number) = 0; + virtual void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) = 0; virtual bool find(void* data) = 0; - virtual bool is_empty() = 0; virtual void* get_max() = 0; virtual void* get_min() = 0; // assign minmax data @@ -37,7 +42,7 @@ public: virtual ~MinMaxFuncBase() = default; }; -template +template class MinMaxNumFunc : public MinMaxFuncBase { public: MinMaxNumFunc() = default; @@ -50,32 +55,78 @@ public: T val_data = *reinterpret_cast(data); - if (_empty) { - _min = val_data; - _max = val_data; - _empty = false; - return; + if constexpr (NeedMin) { + if (val_data < _min) { + _min = val_data; + } } - if (val_data < _min) { - _min = val_data; - } else if (val_data > _max) { - _max = val_data; + + if constexpr (NeedMax) { + if (val_data > _max) { + _max = val_data; + } } } - void insert_fixed_len(const char* data, const int* offsets, int number) override { - if (!number) { + void insert_fixed_len(const vectorized::ColumnPtr& 
column, size_t start) override { + if (column->empty()) { return; } - if (_empty) { - _min = *((T*)data + offsets[0]); - _max = *((T*)data + offsets[0]); + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = nullable->get_nested_column(); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + if constexpr (std::is_same_v) { + const auto& column_string = assert_cast(col); + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + if constexpr (NeedMin) { + _min = std::min(_min, column_string.get_data_at(i)); + } + if constexpr (NeedMax) { + _max = std::max(_max, column_string.get_data_at(i)); + } + } + } + } else { + const T* data = (T*)col.get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + if constexpr (NeedMin) { + _min = std::min(_min, *(data + i)); + } + if constexpr (NeedMax) { + _max = std::max(_max, *(data + i)); + } + } + } + } + } else { + if constexpr (std::is_same_v) { + const auto& column_string = assert_cast(*column); + for (size_t i = start; i < column->size(); i++) { + if constexpr (NeedMin) { + _min = std::min(_min, column_string.get_data_at(i)); + } + if constexpr (NeedMax) { + _max = std::max(_max, column_string.get_data_at(i)); + } + } + } else { + const T* data = (T*)column->get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + if constexpr (NeedMin) { + _min = std::min(_min, *(data + i)); + } + if constexpr (NeedMax) { + _max = std::max(_max, *(data + i)); + } + } + } } - for (int i = _empty; i < number; i++) { - _min = std::min(_min, *((T*)data + offsets[i])); - _max = std::max(_max, *((T*)data + offsets[i])); - } - _empty = false; } bool find(void* data) override { @@ -84,40 +135,55 @@ public: } T val_data = *reinterpret_cast(data); - return val_data >= _min && val_data <= _max; + if constexpr (NeedMin) { + if (val_data < _min) { + return false; + } + } + if constexpr (NeedMax) { 
+ if (val_data > _max) { + return false; + } + } + return true; } Status merge(MinMaxFuncBase* minmax_func, ObjectPool* pool) override { if constexpr (std::is_same_v) { - MinMaxNumFunc* other_minmax = static_cast*>(minmax_func); - - if (other_minmax->_min < _min) { - auto& other_min = other_minmax->_min; - auto str = pool->add(new std::string(other_min.data, other_min.size)); - _min.data = str->data(); - _min.size = str->length(); + auto* other_minmax = static_cast*>(minmax_func); + if constexpr (NeedMin) { + if (other_minmax->_min < _min) { + auto& other_min = other_minmax->_min; + auto* str = pool->add(new std::string(other_min.data, other_min.size)); + _min.data = str->data(); + _min.size = str->length(); + } } - if (other_minmax->_max > _max) { - auto& other_max = other_minmax->_max; - auto str = pool->add(new std::string(other_max.data, other_max.size)); - _max.data = str->data(); - _max.size = str->length(); + if constexpr (NeedMax) { + if (other_minmax->_max > _max) { + auto& other_max = other_minmax->_max; + auto* str = pool->add(new std::string(other_max.data, other_max.size)); + _max.data = str->data(); + _max.size = str->length(); + } } } else { - MinMaxNumFunc* other_minmax = static_cast*>(minmax_func); - if (other_minmax->_min < _min) { - _min = other_minmax->_min; + auto* other_minmax = static_cast*>(minmax_func); + if constexpr (NeedMin) { + if (other_minmax->_min < _min) { + _min = other_minmax->_min; + } } - if (other_minmax->_max > _max) { - _max = other_minmax->_max; + if constexpr (NeedMax) { + if (other_minmax->_max > _max) { + _max = other_minmax->_max; + } } } return Status::OK(); } - bool is_empty() override { return _empty; } - void* get_max() override { return &_max; } void* get_min() override { return &_min; } @@ -131,161 +197,12 @@ public: protected: T _max = type_limit::min(); T _min = type_limit::max(); - // we use _empty to avoid compare twice - bool _empty = true; }; template -class MinNumFunc : public MinMaxNumFunc { -public: - 
MinNumFunc() = default; - ~MinNumFunc() override = default; - - void insert(const void* data) override { - if (data == nullptr) { - return; - } - - T val_data = *reinterpret_cast(data); - - if (this->_empty) { - this->_min = val_data; - this->_empty = false; - return; - } - if (val_data < this->_min) { - this->_min = val_data; - } - } - - void insert_fixed_len(const char* data, const int* offsets, int number) override { - if (!number) { - return; - } - if (this->_empty) { - this->_min = *((T*)data + offsets[0]); - } - for (int i = this->_empty; i < number; i++) { - this->_min = std::min(this->_min, *((T*)data + offsets[i])); - } - this->_empty = false; - } - - bool find(void* data) override { - if (data == nullptr) { - return false; - } - - T val_data = *reinterpret_cast(data); - return val_data >= this->_min; - } - - Status merge(MinMaxFuncBase* minmax_func, ObjectPool* pool) override { - if constexpr (std::is_same_v) { - MinNumFunc* other_minmax = assert_cast*>(minmax_func); - if (other_minmax->_min < this->_min) { - auto& other_min = other_minmax->_min; - auto str = pool->add(new std::string(other_min.data, other_min.size)); - this->_min.data = str->data(); - this->_min.size = str->length(); - } - } else { - MinNumFunc* other_minmax = assert_cast*>(minmax_func); - if (other_minmax->_min < this->_min) { - this->_min = other_minmax->_min; - } - } - - return Status::OK(); - } - - //min filter the max is useless, so return nullptr directly - void* get_max() override { - DCHECK(false); - return nullptr; - } - - Status assign(void* min_data, void* max_data) override { - this->_min = *(T*)min_data; - return Status::OK(); - } -}; +using MinNumFunc = MinMaxNumFunc; template -class MaxNumFunc : public MinMaxNumFunc { -public: - MaxNumFunc() = default; - ~MaxNumFunc() override = default; - - void insert(const void* data) override { - if (data == nullptr) { - return; - } - - T val_data = *reinterpret_cast(data); - - if (this->_empty) { - this->_max = val_data; - 
this->_empty = false; - return; - } - if (val_data > this->_max) { - this->_max = val_data; - } - } - - void insert_fixed_len(const char* data, const int* offsets, int number) override { - if (!number) { - return; - } - if (this->_empty) { - this->_max = *((T*)data + offsets[0]); - } - for (int i = this->_empty; i < number; i++) { - this->_max = std::max(this->_max, *((T*)data + offsets[i])); - } - this->_empty = false; - } - - bool find(void* data) override { - if (data == nullptr) { - return false; - } - - T val_data = *reinterpret_cast(data); - return val_data <= this->_max; - } - - Status merge(MinMaxFuncBase* minmax_func, ObjectPool* pool) override { - if constexpr (std::is_same_v) { - MinMaxNumFunc* other_minmax = assert_cast*>(minmax_func); - - if (other_minmax->_max > this->_max) { - auto& other_max = other_minmax->_max; - auto str = pool->add(new std::string(other_max.data, other_max.size)); - this->_max.data = str->data(); - this->_max.size = str->length(); - } - } else { - MinMaxNumFunc* other_minmax = assert_cast*>(minmax_func); - if (other_minmax->_max > this->_max) { - this->_max = other_minmax->_max; - } - } - - return Status::OK(); - } - - //max filter the min is useless, so return nullptr directly - void* get_min() override { - DCHECK(false); - return nullptr; - } - - Status assign(void* min_data, void* max_data) override { - this->_max = *(T*)max_data; - return Status::OK(); - } -}; +using MaxNumFunc = MinMaxNumFunc; } // namespace doris diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index aac153b08b..31bf202598 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -52,6 +52,7 @@ #include "util/string_parser.hpp" #include "vec/columns/column.h" #include "vec/columns/column_complex.h" +#include "vec/columns/column_nullable.h" #include "vec/common/assert_cast.h" #include "vec/core/wide_integer.h" #include "vec/core/wide_integer_to_string.h" @@ -286,10 +287,7 @@ public: _pool(pool), 
_column_return_type(params->column_return_type), _filter_type(params->filter_type), - _filter_id(params->filter_id), - _use_batch( - IRuntimeFilter::enable_use_batch(_be_exec_version > 0, _column_return_type)), - _use_new_hash(_be_exec_version >= 2) {} + _filter_id(params->filter_id) {} // for a 'tmp' runtime predicate wrapper // only could called assign method or as a param for merge RuntimePredicateWrapper(RuntimeState* state, ObjectPool* pool, PrimitiveType column_type, @@ -299,10 +297,7 @@ public: _pool(pool), _column_return_type(column_type), _filter_type(type), - _filter_id(filter_id), - _use_batch( - IRuntimeFilter::enable_use_batch(_be_exec_version > 0, _column_return_type)), - _use_new_hash(_be_exec_version >= 2) {} + _filter_id(filter_id) {} RuntimePredicateWrapper(QueryContext* query_ctx, ObjectPool* pool, const RuntimeFilterParams* params) @@ -311,10 +306,7 @@ public: _pool(pool), _column_return_type(params->column_return_type), _filter_type(params->filter_type), - _filter_id(params->filter_id), - _use_batch( - IRuntimeFilter::enable_use_batch(_be_exec_version > 0, _column_return_type)), - _use_new_hash(_be_exec_version >= 2) {} + _filter_id(params->filter_id) {} // for a 'tmp' runtime predicate wrapper // only could called assign method or as a param for merge RuntimePredicateWrapper(QueryContext* query_ctx, ObjectPool* pool, PrimitiveType column_type, @@ -324,10 +316,7 @@ public: _pool(pool), _column_return_type(column_type), _filter_type(type), - _filter_id(filter_id), - _use_batch( - IRuntimeFilter::enable_use_batch(_be_exec_version > 0, _column_return_type)), - _use_new_hash(_be_exec_version >= 2) {} + _filter_id(filter_id) {} // init runtime filter wrapper // alloc memory to init runtime filter function Status init(const RuntimeFilterParams* params) { @@ -389,23 +378,10 @@ public: void insert_to_bloom_filter(BloomFilterFuncBase* bloom_filter) const { if (_context.hybrid_set->size() > 0) { - auto it = _context.hybrid_set->begin(); - - if 
(_use_batch) { - while (it->has_next()) { - bloom_filter->insert_fixed_len((char*)it->get_value()); - it->next(); - } - } else { - while (it->has_next()) { - if (_use_new_hash) { - bloom_filter->insert_crc32_hash(it->get_value()); - } else { - bloom_filter->insert(it->get_value()); - } - - it->next(); - } + auto* it = _context.hybrid_set->begin(); + while (it->has_next()) { + bloom_filter->insert(it->get_value()); + it->next(); } } } @@ -428,20 +404,12 @@ public: break; } case RuntimeFilterType::BLOOM_FILTER: { - if (_use_new_hash) { - _context.bloom_filter_func->insert_crc32_hash(data); - } else { - _context.bloom_filter_func->insert(data); - } + _context.bloom_filter_func->insert(data); break; } case RuntimeFilterType::IN_OR_BLOOM_FILTER: { if (_is_bloomfilter) { - if (_use_new_hash) { - _context.bloom_filter_func->insert_crc32_hash(data); - } else { - _context.bloom_filter_func->insert(data); - } + _context.bloom_filter_func->insert(data); } else { _context.hybrid_set->insert(data); } @@ -457,30 +425,30 @@ public: } } - void insert_fixed_len(const char* data, const int* offsets, int number) { + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) { switch (_filter_type) { case RuntimeFilterType::IN_FILTER: { if (_is_ignored_in_filter) { break; } - _context.hybrid_set->insert_fixed_len(data, offsets, number); + _context.hybrid_set->insert_fixed_len(column, start); break; } case RuntimeFilterType::MIN_FILTER: case RuntimeFilterType::MAX_FILTER: case RuntimeFilterType::MINMAX_FILTER: { - _context.minmax_func->insert_fixed_len(data, offsets, number); + _context.minmax_func->insert_fixed_len(column, start); break; } case RuntimeFilterType::BLOOM_FILTER: { - _context.bloom_filter_func->insert_fixed_len(data, offsets, number); + _context.bloom_filter_func->insert_fixed_len(column, start); break; } case RuntimeFilterType::IN_OR_BLOOM_FILTER: { if (_is_bloomfilter) { - _context.bloom_filter_func->insert_fixed_len(data, offsets, number); + 
_context.bloom_filter_func->insert_fixed_len(column, start); } else { - _context.hybrid_set->insert_fixed_len(data, offsets, number); + _context.hybrid_set->insert_fixed_len(column, start); } break; } @@ -508,24 +476,33 @@ public: } } - void insert_batch(const vectorized::ColumnPtr column, const std::vector& rows) { + void insert_batch(const vectorized::ColumnPtr& column, size_t start) { if (get_real_type() == RuntimeFilterType::BITMAP_FILTER) { - bitmap_filter_insert_batch(column, rows); - } else if (IRuntimeFilter::enable_use_batch(_be_exec_version > 0, _column_return_type)) { - insert_fixed_len(column->get_raw_data().data, rows.data(), rows.size()); + bitmap_filter_insert_batch(column, start); } else { - for (int index : rows) { - insert(column->get_data_at(index)); - } + insert_fixed_len(column, start); } } - void bitmap_filter_insert_batch(const vectorized::ColumnPtr column, - const std::vector& rows) { + void bitmap_filter_insert_batch(const vectorized::ColumnPtr column, size_t start) { std::vector bitmaps; - auto* col = assert_cast*>(column.get()); - for (int index : rows) { - bitmaps.push_back(&(col->get_data()[index])); + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = + assert_cast(nullable->get_nested_column()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + bitmaps.push_back(&(col.get_data()[i])); + } + } + } else { + const auto* col = assert_cast(column.get()); + for (size_t i = start; i < column->size(); i++) { + bitmaps.push_back(&(col->get_data()[i])); + } } _context.bitmap_filter_func->insert_many(bitmaps); } @@ -1039,13 +1016,6 @@ private: bool _is_ignored_in_filter = false; std::string* _ignored_in_filter_msg = nullptr; uint32_t _filter_id; - - // When _column_return_type is invalid, _use_batch will be always false. 
- bool _use_batch; - - // When _use_new_hash is set to true, use the new hash method. - // This is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - const bool _use_new_hash; }; Status IRuntimeFilter::create(RuntimeState* state, ObjectPool* pool, const TRuntimeFilterDesc* desc, @@ -1092,10 +1062,9 @@ void IRuntimeFilter::insert(const StringRef& value) { _wrapper->insert(value); } -void IRuntimeFilter::insert_batch(const vectorized::ColumnPtr column, - const std::vector& rows) { +void IRuntimeFilter::insert_batch(const vectorized::ColumnPtr column, size_t start) { DCHECK(is_producer()); - _wrapper->insert_batch(column, rows); + _wrapper->insert_batch(column, start); } Status IRuntimeFilter::publish() { diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index 797b217662..4b8c982fee 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -248,7 +248,7 @@ public: // only used for producer void insert(const void* data); void insert(const StringRef& data); - void insert_batch(vectorized::ColumnPtr column, const std::vector& rows); + void insert_batch(vectorized::ColumnPtr column, size_t start); // publish filter // push filter to remote node or push down it to scan_node @@ -336,10 +336,6 @@ public: void update_runtime_filter_type_to_profile(); - static bool enable_use_batch(bool use_batch, PrimitiveType type) { - return use_batch && (is_int_or_bool(type) || is_float_or_double(type)); - } - int filter_id() const { return _filter_id; } static std::string to_string(RuntimeFilterType type) { diff --git a/be/src/exprs/runtime_filter_slots.h b/be/src/exprs/runtime_filter_slots.h index 6c96b16055..62cf0eab7d 100644 --- a/be/src/exprs/runtime_filter_slots.h +++ b/be/src/exprs/runtime_filter_slots.h @@ -37,7 +37,7 @@ public: const std::vector& runtime_filter_descs) : _build_expr_context(build_expr_ctxs), _runtime_filter_descs(runtime_filter_descs) {} - Status init(RuntimeState* state, 
int64_t hash_table_size, size_t build_bf_cardinality) { + Status init(RuntimeState* state, int64_t hash_table_size) { // runtime filter effect strategy // 1. we will ignore IN filter when hash_table_size is too big // 2. we will ignore BLOOM filter and MinMax filter when hash_table_size @@ -111,7 +111,7 @@ public: } if (runtime_filter->is_bloomfilter()) { - RETURN_IF_ERROR(runtime_filter->init_bloom_filter(build_bf_cardinality)); + RETURN_IF_ERROR(runtime_filter->init_bloom_filter(hash_table_size)); } // Note: @@ -162,7 +162,7 @@ public: return Status::OK(); } - void insert(std::unordered_map>& datas) { + void insert(const std::unordered_set& datas) { for (int i = 0; i < _build_expr_context.size(); ++i) { auto iter = _runtime_filters.find(i); if (iter == _runtime_filters.end()) { @@ -170,29 +170,10 @@ public: } int result_column_id = _build_expr_context[i]->get_last_result_column_id(); - for (auto it : datas) { - auto& column = it.first->get_by_position(result_column_id).column; - - if (auto* nullable = - vectorized::check_and_get_column(*column)) { - auto& column_nested = nullable->get_nested_column_ptr(); - auto& column_nullmap = nullable->get_null_map_column_ptr(); - std::vector indexs; - for (int row_num : it.second) { - if (assert_cast(column_nullmap.get()) - ->get_bool(row_num)) { - continue; - } - indexs.push_back(row_num); - } - for (auto filter : iter->second) { - filter->insert_batch(column_nested, indexs); - } - - } else { - for (auto filter : iter->second) { - filter->insert_batch(column, it.second); - } + for (const auto* it : datas) { + auto column = it->get_by_position(result_column_id).column; + for (auto* filter : iter->second) { + filter->insert_batch(column, 1); } } } diff --git a/be/src/exprs/runtime_filter_slots_cross.h b/be/src/exprs/runtime_filter_slots_cross.h index 4868b27a4e..76b6085bab 100644 --- a/be/src/exprs/runtime_filter_slots_cross.h +++ b/be/src/exprs/runtime_filter_slots_cross.h @@ -61,7 +61,7 @@ public: Status 
insert(vectorized::Block* block) { for (int i = 0; i < _runtime_filters.size(); ++i) { auto* filter = _runtime_filters[i]; - auto& vexpr_ctx = filter_src_expr_ctxs[i]; + const auto& vexpr_ctx = filter_src_expr_ctxs[i]; int result_column_id = -1; RETURN_IF_ERROR(vexpr_ctx->execute(block, &result_column_id)); @@ -70,25 +70,7 @@ public: block->get_by_position(result_column_id) .column->convert_to_full_column_if_const(); - auto& column = block->get_by_position(result_column_id).column; - if (auto* nullable = - vectorized::check_and_get_column(*column)) { - auto& column_nested = nullable->get_nested_column_ptr(); - auto& column_nullmap = nullable->get_null_map_column_ptr(); - std::vector indexs; - for (int row_index = 0; row_index < column->size(); ++row_index) { - if (assert_cast(column_nullmap.get()) - ->get_bool(row_index)) { - continue; - } - indexs.push_back(row_index); - } - filter->insert_batch(column_nested, indexs); - } else { - std::vector rows(column->size()); - std::iota(rows.begin(), rows.end(), 0); - filter->insert_batch(column, rows); - } + filter->insert_batch(block->get_by_position(result_column_id).column, 0); } return Status::OK(); } @@ -100,7 +82,7 @@ public: return Status::OK(); } - bool empty() { return !_runtime_filters.size(); } + bool empty() { return _runtime_filters.empty(); } private: const std::vector& _runtime_filter_descs; diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index 7280a1e836..156f054a3f 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -66,52 +66,17 @@ private: uint16_t new_size = 0; if (column.is_column_dictionary()) { const auto* dict_col = reinterpret_cast(&column); - if (_be_exec_version >= 2) { - for (uint16_t i = 0; i < size; i++) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - if constexpr (is_nullable) { - new_size += !null_map[idx] && _specific_filter->find_uint32_t( - dict_col->get_crc32_hash_value(idx)); - } else { - new_size 
+= _specific_filter->find_uint32_t( - dict_col->get_crc32_hash_value(idx)); - } - } - } else { - for (uint16_t i = 0; i < size; i++) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - if constexpr (is_nullable) { - new_size += !null_map[idx] && - _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); - } else { - new_size += _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); - } - } - } - } else if (is_string_type(T) && _be_exec_version >= 2) { - auto& pred_col = - reinterpret_cast< - const vectorized::PredicateColumnType>*>( - &column) - ->get_data(); - - auto pred_col_data = pred_col.data(); - const bool is_dense_column = pred_col.size() == size; for (uint16_t i = 0; i < size; i++) { - uint16_t idx = is_dense_column ? i : sel[i]; + uint16_t idx = sel[i]; + sel[new_size] = idx; if constexpr (is_nullable) { - if (!null_map[idx] && _specific_filter->find_crc32_hash(&pred_col_data[idx])) { - sel[new_size++] = idx; - } + new_size += !null_map[idx] && + _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); } else { - if (_specific_filter->find_crc32_hash(&pred_col_data[idx])) { - sel[new_size++] = idx; - } + new_size += _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); } } - } else if (IRuntimeFilter::enable_use_batch(_be_exec_version > 0, T)) { + } else { const auto& data = reinterpret_cast< const vectorized::PredicateColumnType>*>( @@ -119,20 +84,6 @@ private: ->get_data(); new_size = _specific_filter->find_fixed_len_olap_engine((char*)data.data(), null_map, sel, size, data.size() != size); - } else { - auto& pred_col = - reinterpret_cast< - const vectorized::PredicateColumnType>*>( - &column) - ->get_data(); - - auto pred_col_data = pred_col.data(); -#define EVALUATE_WITH_NULL_IMPL(IDX) \ - !null_map[IDX] && _specific_filter->find_olap_engine(&pred_col_data[IDX]) -#define EVALUATE_WITHOUT_NULL_IMPL(IDX) _specific_filter->find_olap_engine(&pred_col_data[IDX]) - EVALUATE_BY_SELECTOR(EVALUATE_WITH_NULL_IMPL, 
EVALUATE_WITHOUT_NULL_IMPL) -#undef EVALUATE_WITH_NULL_IMPL -#undef EVALUATE_WITHOUT_NULL_IMPL } return new_size; } @@ -164,8 +115,8 @@ uint16_t BloomFilterColumnPredicate::evaluate(const vectorized::IColumn& colu return size; } if (column.is_nullable()) { - auto* nullable_col = reinterpret_cast(&column); - auto& null_map_data = nullable_col->get_null_map_column().get_data(); + const auto* nullable_col = reinterpret_cast(&column); + const auto& null_map_data = nullable_col->get_null_map_column().get_data(); new_size = evaluate(nullable_col->get_nested_column(), null_map_data.data(), sel, size); } else { diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 41b030b4e1..1e31014512 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -39,10 +39,7 @@ Overload(Callables&&... callables) -> Overload; HashJoinBuildSinkLocalState::HashJoinBuildSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) - : JoinBuildSinkLocalState(parent, state), - _build_block_idx(0), - _build_side_mem_used(0), - _build_side_last_mem_used(0) {} + : JoinBuildSinkLocalState(parent, state) {} Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(JoinBuildSinkLocalState::init(state, info)); @@ -52,15 +49,7 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _parent->operator_id(), _parent->node_id(), state->get_query_ctx()); auto& p = _parent->cast(); _shared_state->join_op_variants = p._join_op_variants; - if (p._is_broadcast_join && state->enable_share_hash_table_for_broadcast_join()) { - _shared_state->build_blocks = p._shared_hash_table_context->blocks; - } else { - _shared_state->build_blocks.reset(new std::vector()); - // avoid vector expand change block address. - // one block can store 4g data, _build_blocks can store 128*4g data. 
- // if probe data bigger than 512g, runtime filter maybe will core dump when insert data. - _shared_state->build_blocks->reserve(vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT); - } + _shared_state->is_null_safe_eq_join = p._is_null_safe_eq_join; _shared_state->store_null_in_hash_table = p._store_null_in_hash_table; _build_expr_ctxs.resize(p._build_expr_ctxs.size()); @@ -84,11 +73,6 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _shared_hash_table_dependency->block(); p._shared_hashtable_controller->append_dependency(p.node_id(), _shared_hash_table_dependency); - } else if (p._is_broadcast_join) { - // avoid vector expand change block address. - // one block can store 4g data, _build_blocks can store 128*4g data. - // if probe data bigger than 512g, runtime filter maybe will core dump when insert data. - _shared_state->build_blocks->reserve(vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT); } _memory_usage_counter = ADD_LABEL_COUNTER(profile(), "MemoryUsage"); @@ -106,17 +90,10 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _build_side_merge_block_timer = ADD_TIMER(profile(), "BuildSideMergeBlockTime"); _build_table_insert_timer = ADD_TIMER(record_profile, "BuildTableInsertTime"); _build_expr_call_timer = ADD_TIMER(record_profile, "BuildExprCallTime"); - _build_table_expanse_timer = ADD_TIMER(record_profile, "BuildTableExpanseTime"); - _build_table_convert_timer = ADD_TIMER(record_profile, "BuildTableConvertToPartitionedTime"); _build_side_compute_hash_timer = ADD_TIMER(record_profile, "BuildSideHashComputingTime"); - _build_runtime_filter_timer = ADD_TIMER(record_profile, "BuildRuntimeFilterTime"); _allocate_resource_timer = ADD_TIMER(profile(), "AllocateResourceTime"); - _build_buckets_counter = ADD_COUNTER(profile(), "BuildBuckets", TUnit::UNIT); - _build_buckets_fill_counter = ADD_COUNTER(profile(), "FilledBuckets", TUnit::UNIT); - - _build_collisions_counter = ADD_COUNTER(profile(), 
"BuildCollisions", TUnit::UNIT); // Hash Table Init _hash_table_init(state); @@ -158,19 +135,18 @@ void HashJoinBuildSinkLocalState::init_short_circuit_for_probe() { _shared_state->short_circuit_for_probe = (_shared_state->_has_null_in_build_side && p._join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && !p._is_mark_join) || - (_shared_state->build_blocks->empty() && p._join_op == TJoinOp::INNER_JOIN && + (!_shared_state->build_block && p._join_op == TJoinOp::INNER_JOIN && !p._is_mark_join) || - (_shared_state->build_blocks->empty() && p._join_op == TJoinOp::LEFT_SEMI_JOIN && + (!_shared_state->build_block && p._join_op == TJoinOp::LEFT_SEMI_JOIN && !p._is_mark_join) || - (_shared_state->build_blocks->empty() && p._join_op == TJoinOp::RIGHT_OUTER_JOIN) || - (_shared_state->build_blocks->empty() && p._join_op == TJoinOp::RIGHT_SEMI_JOIN) || - (_shared_state->build_blocks->empty() && p._join_op == TJoinOp::RIGHT_ANTI_JOIN); + (!_shared_state->build_block && p._join_op == TJoinOp::RIGHT_OUTER_JOIN) || + (!_shared_state->build_block && p._join_op == TJoinOp::RIGHT_SEMI_JOIN) || + (!_shared_state->build_block && p._join_op == TJoinOp::RIGHT_ANTI_JOIN); //when build table rows is 0 and not have other_join_conjunct and not _is_mark_join and join type is one of LEFT_OUTER_JOIN/FULL_OUTER_JOIN/LEFT_ANTI_JOIN //we could get the result is probe table + null-column(if need output) _shared_state->empty_right_table_need_probe_dispose = - (_shared_state->build_blocks->empty() && !p._have_other_join_conjunct && - !p._is_mark_join) && + (!_shared_state->build_block && !p._have_other_join_conjunct && !p._is_mark_join) && (p._join_op == TJoinOp::LEFT_OUTER_JOIN || p._join_op == TJoinOp::FULL_OUTER_JOIN || p._join_op == TJoinOp::LEFT_ANTI_JOIN); } @@ -238,7 +214,7 @@ Status HashJoinBuildSinkLocalState::_extract_join_column( } Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, - vectorized::Block& block, uint8_t offset) { + vectorized::Block& block) { auto& p = 
_parent->cast(); SCOPED_TIMER(_build_table_timer); size_t rows = block.rows(); @@ -254,6 +230,14 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, RETURN_IF_ERROR(_do_evaluate(block, _build_expr_ctxs, *_build_expr_call_timer, res_col_ids)); if (p._join_op == TJoinOp::LEFT_OUTER_JOIN || p._join_op == TJoinOp::FULL_OUTER_JOIN) { _convert_block_to_null(block); + // first row is mocked + for (int i = 0; i < block.columns(); i++) { + auto [column, is_const] = unpack_if_const(block.safe_get_by_position(i).column); + assert_cast(column->assume_mutable().get()) + ->get_null_map_column() + .get_data() + .data()[0] = 1; + } } // TODO: Now we are not sure whether a column is nullable only by ExecNode's `row_desc` // so we have to initialize this flag by the first build block. @@ -270,29 +254,30 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, Status st = _extract_join_column(block, null_map_val, raw_ptrs, res_col_ids); st = std::visit( - Overload { - [&](std::monostate& arg, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - return Status::OK(); - }, - [&](auto&& arg, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { - using HashTableCtxType = std::decay_t; - vectorized::ProcessHashTableBuild - hash_table_build_process(rows, block, raw_ptrs, this, - state->batch_size(), offset, state); - return hash_table_build_process - .template run( - arg, - has_null_value || short_circuit_for_null_in_build_side - ? 
&null_map_val->get_data() - : nullptr, - &_shared_state->_has_null_in_build_side); - }}, - *_shared_state->hash_table_variants, + Overload {[&](std::monostate& arg, auto join_op, auto has_null_value, + auto short_circuit_for_null_in_build_side) -> Status { + LOG(FATAL) << "FATAL: uninited hash table"; + __builtin_unreachable(); + return Status::OK(); + }, + [&](auto&& arg, auto&& join_op, auto has_null_value, + auto short_circuit_for_null_in_build_side) -> Status { + using HashTableCtxType = std::decay_t; + using JoinOpType = std::decay_t; + vectorized::ProcessHashTableBuild + hash_table_build_process(rows, block, raw_ptrs, this, + state->batch_size(), state); + return hash_table_build_process + .template run( + arg, + has_null_value || short_circuit_for_null_in_build_side + ? &null_map_val->get_data() + : nullptr, + &_shared_state->_has_null_in_build_side); + }}, + *_shared_state->hash_table_variants, _shared_state->join_op_variants, vectorized::make_bool_variant(_build_side_ignore_null), vectorized::make_bool_variant(p._short_circuit_for_null_in_build_side)); @@ -384,7 +369,7 @@ void HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { } return; } - if (!try_get_hash_map_context_fixed( + if (!try_get_hash_map_context_fixed( *_shared_state->hash_table_variants, _build_expr_ctxs)) { _shared_state->hash_table_variants ->emplace>(); @@ -394,16 +379,6 @@ void HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { vectorized::make_bool_variant(p._have_other_join_conjunct)); DCHECK(!std::holds_alternative(*_shared_state->hash_table_variants)); - - std::visit(vectorized::Overload {[&](std::monostate& arg) { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - }, - [&](auto&& arg) { - arg.hash_table->set_partitioned_threshold( - state->partitioned_hash_join_rows_threshold()); - }}, - *_shared_state->hash_table_variants); } HashJoinBuildSinkOperatorX::HashJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, @@ 
-466,68 +441,41 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); - // make one block for each 4 gigabytes - constexpr static auto BUILD_BLOCK_MAX_SIZE = 4 * 1024UL * 1024UL * 1024UL; - - if (local_state._shared_state->_has_null_in_build_side) { - // TODO: if _has_null_in_build_side is true we should finish current pipeline task. - DCHECK(state->enable_pipeline_exec()); - return Status::OK(); - } if (local_state._should_build_hash_table) { // If eos or have already met a null value using short-circuit strategy, we do not need to pull // data from probe side. local_state._build_side_mem_used += in_block->allocated_bytes(); + if (local_state._build_side_mutable_block.empty()) { + auto tmp_build_block = vectorized::VectorizedUtils::create_empty_columnswithtypename( + _child_x->row_desc()); + local_state._build_side_mutable_block = + vectorized::MutableBlock::build_mutable_block(&tmp_build_block); + RETURN_IF_ERROR(local_state._build_side_mutable_block.merge( + *(tmp_build_block.create_same_struct_block(1, false)))); + } + if (in_block->rows() != 0) { SCOPED_TIMER(local_state._build_side_merge_block_timer); RETURN_IF_ERROR(local_state._build_side_mutable_block.merge(*in_block)); - } - - if (UNLIKELY(local_state._build_side_mem_used - local_state._build_side_last_mem_used > - BUILD_BLOCK_MAX_SIZE)) { - if (local_state._shared_state->build_blocks->size() == - vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT) { - return Status::NotSupported(strings::Substitute( - "data size of right table in hash join > $0", - BUILD_BLOCK_MAX_SIZE * vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT)); + if (local_state._build_side_mutable_block.rows() > + std::numeric_limits::max()) { + return Status::NotSupported( + "Hash join do not support build table rows" + " over:" + + std::to_string(std::numeric_limits::max())); } - 
local_state._shared_state->build_blocks->emplace_back( - local_state._build_side_mutable_block.to_block()); - - COUNTER_UPDATE(local_state._build_blocks_memory_usage, - (*local_state._shared_state->build_blocks)[local_state._build_block_idx] - .bytes()); - - // TODO:: Rethink may we should do the process after we receive all build blocks ? - // which is better. - RETURN_IF_ERROR(local_state.process_build_block( - state, (*local_state._shared_state->build_blocks)[local_state._build_block_idx], - local_state._build_block_idx)); - - local_state._build_side_mutable_block = vectorized::MutableBlock(); - ++local_state._build_block_idx; - local_state._build_side_last_mem_used = local_state._build_side_mem_used; } } if (local_state._should_build_hash_table && source_state == SourceState::FINISHED) { - if (!local_state._build_side_mutable_block.empty()) { - if (local_state._shared_state->build_blocks->size() == - vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT) { - return Status::NotSupported(strings::Substitute( - "data size of right table in hash join > $0", - BUILD_BLOCK_MAX_SIZE * vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT)); - } - local_state._shared_state->build_blocks->emplace_back( - local_state._build_side_mutable_block.to_block()); - COUNTER_UPDATE(local_state._build_blocks_memory_usage, - (*local_state._shared_state->build_blocks)[local_state._build_block_idx] - .bytes()); - RETURN_IF_ERROR(local_state.process_build_block( - state, (*local_state._shared_state->build_blocks)[local_state._build_block_idx], - local_state._build_block_idx)); - } + DCHECK(!local_state._build_side_mutable_block.empty()); + local_state._shared_state->build_block = std::make_shared( + local_state._build_side_mutable_block.to_block()); + COUNTER_UPDATE(local_state._build_blocks_memory_usage, + (*local_state._shared_state->build_block).bytes()); + RETURN_IF_ERROR( + local_state.process_build_block(state, (*local_state._shared_state->build_block))); auto ret = std::visit( Overload 
{[&](std::monostate&) -> Status { LOG(FATAL) << "FATAL: uninited hash table"; @@ -557,6 +505,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* local_state._runtime_filter_slots->copy_to_shared_context( _shared_hash_table_context); } + _shared_hash_table_context->block = local_state._shared_state->build_block; _shared_hashtable_controller->signal(node_id()); } } else if (!local_state._should_build_hash_table) { @@ -585,6 +534,8 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* *std::static_pointer_cast( _shared_hash_table_context->hash_table_variants)); + local_state._shared_state->build_block = _shared_hash_table_context->block; + if (!_shared_hash_table_context->runtime_filters.empty()) { auto ret = std::visit( Overload { @@ -601,7 +552,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* _build_expr_ctxs, _runtime_filter_descs); RETURN_IF_ERROR(local_state._runtime_filter_slots->init( - state, arg.hash_table->size(), 0)); + state, arg.hash_table->size())); RETURN_IF_ERROR( local_state._runtime_filter_slots->copy_from_shared_context( _shared_hash_table_context)); @@ -617,7 +568,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* if (source_state == SourceState::FINISHED) { // Since the comparison of null values is meaningless, null aware left anti join should not output null // when the build side is not empty. 
- if (!local_state._shared_state->build_blocks->empty() && + if (local_state._shared_state->build_block && _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { local_state._shared_state->probe_ignore_null = true; } diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index 34c3147b7f..b45c2eed75 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -68,11 +68,11 @@ public: ENABLE_FACTORY_CREATOR(HashJoinBuildSinkLocalState); using Parent = HashJoinBuildSinkOperatorX; HashJoinBuildSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state); - ~HashJoinBuildSinkLocalState() = default; + ~HashJoinBuildSinkLocalState() override = default; Status init(RuntimeState* state, LocalSinkStateInfo& info) override; Status open(RuntimeState* state) override; - Status process_build_block(RuntimeState* state, vectorized::Block& block, uint8_t offset); + Status process_build_block(RuntimeState* state, vectorized::Block& block); void init_short_circuit_for_probe(); @@ -108,30 +108,20 @@ protected: std::vector _runtime_filters; bool _should_build_hash_table = true; - uint8_t _build_block_idx = 0; int64_t _build_side_mem_used = 0; int64_t _build_side_last_mem_used = 0; vectorized::MutableBlock _build_side_mutable_block; std::shared_ptr _runtime_filter_slots; bool _has_set_need_null_map_for_build = false; bool _build_side_ignore_null = false; - size_t _build_rf_cardinality = 0; - std::unordered_map> _inserted_rows; + std::unordered_set _inserted_blocks; std::shared_ptr _shared_hash_table_dependency; RuntimeProfile::Counter* _build_table_timer = nullptr; RuntimeProfile::Counter* _build_expr_call_timer = nullptr; RuntimeProfile::Counter* _build_table_insert_timer = nullptr; - RuntimeProfile::Counter* _build_table_expanse_timer = nullptr; - RuntimeProfile::Counter* _build_table_convert_timer = nullptr; - RuntimeProfile::Counter* _build_buckets_counter = nullptr; - 
RuntimeProfile::Counter* _build_buckets_fill_counter = nullptr; - RuntimeProfile::Counter* _build_side_compute_hash_timer = nullptr; RuntimeProfile::Counter* _build_side_merge_block_timer = nullptr; - RuntimeProfile::Counter* _build_runtime_filter_timer = nullptr; - - RuntimeProfile::Counter* _build_collisions_counter = nullptr; RuntimeProfile::Counter* _allocate_resource_timer = nullptr; diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index 52ac0ea842..412c358037 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -89,7 +89,9 @@ Status HashJoinProbeLocalState::open(RuntimeState* state) { void HashJoinProbeLocalState::prepare_for_next() { _probe_index = 0; + _build_index = 0; _ready_probe = false; + _last_probe_match = -1; _prepare_probe_block(); } diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h b/be/src/pipeline/exec/hashjoin_probe_operator.h index 59a1057b3a..4de50474bf 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.h +++ b/be/src/pipeline/exec/hashjoin_probe_operator.h @@ -97,8 +97,8 @@ public: vectorized::DataTypes right_table_data_types(); vectorized::DataTypes left_table_data_types(); bool* has_null_in_build_side() { return &_shared_state->_has_null_in_build_side; } - std::shared_ptr> build_blocks() const { - return _shared_state->build_blocks; + const std::shared_ptr& build_block() const { + return _shared_state->build_block; } private: @@ -114,9 +114,11 @@ private: friend struct vectorized::ProcessHashTableProbe; int _probe_index = -1; + uint32_t _build_index = 0; bool _ready_probe = false; bool _probe_eos = false; std::atomic _probe_inited = false; + int _last_probe_match; vectorized::Block _probe_block; vectorized::ColumnRawPtrs _probe_columns; @@ -130,8 +132,6 @@ private: bool _need_null_map_for_probe = false; bool _has_set_need_null_map_for_probe = false; vectorized::ColumnUInt8::MutablePtr 
_null_map_column; - // for cases when a probe row matches more than batch size build rows. - bool _is_any_probe_match_row_output = false; std::unique_ptr _process_hashtable_ctx_variants = std::make_unique(); diff --git a/be/src/pipeline/exec/join_build_sink_operator.cpp b/be/src/pipeline/exec/join_build_sink_operator.cpp index fe790d9032..e2cc361c22 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.cpp +++ b/be/src/pipeline/exec/join_build_sink_operator.cpp @@ -35,10 +35,10 @@ Status JoinBuildSinkLocalState::init(RuntimeState* stat _build_rows_counter = ADD_COUNTER(PipelineXSinkLocalState::profile(), "BuildRows", TUnit::UNIT); - _push_down_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), - "PublishRuntimeFilterTime"); - _push_compute_timer = - ADD_TIMER(PipelineXSinkLocalState::profile(), "PushDownComputeTime"); + _publish_runtime_filter_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), + "PublishRuntimeFilterTime"); + _runtime_filter_compute_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), + "RuntimeFilterComputeTime"); return Status::OK(); } diff --git a/be/src/pipeline/exec/join_build_sink_operator.h b/be/src/pipeline/exec/join_build_sink_operator.h index d339c2a977..8eeb02e2af 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.h +++ b/be/src/pipeline/exec/join_build_sink_operator.h @@ -41,8 +41,8 @@ protected: friend class JoinBuildSinkOperatorX; RuntimeProfile::Counter* _build_rows_counter = nullptr; - RuntimeProfile::Counter* _push_down_timer = nullptr; - RuntimeProfile::Counter* _push_compute_timer = nullptr; + RuntimeProfile::Counter* _publish_runtime_filter_timer = nullptr; + RuntimeProfile::Counter* _runtime_filter_compute_timer = nullptr; }; template diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.h b/be/src/pipeline/exec/nested_loop_join_build_operator.h index 0097b75c0a..9d7b8821c9 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.h +++ 
b/be/src/pipeline/exec/nested_loop_join_build_operator.h @@ -67,9 +67,13 @@ public: const std::vector& runtime_filter_descs(); vectorized::VExprContextSPtrs& filter_src_expr_ctxs() { return _filter_src_expr_ctxs; } - RuntimeProfile::Counter* push_compute_timer() { return _push_compute_timer; } + RuntimeProfile::Counter* runtime_filter_compute_timer() { + return _runtime_filter_compute_timer; + } vectorized::Blocks& build_blocks() { return _shared_state->build_blocks; } - RuntimeProfile::Counter* push_down_timer() { return _push_down_timer; } + RuntimeProfile::Counter* publish_runtime_filter_timer() { + return _publish_runtime_filter_timer; + } private: friend class NestedLoopJoinBuildSinkOperatorX; diff --git a/be/src/pipeline/exec/set_sink_operator.cpp b/be/src/pipeline/exec/set_sink_operator.cpp index 31e8185720..cf8a994b1b 100644 --- a/be/src/pipeline/exec/set_sink_operator.cpp +++ b/be/src/pipeline/exec/set_sink_operator.cpp @@ -60,8 +60,7 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); auto& mem_used = local_state._shared_state->mem_used; - auto& build_blocks = local_state._shared_state->build_blocks; - auto& build_block_index = local_state._shared_state->build_block_index; + auto& build_block = local_state._shared_state->build_block; auto& valid_element_in_hash_tbl = local_state._shared_state->valid_element_in_hash_tbl; if (in_block->rows() != 0) { @@ -71,11 +70,9 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo if (source_state == SourceState::FINISHED || local_state._mutable_block.allocated_bytes() >= BUILD_BLOCK_MAX_SIZE) { - build_blocks.emplace_back(local_state._mutable_block.to_block()); - RETURN_IF_ERROR(_process_build_block(local_state, build_blocks[build_block_index], - build_block_index, state)); + build_block = local_state._mutable_block.to_block(); + RETURN_IF_ERROR(_process_build_block(local_state, build_block, state)); 
local_state._mutable_block.clear(); - ++build_block_index; if (source_state == SourceState::FINISHED) { if constexpr (is_intersect) { @@ -102,7 +99,7 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo template Status SetSinkOperatorX::_process_build_block( - SetSinkLocalState& local_state, vectorized::Block& block, uint8_t offset, + SetSinkLocalState& local_state, vectorized::Block& block, RuntimeState* state) { size_t rows = block.rows(); if (rows == 0) { @@ -118,7 +115,7 @@ Status SetSinkOperatorX::_process_build_block( using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { vectorized::HashTableBuild - hash_table_build_process(&local_state, rows, raw_ptrs, offset, state); + hash_table_build_process(&local_state, rows, raw_ptrs, state); static_cast(hash_table_build_process(arg, local_state._arena)); } else { LOG(FATAL) << "FATAL: uninited hash table"; diff --git a/be/src/pipeline/exec/set_sink_operator.h b/be/src/pipeline/exec/set_sink_operator.h index 7c4cc7d2cb..d8abe12c9a 100644 --- a/be/src/pipeline/exec/set_sink_operator.h +++ b/be/src/pipeline/exec/set_sink_operator.h @@ -132,7 +132,7 @@ private: friend struct HashTableBuild; Status _process_build_block(SetSinkLocalState& local_state, - vectorized::Block& block, uint8_t offset, RuntimeState* state); + vectorized::Block& block, RuntimeState* state); Status _extract_build_column(SetSinkLocalState& local_state, vectorized::Block& block, vectorized::ColumnRawPtrs& raw_ptrs); diff --git a/be/src/pipeline/exec/set_source_operator.cpp b/be/src/pipeline/exec/set_source_operator.cpp index 8baadf7e53..e8a73c00ad 100644 --- a/be/src/pipeline/exec/set_source_operator.cpp +++ b/be/src/pipeline/exec/set_source_operator.cpp @@ -180,12 +180,12 @@ void SetSourceOperatorX::_add_result_columns( SetSourceLocalState& local_state, vectorized::RowRefListWithFlags& value, int& block_size) { auto& build_col_idx = local_state._shared_state->build_col_idx; - auto& build_blocks = 
local_state._shared_state->build_blocks; + auto& build_block = local_state._shared_state->build_block; auto it = value.begin(); for (auto idx = build_col_idx.begin(); idx != build_col_idx.end(); ++idx) { - auto& column = *build_blocks[it->block_offset].get_by_position(idx->first).column; - if (local_state._mutable_cols[idx->second]->is_nullable() xor column.is_nullable()) { + auto& column = *build_block.get_by_position(idx->first).column; + if (local_state._mutable_cols[idx->second]->is_nullable() ^ column.is_nullable()) { DCHECK(local_state._mutable_cols[idx->second]->is_nullable()); ((vectorized::ColumnNullable*)(local_state._mutable_cols[idx->second].get())) ->insert_from_not_nullable(column, it->row_num); diff --git a/be/src/pipeline/pipeline_x/dependency.h b/be/src/pipeline/pipeline_x/dependency.h index a26b7de781..32aa1ff4cf 100644 --- a/be/src/pipeline/pipeline_x/dependency.h +++ b/be/src/pipeline/pipeline_x/dependency.h @@ -398,7 +398,7 @@ struct HashJoinSharedState : public JoinSharedState { std::make_shared(); const std::vector build_side_child_desc; size_t build_exprs_size = 0; - std::shared_ptr> build_blocks; + std::shared_ptr build_block; bool probe_ignore_null = false; }; @@ -434,8 +434,7 @@ public: /// default init //record memory during running int64_t mem_used = 0; - std::vector build_blocks; // build to source - int build_block_index = 0; // build to source + vectorized::Block build_block; // build to source //record element size in hashtable int64_t valid_element_in_hash_tbl = 0; //first:column_id, could point to origin column or cast column @@ -506,7 +505,7 @@ public: return; } - if (!try_get_hash_map_context_fixed( *hash_table_variants, child_exprs_lists[0])) { hash_table_variants->emplace< diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index 58fe0cb87e..9b49cce1c2 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -241,10 +241,17 @@ public: /// Appends a batch elements from other column with 
the same type /// indices_begin + indices_end represent the row indices of column src /// Warning: - /// if *indices == -1 means the row is null, only use in outer join, do not use in any other place + /// if *indices == -1 means the row is null virtual void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) = 0; + /// Appends a batch elements from other column with the same type + /// indices_begin + indices_end represent the row indices of column src + /// Warning: + /// if *indices == 0 means the row is null, only use in outer join, do not use in any other place + virtual void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) = 0; + /// Appends data located in specified memory chunk if it is possible (throws an exception if it cannot be implemented). /// Is used to optimize some computations (in aggregation, for example). /// Parameter length could be ignored if column values have fixed size. 
diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index fa9e048636..98fb480dd1 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -808,6 +808,17 @@ void ColumnArray::insert_indices_from(const IColumn& src, const int* indices_beg } } +void ColumnArray::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + for (auto x = indices_begin; x != indices_end; ++x) { + if (*x == 0) { + ColumnArray::insert_default(); + } else { + ColumnArray::insert_from(src, *x); + } + } +} + ColumnPtr ColumnArray::replicate(const IColumn::Offsets& replicate_offsets) const { if (replicate_offsets.empty()) return clone_empty(); diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h index 172815d765..95fd463334 100644 --- a/be/src/vec/columns/column_array.h +++ b/be/src/vec/columns/column_array.h @@ -222,6 +222,9 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + void replace_column_data(const IColumn& rhs, size_t row, size_t self_row = 0) override { DCHECK(size() > self_row); const auto& r = assert_cast(rhs); diff --git a/be/src/vec/columns/column_complex.h b/be/src/vec/columns/column_complex.h index 6c752d082b..fb89740d85 100644 --- a/be/src/vec/columns/column_complex.h +++ b/be/src/vec/columns/column_complex.h @@ -199,6 +199,21 @@ public: } } + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override { + const Self& src_vec = assert_cast(src); + auto new_size = indices_end - indices_begin; + + for (uint32_t i = 0; i < new_size; ++i) { + auto offset = *(indices_begin + i); + if (offset == 0) { + data.emplace_back(T {}); + } else { + data.emplace_back(src_vec.get_element(offset)); 
+ } + } + } + void pop_back(size_t n) override { data.erase(data.end() - n, data.end()); } // it's impossible to use ComplexType as key , so we don't have to implement them [[noreturn]] StringRef serialize_value_into_arena(size_t n, Arena& arena, diff --git a/be/src/vec/columns/column_const.h b/be/src/vec/columns/column_const.h index 307066a7ae..280d2de834 100644 --- a/be/src/vec/columns/column_const.h +++ b/be/src/vec/columns/column_const.h @@ -116,6 +116,11 @@ public: s += (indices_end - indices_begin); } + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override { + s += (indices_end - indices_begin); + } + void insert(const Field&) override { ++s; } void insert_data(const char*, size_t) override { ++s; } diff --git a/be/src/vec/columns/column_decimal.h b/be/src/vec/columns/column_decimal.h index 30c4f1116f..b61753146f 100644 --- a/be/src/vec/columns/column_decimal.h +++ b/be/src/vec/columns/column_decimal.h @@ -131,6 +131,18 @@ public: } } + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override { + auto origin_size = size(); + auto new_size = indices_end - indices_begin; + data.resize(origin_size + new_size); + const T* __restrict src_data = reinterpret_cast(src.get_raw_data().data); + + for (uint32_t i = 0; i < new_size; ++i) { + data[origin_size + i] = src_data[indices_begin[i]]; + } + } + void insert_many_fix_len_data(const char* data_ptr, size_t num) override; void insert_many_raw_data(const char* pos, size_t num) override { diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index 421c8fa2dd..d2374811e1 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -82,6 +82,11 @@ public: LOG(FATAL) << "insert_indices_from not supported in ColumnDictionary"; } + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const 
uint32_t* indices_end) override { + LOG(FATAL) << "insert_indices_from_join not supported in ColumnDictionary"; + } + void pop_back(size_t n) override { LOG(FATAL) << "pop_back not supported in ColumnDictionary"; } void update_hash_with_value(size_t n, SipHash& hash) const override { @@ -277,9 +282,7 @@ public: } uint32_t get_hash_value(uint32_t idx) const { return _dict.get_hash_value(_codes[idx], _type); } - uint32_t get_crc32_hash_value(uint32_t idx) const { - return _dict.get_crc32_hash_value(_codes[idx], _type); - } + template void find_codes(const HybridSetType* values, std::vector& selected) const { return _dict.find_codes(values, selected); @@ -378,31 +381,6 @@ public: } inline uint32_t get_hash_value(T code, FieldType type) const { - if (_compute_hash_value_flags[code]) { - return _hash_values[code]; - } else { - auto& sv = (*_dict_data)[code]; - // The char data is stored in the disk with the schema length, - // and zeros are filled if the length is insufficient - - // When reading data, use shrink_char_type_column_suffix_zero(_char_type_idx) - // Remove the suffix 0 - // When writing data, use the CharField::consume function to fill in the trailing 0. - - // For dictionary data of char type, sv.size is the schema length, - // so use strnlen to remove the 0 at the end to get the actual length. 
- int32_t len = sv.size; - if (type == FieldType::OLAP_FIELD_TYPE_CHAR) { - len = strnlen(sv.data, sv.size); - } - uint32_t hash_val = HashUtil::murmur_hash3_32(sv.data, len, 0); - _hash_values[code] = hash_val; - _compute_hash_value_flags[code] = 1; - return _hash_values[code]; - } - } - - inline uint32_t get_crc32_hash_value(T code, FieldType type) const { if (_compute_hash_value_flags[code]) { return _hash_values[code]; } else { diff --git a/be/src/vec/columns/column_fixed_length_object.h b/be/src/vec/columns/column_fixed_length_object.h index 5b9733748d..e6b1db8bf5 100644 --- a/be/src/vec/columns/column_fixed_length_object.h +++ b/be/src/vec/columns/column_fixed_length_object.h @@ -103,6 +103,28 @@ public: } } + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override { + const Self& src_vec = assert_cast(src); + auto origin_size = size(); + auto new_size = indices_end - indices_begin; + if (_item_size == 0) { + _item_size = src_vec._item_size; + } + DCHECK(_item_size == src_vec._item_size) << "dst and src should have the same _item_size"; + resize(origin_size + new_size); + + for (uint32_t i = 0; i < new_size; ++i) { + auto offset = indices_begin[i]; + if (offset) { + memcpy(&_data[(origin_size + i) * _item_size], &src_vec._data[offset * _item_size], + _item_size); + } else { + memset(&_data[(origin_size + i) * _item_size], 0, _item_size); + } + } + } + void clear() override { _data.clear(); _item_count = 0; diff --git a/be/src/vec/columns/column_map.cpp b/be/src/vec/columns/column_map.cpp index f3fa5ab6c2..82e8c0a911 100644 --- a/be/src/vec/columns/column_map.cpp +++ b/be/src/vec/columns/column_map.cpp @@ -196,6 +196,17 @@ void ColumnMap::insert_indices_from(const IColumn& src, const int* indices_begin } } +void ColumnMap::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + for (auto x = indices_begin; x != indices_end; ++x) { + if (*x == 
0) { + ColumnMap::insert_default(); + } else { + ColumnMap::insert_from(src, *x); + } + } +} + StringRef ColumnMap::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const { size_t array_size = size_at(n); size_t offset = offset_at(n); diff --git a/be/src/vec/columns/column_map.h b/be/src/vec/columns/column_map.h index f6fd313208..1cb3dd0c73 100644 --- a/be/src/vec/columns/column_map.h +++ b/be/src/vec/columns/column_map.h @@ -130,6 +130,9 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + void append_data_by_selector(MutableColumnPtr& res, const IColumn::Selector& selector) const override { return append_data_by_selector_impl(res, selector); diff --git a/be/src/vec/columns/column_nothing.h b/be/src/vec/columns/column_nothing.h index 8a10eec8b6..8874bb6e7a 100644 --- a/be/src/vec/columns/column_nothing.h +++ b/be/src/vec/columns/column_nothing.h @@ -39,6 +39,11 @@ public: bool structure_equals(const IColumn& rhs) const override { return typeid(rhs) == typeid(ColumnNothing); } + + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override { + LOG(FATAL) << "insert_indices_from_join not supported in ColumnNothing"; + } }; } // namespace doris::vectorized diff --git a/be/src/vec/columns/column_nullable.cpp b/be/src/vec/columns/column_nullable.cpp index 4f25a3f4b1..3553e9823d 100644 --- a/be/src/vec/columns/column_nullable.cpp +++ b/be/src/vec/columns/column_nullable.cpp @@ -304,6 +304,16 @@ void ColumnNullable::insert_indices_from(const IColumn& src, const int* indices_ _need_update_has_null = true; } +void ColumnNullable::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + const auto& src_concrete = assert_cast(src); + 
get_nested_column().insert_indices_from_join(src_concrete.get_nested_column(), indices_begin, + indices_end); + _get_null_map_column().insert_indices_from_join(src_concrete.get_null_map_column(), + indices_begin, indices_end); + _need_update_has_null = true; +} + void ColumnNullable::insert(const Field& x) { if (x.is_null()) { get_nested_column().insert_default(); diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 8a45c51d23..365400a669 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -123,6 +123,9 @@ public: void insert_range_from(const IColumn& src, size_t start, size_t length) override; void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + void insert(const Field& x) override; void insert_from(const IColumn& src, size_t n) override; diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index cf11b47947..fb793abe75 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -1464,4 +1464,15 @@ void ColumnObject::insert_indices_from(const IColumn& src, const int* indices_be }); } +void ColumnObject::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + // insert_indices_from with alignment + const ColumnObject& src_column = *check_and_get_column(src); + align_variant_by_name_and_type(*this, src_column, indices_end - indices_begin, + [indices_begin, indices_end](const IColumn& src, IColumn* dst) { + dst->insert_indices_from_join(src, indices_begin, + indices_end); + }); +} + } // namespace doris::vectorized diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h index cade1342b6..c279042251 100644 --- a/be/src/vec/columns/column_object.h +++ 
b/be/src/vec/columns/column_object.h @@ -361,6 +361,9 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + // May throw execption void try_insert(const Field& field); diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index d6a3a51499..2d009e2a08 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -161,6 +161,43 @@ void ColumnString::insert_indices_from(const IColumn& src, const int* indices_be } } +void ColumnString::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + const ColumnString& src_str = assert_cast(src); + auto src_offset_data = src_str.offsets.data(); + + auto old_char_size = chars.size(); + size_t total_chars_size = old_char_size; + + auto dst_offsets_pos = offsets.size(); + offsets.resize(offsets.size() + indices_end - indices_begin); + auto* dst_offsets_data = offsets.data(); + + for (auto x = indices_begin; x != indices_end; ++x) { + if (*x != 0) { + total_chars_size += src_offset_data[*x] - src_offset_data[*x - 1]; + } + dst_offsets_data[dst_offsets_pos++] = total_chars_size; + } + check_chars_length(total_chars_size, offsets.size()); + + chars.resize(total_chars_size); + + auto* src_data_ptr = src_str.chars.data(); + auto* dst_data_ptr = chars.data(); + + size_t dst_chars_pos = old_char_size; + for (auto x = indices_begin; x != indices_end; ++x) { + if (*x != 0) { + const size_t size_to_append = src_offset_data[*x] - src_offset_data[*x - 1]; + const size_t offset = src_offset_data[*x - 1]; + memcpy_small_allow_read_write_overflow15(dst_data_ptr + dst_chars_pos, + src_data_ptr + offset, size_to_append); + dst_chars_pos += size_to_append; + } + } +} + void ColumnString::update_crcs_with_value(uint32_t* __restrict hashes, 
doris::PrimitiveType type, uint32_t rows, uint32_t offset, const uint8_t* __restrict null_data) const { diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index e310817020..191c6a95cf 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -487,6 +487,9 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const override; size_t filter(const Filter& filter) override; diff --git a/be/src/vec/columns/column_struct.cpp b/be/src/vec/columns/column_struct.cpp index 93c6213949..3502fdf581 100644 --- a/be/src/vec/columns/column_struct.cpp +++ b/be/src/vec/columns/column_struct.cpp @@ -233,6 +233,15 @@ void ColumnStruct::insert_indices_from(const IColumn& src, const int* indices_be } } +void ColumnStruct::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + const ColumnStruct& src_concrete = assert_cast(src); + for (size_t i = 0; i < columns.size(); ++i) { + columns[i]->insert_indices_from_join(src_concrete.get_column(i), indices_begin, + indices_end); + } +} + void ColumnStruct::insert_range_from(const IColumn& src, size_t start, size_t length) { const size_t tuple_size = columns.size(); for (size_t i = 0; i < tuple_size; ++i) { diff --git a/be/src/vec/columns/column_struct.h b/be/src/vec/columns/column_struct.h index 919b971b5a..499fb8444f 100644 --- a/be/src/vec/columns/column_struct.h +++ b/be/src/vec/columns/column_struct.h @@ -124,6 +124,9 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + void 
get_permutation(bool reverse, size_t limit, int nan_direction_hint, Permutation& res) const override { LOG(FATAL) << "get_permutation not implemented"; diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp index 744e74b484..a825e07d5f 100644 --- a/be/src/vec/columns/column_vector.cpp +++ b/be/src/vec/columns/column_vector.cpp @@ -388,6 +388,20 @@ void ColumnVector::insert_indices_from(const IColumn& src, const int* indices } } +template +void ColumnVector::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + auto origin_size = size(); + auto new_size = indices_end - indices_begin; + data.resize(origin_size + new_size); + + const T* __restrict src_data = reinterpret_cast(src.get_raw_data().data); + + for (uint32_t i = 0; i < new_size; ++i) { + data[origin_size + i] = src_data[indices_begin[i]]; + } +} + template ColumnPtr ColumnVector::filter(const IColumn::Filter& filt, ssize_t result_size_hint) const { size_t size = data.size(); diff --git a/be/src/vec/columns/column_vector.h b/be/src/vec/columns/column_vector.h index 77df238d2a..cb1edddb52 100644 --- a/be/src/vec/columns/column_vector.h +++ b/be/src/vec/columns/column_vector.h @@ -389,6 +389,8 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; void fill(const value_type& element, size_t num) { auto old_size = data.size(); auto new_size = old_size + num; diff --git a/be/src/vec/columns/predicate_column.h b/be/src/vec/columns/predicate_column.h index 1448c3d4bd..79f445b08d 100644 --- a/be/src/vec/columns/predicate_column.h +++ b/be/src/vec/columns/predicate_column.h @@ -131,6 +131,11 @@ public: LOG(FATAL) << "insert_indices_from not supported in PredicateColumnType"; } + void insert_indices_from_join(const IColumn& src, const uint32_t* 
indices_begin, + const uint32_t* indices_end) override { + LOG(FATAL) << "insert_indices_from_join not supported in PredicateColumnType"; + } + void pop_back(size_t n) override { LOG(FATAL) << "pop_back not supported in PredicateColumnType"; } diff --git a/be/src/vec/common/hash_table/hash_map.h b/be/src/vec/common/hash_table/hash_map.h index 5b7cd6f464..6efbdbb3e9 100644 --- a/be/src/vec/common/hash_table/hash_map.h +++ b/be/src/vec/common/hash_table/hash_map.h @@ -20,9 +20,14 @@ #pragma once +#include + +#include "common/compiler_util.h" +#include "vec/columns/column_filter_helper.h" #include "vec/common/hash_table/hash.h" #include "vec/common/hash_table/hash_table.h" #include "vec/common/hash_table/hash_table_allocator.h" + /** NOTE HashMap could only be used for memmoveable (position independent) types. * Example: std::string is not position independent in libstdc++ with C++11 ABI or in libc++. * Also, key in hash table must be of type, that zero bytes is compared equals to zero key. 
@@ -193,10 +198,349 @@ public: bool has_null_key_data() const { return false; } }; +template , + typename Grower = HashTableGrower<>, typename Allocator = HashTableAllocator> +class JoinHashMapTable : public HashMapTable { +public: + using Self = JoinHashMapTable; + using Base = HashMapTable; + + using key_type = Key; + using value_type = typename Cell::value_type; + using mapped_type = typename Cell::Mapped; + + using LookupResult = typename Base::LookupResult; + + using HashMapTable::HashMapTable; + + static uint32_t calc_bucket_size(size_t num_elem) { + size_t expect_bucket_size = num_elem + (num_elem - 1) / 7; + return phmap::priv::NormalizeCapacity(expect_bucket_size) + 1; + } + + size_t get_byte_size() const { + auto cal_vector_mem = [](const auto& vec) { return vec.capacity() * sizeof(vec[0]); }; + return cal_vector_mem(visited) + cal_vector_mem(first) + cal_vector_mem(next); + } + + template + void prepare_build(size_t num_elem, int batch_size, bool has_null_key) { + _has_null_key = has_null_key; + max_batch_size = batch_size; + bucket_size = calc_bucket_size(num_elem + 1); + first.resize(bucket_size + 1); + next.resize(num_elem); + + if constexpr (JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_OUTER_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_SEMI_JOIN) { + visited.resize(num_elem); + } + } + + uint32_t get_bucket_size() const { return bucket_size; } + + size_t size() const { return Base::size() == 0 ? 
next.size() : Base::size(); } + + std::vector& get_visited() { return visited; } + + void build(const Key* __restrict keys, const uint32_t* __restrict bucket_nums, + size_t num_elem) { + build_keys = keys; + for (size_t i = 1; i < num_elem; i++) { + uint32_t bucket_num = bucket_nums[i]; + next[i] = first[bucket_num]; + first[bucket_num] = i; + } + first[bucket_size] = 0; // index = bucket_num means null + } + + template + auto find_batch(const Key* __restrict keys, const uint32_t* __restrict build_idx_map, + int probe_idx, uint32_t build_idx, int probe_rows, + uint32_t* __restrict probe_idxs, bool& probe_visited, + uint32_t* __restrict build_idxs, + doris::vectorized::ColumnFilterHelper* mark_column) { + if constexpr (is_mark_join) { + return _find_batch_mark( + keys, build_idx_map, probe_idx, probe_rows, probe_idxs, build_idxs, + mark_column); + } + + if constexpr (with_other_conjuncts) { + return _find_batch_conjunct(keys, build_idx_map, probe_idx, build_idx, + probe_rows, probe_idxs, build_idxs); + } + + if constexpr (JoinOpType == doris::TJoinOp::INNER_JOIN || + JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN || + JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_OUTER_JOIN) { + return _find_batch_inner_outer_join(keys, build_idx_map, probe_idx, + build_idx, probe_rows, probe_idxs, + probe_visited, build_idxs); + } + if constexpr (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN || + JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + return _find_batch_left_semi_anti( + keys, build_idx_map, probe_idx, probe_rows, probe_idxs); + } + if constexpr (JoinOpType == doris::TJoinOp::RIGHT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_SEMI_JOIN) { + return _find_batch_right_semi_anti(keys, build_idx_map, probe_idx, probe_rows); + } + return std::tuple {0, 0U, 0}; + } + + template + bool iterate_map(std::vector& build_idxs) const { + const auto batch_size = max_batch_size; + 
const auto elem_num = visited.size(); + int count = 0; + build_idxs.resize(batch_size); + + while (count < batch_size && iter_idx < elem_num) { + const auto matched = visited[iter_idx]; + build_idxs[count] = iter_idx; + if constexpr (JoinOpType != doris::TJoinOp::RIGHT_SEMI_JOIN) { + count += !matched; + } else { + count += matched; + } + iter_idx++; + } + + build_idxs.resize(count); + return iter_idx >= elem_num; + } + + bool has_null_key() { return _has_null_key; } + + void pre_build_idxs(std::vector& bucksets, const uint8_t* null_map) { + if (null_map) { + first[bucket_size] = bucket_size; // distinguish between not matched and null + } + + for (uint32_t i = 0; i < bucksets.size(); i++) { + bucksets[i] = first[bucksets[i]]; + } + } + +private: + // only LEFT_ANTI_JOIN/LEFT_SEMI_JOIN/NULL_AWARE_LEFT_ANTI_JOIN/CROSS_JOIN support mark join + template + auto _find_batch_mark(const Key* __restrict keys, const uint32_t* __restrict build_idx_map, + int probe_idx, int probe_rows, uint32_t* __restrict probe_idxs, + uint32_t* __restrict build_idxs, + doris::vectorized::ColumnFilterHelper* mark_column) { + auto matched_cnt = 0; + const auto batch_size = max_batch_size; + + while (probe_idx < probe_rows && matched_cnt < batch_size) { + auto build_idx = build_idx_map[probe_idx] == bucket_size ? 0 : build_idx_map[probe_idx]; + + while (build_idx && keys[probe_idx] != build_keys[build_idx]) { + build_idx = next[build_idx]; + } + + if constexpr (!with_other_conjuncts) { + if (build_idx_map[probe_idx] == bucket_size) { + // mark result as null when probe row is null + mark_column->insert_null(); + } else { + bool matched = JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN ? 
build_idx != 0 + : build_idx == 0; + if (!matched && _has_null_key) { + mark_column->insert_null(); + } else { + mark_column->insert_value(matched); + } + } + } + + probe_idxs[matched_cnt] = probe_idx++; + build_idxs[matched_cnt] = build_idx; + matched_cnt++; + } + return std::tuple {probe_idx, 0U, matched_cnt}; + } + + auto _find_batch_right_semi_anti(const Key* __restrict keys, + const uint32_t* __restrict build_idx_map, int probe_idx, + int probe_rows) { + while (probe_idx < probe_rows) { + auto build_idx = build_idx_map[probe_idx]; + + while (build_idx) { + if (!visited[build_idx] && keys[probe_idx] == build_keys[build_idx]) { + visited[build_idx] = 1; + } + build_idx = next[build_idx]; + } + probe_idx++; + } + return std::tuple {probe_idx, 0U, 0}; + } + + template + auto _find_batch_left_semi_anti(const Key* __restrict keys, + const uint32_t* __restrict build_idx_map, int probe_idx, + int probe_rows, uint32_t* __restrict probe_idxs) { + auto matched_cnt = 0; + const auto batch_size = max_batch_size; + + while (probe_idx < probe_rows && matched_cnt < batch_size) { + if constexpr (need_judge_null) { + if (build_idx_map[probe_idx] == bucket_size) { + probe_idx++; + continue; + } + } + + auto build_idx = build_idx_map[probe_idx]; + + while (build_idx && keys[probe_idx] != build_keys[build_idx]) { + build_idx = next[build_idx]; + } + bool matched = + JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN ? 
build_idx != 0 : build_idx == 0; + probe_idxs[matched_cnt] = probe_idx++; + matched_cnt += matched; + } + return std::tuple {probe_idx, 0U, matched_cnt}; + } + + template + auto _find_batch_conjunct(const Key* __restrict keys, const uint32_t* __restrict build_idx_map, + int probe_idx, uint32_t build_idx, int probe_rows, + uint32_t* __restrict probe_idxs, uint32_t* __restrict build_idxs) { + auto matched_cnt = 0; + const auto batch_size = max_batch_size; + + auto do_the_probe = [&]() { + while (build_idx && matched_cnt < batch_size) { + if constexpr (JoinOpType == doris::TJoinOp::RIGHT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_SEMI_JOIN) { + if (!visited[build_idx] && keys[probe_idx] == build_keys[build_idx]) { + probe_idxs[matched_cnt] = probe_idx; + build_idxs[matched_cnt] = build_idx; + matched_cnt++; + } + } else if (keys[probe_idx] == build_keys[build_idx]) { + build_idxs[matched_cnt] = build_idx; + probe_idxs[matched_cnt] = probe_idx; + matched_cnt++; + } + build_idx = next[build_idx]; + } + + if constexpr (JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN || + JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN || + JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + // may over batch_size when emplace 0 into build_idxs + if (!build_idx) { + probe_idxs[matched_cnt] = probe_idx; + build_idxs[matched_cnt] = 0; + matched_cnt++; + } + } + + probe_idx++; + }; + + if (build_idx) { + do_the_probe(); + } + + while (probe_idx < probe_rows && matched_cnt < batch_size) { + build_idx = build_idx_map[probe_idx]; + do_the_probe(); + } + + probe_idx -= (build_idx != 0); + return std::tuple {probe_idx, build_idx, matched_cnt}; + } + + template + auto _find_batch_inner_outer_join(const Key* __restrict keys, + const uint32_t* __restrict build_idx_map, int probe_idx, + uint32_t build_idx, int probe_rows, + uint32_t* __restrict probe_idxs, bool& probe_visited, + uint32_t* __restrict build_idxs) { + auto matched_cnt = 0; + 
const auto batch_size = max_batch_size; + + auto do_the_probe = [&]() { + while (build_idx && matched_cnt < batch_size) { + if (keys[probe_idx] == build_keys[build_idx]) { + probe_idxs[matched_cnt] = probe_idx; + build_idxs[matched_cnt] = build_idx; + matched_cnt++; + if constexpr (JoinOpType == doris::TJoinOp::RIGHT_OUTER_JOIN || + JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN) { + if (!visited[build_idx]) { + visited[build_idx] = 1; + } + } + } + build_idx = next[build_idx]; + } + + if constexpr (JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN || + JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN) { + // `(!matched_cnt || probe_idxs[matched_cnt - 1] != probe_idx)` means not match one build side + probe_visited |= (matched_cnt && probe_idxs[matched_cnt - 1] == probe_idx); + if (!build_idx) { + if (!probe_visited) { + probe_idxs[matched_cnt] = probe_idx; + build_idxs[matched_cnt] = 0; + matched_cnt++; + } + probe_visited = false; + } + } + probe_idx++; + }; + + if (build_idx) { + do_the_probe(); + } + + while (probe_idx < probe_rows && matched_cnt < batch_size) { + build_idx = build_idx_map[probe_idx]; + do_the_probe(); + } + + probe_idx -= (build_idx != 0); + return std::tuple {probe_idx, build_idx, matched_cnt}; + } + + const Key* __restrict build_keys; + std::vector visited; + + uint32_t bucket_size = 1; + int max_batch_size = 4064; + + std::vector first = {0}; + std::vector next = {0}; + + // use in iter hash map + mutable uint32_t iter_idx = 1; + Cell cell; + doris::vectorized::Arena* pool; + bool _has_null_key = false; +}; + template , typename Grower = HashTableGrower<>, typename Allocator = HashTableAllocator> using HashMap = HashMapTable, Hash, Grower, Allocator>; +template > +using JoinFixedHashMap = JoinHashMapTable, Hash>; + template , typename Grower = HashTableGrower<>, typename Allocator = HashTableAllocator> using HashMapWithSavedHash = diff --git a/be/src/vec/common/hash_table/hash_map_context.h b/be/src/vec/common/hash_table/hash_map_context.h index 
32c0d0a31b..031af96e79 100644 --- a/be/src/vec/common/hash_table/hash_map_context.h +++ b/be/src/vec/common/hash_table/hash_map_context.h @@ -55,6 +55,9 @@ struct MethodBase { Arena arena; std::vector hash_values; + // use in join case + std::vector bucket_nums; + MethodBase() { hash_table.reset(new HashMap()); } virtual ~MethodBase() = default; @@ -69,8 +72,29 @@ struct MethodBase { iterator = hash_table->begin(); } } + virtual void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr) = 0; + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = false, uint32_t bucket_size = 0) = 0; + + void init_join_bucket_num(uint32_t num_rows, uint32_t bucket_size, const uint8_t* null_map) { + bucket_nums.resize(num_rows); + + if (null_map == nullptr) { + init_join_bucket_num(num_rows, bucket_size); + return; + } + for (uint32_t k = 0; k < num_rows; ++k) { + bucket_nums[k] = + null_map[k] ? bucket_size : hash_table->hash(keys[k]) & (bucket_size - 1); + } + } + + void init_join_bucket_num(uint32_t num_rows, uint32_t bucket_size) { + for (uint32_t k = 0; k < num_rows; ++k) { + bucket_nums[k] = hash_table->hash(keys[k]) & (bucket_size - 1); + } + } void init_hash_values(size_t num_rows, const uint8_t* null_map) { if (null_map == nullptr) { @@ -148,7 +172,10 @@ struct MethodSerialized : public MethodBase { using Base::init_iterator; using State = ColumnsHashing::HashMethodSerialized; using Base::try_presis_key; - + // need keep until the hash probe end. 
+ std::vector build_stored_keys; + Arena build_arena; + // refresh each time probe std::vector stored_keys; StringRef serialize_keys_to_pool_contiguous(size_t i, size_t keys_size, @@ -163,40 +190,48 @@ struct MethodSerialized : public MethodBase { return {begin, sum_size}; } - void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr) override { - Base::arena.clear(); - stored_keys.resize(num_rows); + void init_serialized_keys_impl(const ColumnRawPtrs& key_columns, size_t num_rows, + std::vector& keys, Arena& arena) { + arena.clear(); + keys.resize(num_rows); size_t max_one_row_byte_size = 0; for (const auto& column : key_columns) { max_one_row_byte_size += column->get_max_row_byte_size(); } size_t total_bytes = max_one_row_byte_size * num_rows; - if (total_bytes > config::pre_serialize_keys_limit_bytes) { // reach mem limit, don't serialize in batch size_t keys_size = key_columns.size(); for (size_t i = 0; i < num_rows; ++i) { - stored_keys[i] = - serialize_keys_to_pool_contiguous(i, keys_size, key_columns, Base::arena); + keys[i] = serialize_keys_to_pool_contiguous(i, keys_size, key_columns, arena); } } else { - auto* serialized_key_buffer = - reinterpret_cast(Base::arena.alloc(total_bytes)); + auto* serialized_key_buffer = reinterpret_cast(arena.alloc(total_bytes)); for (size_t i = 0; i < num_rows; ++i) { - stored_keys[i].data = + keys[i].data = reinterpret_cast(serialized_key_buffer + i * max_one_row_byte_size); - stored_keys[i].size = 0; + keys[i].size = 0; } for (const auto& column : key_columns) { - column->serialize_vec(stored_keys, num_rows, max_one_row_byte_size); + column->serialize_vec(keys, num_rows, max_one_row_byte_size); } } - Base::keys = stored_keys.data(); - Base::init_hash_values(num_rows, null_map); + Base::keys = keys.data(); + } + + void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = 
false, uint32_t bucket_size = 0) override { + init_serialized_keys_impl(key_columns, num_rows, is_build ? build_stored_keys : stored_keys, + is_build ? build_arena : Base::arena); + if (is_join) { + Base::init_join_bucket_num(num_rows, bucket_size, null_map); + } else { + Base::init_hash_values(num_rows, null_map); + } } void insert_keys_into_columns(std::vector& keys, MutableColumns& key_columns, @@ -222,7 +257,8 @@ struct MethodStringNoCache : public MethodBase { std::vector stored_keys; void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr) override { + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = false, uint32_t bucket_size = 0) override { const IColumn& column = *key_columns[0]; const auto& column_string = assert_cast( column.is_nullable() @@ -237,7 +273,11 @@ struct MethodStringNoCache : public MethodBase { } Base::keys = stored_keys.data(); - Base::init_hash_values(num_rows, null_map); + if (is_join) { + Base::init_join_bucket_num(num_rows, bucket_size, null_map); + } else { + Base::init_hash_values(num_rows, null_map); + } } void insert_keys_into_columns(std::vector& keys, MutableColumns& key_columns, @@ -258,15 +298,20 @@ struct MethodOneNumber : public MethodBase { FieldType>; void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr) override { + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = false, uint32_t bucket_size = 0) override { Base::keys = (FieldType*)(key_columns[0]->is_nullable() ? 
assert_cast(key_columns[0]) ->get_nested_column_ptr() - : key_columns[0]) - ->get_raw_data() - .data; + ->get_raw_data() + .data + : key_columns[0]->get_raw_data().data); std::string name = key_columns[0]->get_name(); - Base::init_hash_values(num_rows, null_map); + if (is_join) { + Base::init_join_bucket_num(num_rows, bucket_size, null_map); + } else { + Base::init_hash_values(num_rows, null_map); + } } void insert_keys_into_columns(std::vector& keys, @@ -292,17 +337,22 @@ struct MethodKeysFixed : public MethodBase { using State = ColumnsHashing::HashMethodKeysFixed; + // need keep until the hash probe end. use only in join + std::vector build_stored_keys; + // refresh each time probe hash table std::vector stored_keys; Sizes key_sizes; MethodKeysFixed(Sizes key_sizes_) : key_sizes(std::move(key_sizes_)) {} template - std::vector pack_fixeds(size_t row_numbers, const ColumnRawPtrs& key_columns, - const ColumnRawPtrs& nullmap_columns) { + void pack_fixeds(size_t row_numbers, const ColumnRawPtrs& key_columns, + const ColumnRawPtrs& nullmap_columns, std::vector& result) { size_t bitmap_size = get_bitmap_size(nullmap_columns.size()); + // set size to 0 at first, then use resize to call default constructor on index included from [0, row_numbers) to reset all memory + result.clear(); + result.resize(row_numbers); - std::vector result(row_numbers); size_t offset = 0; if (bitmap_size > 0) { for (size_t j = 0; j < nullmap_columns.size(); j++) { @@ -356,11 +406,11 @@ struct MethodKeysFixed : public MethodBase { } offset += key_sizes[j]; } - return result; } void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr) override { + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = false, uint32_t bucket_size = 0) override { ColumnRawPtrs actual_columns; ColumnRawPtrs null_maps; if (has_nullable_keys) { @@ -378,9 +428,20 @@ struct MethodKeysFixed : public MethodBase { } else { actual_columns = 
key_columns; } - stored_keys = pack_fixeds(num_rows, actual_columns, null_maps); - Base::keys = stored_keys.data(); - Base::init_hash_values(num_rows, null_map); + + if (is_build) { + pack_fixeds(num_rows, actual_columns, null_maps, build_stored_keys); + Base::keys = build_stored_keys.data(); + } else { + pack_fixeds(num_rows, actual_columns, null_maps, stored_keys); + Base::keys = stored_keys.data(); + } + + if (is_join) { + Base::init_join_bucket_num(num_rows, bucket_size, null_map); + } else { + Base::init_hash_values(num_rows, null_map); + } } void insert_keys_into_columns(std::vector& keys, @@ -488,14 +549,14 @@ struct MethodSingleNullableColumn : public SingleColumnMethod { #endif template -using SerializedHashTableContext = MethodSerialized>; +using SerializedHashTableContext = MethodSerialized>; template using PrimaryTypeHashTableContext = - MethodOneNumber>>; + MethodOneNumber>>; template using FixedKeyHashTableContext = - MethodKeysFixed>, has_null>; + MethodKeysFixed>, has_null>; } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/common/hash_table/hash_table.h b/be/src/vec/common/hash_table/hash_table.h index 20c8c8e457..fcc682a49c 100644 --- a/be/src/vec/common/hash_table/hash_table.h +++ b/be/src/vec/common/hash_table/hash_table.h @@ -441,7 +441,6 @@ protected: Cell* buf = nullptr; /// A piece of memory for all elements except the element with zero key. 
Grower grower; int64_t _resize_timer_ns; - // the bucket count threshold above which it's converted to partioned hash table // > 0: enable convert dynamically // 0: convert is disabled diff --git a/be/src/vec/common/hash_table/hash_table_set_build.h b/be/src/vec/common/hash_table/hash_table_set_build.h index 34fb691f9e..152b20eeef 100644 --- a/be/src/vec/common/hash_table/hash_table_set_build.h +++ b/be/src/vec/common/hash_table/hash_table_set_build.h @@ -24,11 +24,9 @@ namespace doris::vectorized { template struct HashTableBuild { template - HashTableBuild(Parent* parent, int rows, ColumnRawPtrs& build_raw_ptrs, uint8_t offset, - RuntimeState* state) + HashTableBuild(Parent* parent, int rows, ColumnRawPtrs& build_raw_ptrs, RuntimeState* state) : _mem_used(parent->mem_used()), _rows(rows), - _offset(offset), _build_raw_ptrs(build_raw_ptrs), _state(state) {} @@ -48,9 +46,9 @@ struct HashTableBuild { size_t k = 0; auto creator = [&](const auto& ctor, auto& key, auto& origin) { HashTableContext::try_presis_key(key, origin, arena); - ctor(key, Mapped {k, _offset}); + ctor(key, Mapped {k}); }; - auto creator_for_null_key = [&](auto& mapped) { mapped = {k, _offset}; }; + auto creator_for_null_key = [&](auto& mapped) { mapped = {k}; }; for (; k < _rows; ++k) { if (k % CHECK_FRECUENCY == 0) { @@ -64,7 +62,6 @@ struct HashTableBuild { private: int64_t* _mem_used = nullptr; const int _rows; - const uint8_t _offset; ColumnRawPtrs& _build_raw_ptrs; RuntimeState* _state = nullptr; }; diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index 40222ad1f9..723dc3ac63 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -265,7 +265,7 @@ void Block::erase(const String& name) { ColumnWithTypeAndName& Block::safe_get_by_position(size_t position) { if (position >= data.size()) { throw Exception(ErrorCode::INTERNAL_ERROR, - "invalid input position, position={}, data.size{}, names={}", position, + "invalid input position, position={}, data.size={}, 
names={}", position, data.size(), dump_names()); } return data[position]; @@ -274,7 +274,7 @@ ColumnWithTypeAndName& Block::safe_get_by_position(size_t position) { const ColumnWithTypeAndName& Block::safe_get_by_position(size_t position) const { if (position >= data.size()) { throw Exception(ErrorCode::INTERNAL_ERROR, - "invalid input position, position={}, data.size{}, names={}", position, + "invalid input position, position={}, data.size={}, names={}", position, data.size(), dump_names()); } return data[position]; @@ -338,8 +338,9 @@ void Block::check_number_of_rows(bool allow_null_columns) const { } if (!elem.column) { - LOG(FATAL) << fmt::format( - "Column {} in block is nullptr, in method check_number_of_rows.", elem.name); + throw Exception(ErrorCode::INTERNAL_ERROR, + "Column {} in block is nullptr, in method check_number_of_rows.", + elem.name); } ssize_t size = elem.column->size(); @@ -347,8 +348,8 @@ void Block::check_number_of_rows(bool allow_null_columns) const { if (rows == -1) { rows = size; } else if (rows != size) { - LOG(FATAL) << fmt::format("Sizes of columns doesn't match: {}:{},{}:{}, col size: {}", - data.front().name, rows, elem.name, size, each_col_size()); + throw Exception(ErrorCode::INTERNAL_ERROR, "Sizes of columns doesn't match, block={}", + dump_structure()); } } } @@ -1088,7 +1089,7 @@ std::unique_ptr Block::create_same_struct_block(size_t size, bool is_rese if (is_reserve) { column->reserve(size); } else { - column->resize(size); + column->insert_many_defaults(size); } temp_block->insert({std::move(column), d.type, d.name}); } diff --git a/be/src/vec/exec/join/join_op.h b/be/src/vec/exec/join/join_op.h index 8b8efe7389..62569270d9 100644 --- a/be/src/vec/exec/join/join_op.h +++ b/be/src/vec/exec/join/join_op.h @@ -18,7 +18,6 @@ #pragma once #include "vec/common/arena.h" #include "vec/common/columns_hashing.h" -#include "vec/common/hash_table/hash_map.h" #include "vec/core/block.h" namespace doris::vectorized { @@ -45,19 +44,19 @@ 
namespace doris::vectorized { */ struct RowRef { uint32_t row_num = 0; - uint8_t block_offset; RowRef() = default; - RowRef(size_t row_num_count, uint8_t block_offset_) - : row_num(row_num_count), block_offset(block_offset_) {} + RowRef(size_t row_num_count) : row_num(row_num_count) {} + void clear() {}; }; struct RowRefWithFlag : public RowRef { bool visited; RowRefWithFlag() = default; - RowRefWithFlag(size_t row_num_count, uint8_t block_offset_, bool is_visited = false) - : RowRef(row_num_count, block_offset_), visited(is_visited) {} + RowRefWithFlag(size_t row_num_count, bool is_visited = false) + : RowRef(row_num_count), visited(is_visited) {} + void clear() {}; }; /// Portion of RowRefs, 16 * (MAX_SIZE + 1) bytes sized. @@ -93,14 +92,15 @@ public: ForwardIterator() : root(nullptr), first(false), batch(nullptr), position(0) {} ForwardIterator(RowRefListType* begin) - : root(begin), first(true), batch(root->next), position(0) {} + : root(begin), first(true), batch((&root->next)), position(0) {} RowRefType& operator*() { if (first) { return *root; } - return batch->row_refs[position]; + return batch->operator[](position); } + RowRefType* operator->() { return &(**this); } void operator++() { @@ -109,21 +109,17 @@ public: return; } - if (batch) { + if (batch && position < batch->size()) { ++position; - if (position >= batch->size) { - batch = batch->next; - position = 0; - } } } - bool ok() const { return first || batch; } + bool ok() const { return first || (batch && position < batch->size()); } private: RowRefListType* root = nullptr; bool first; - Batch* batch = nullptr; + std::vector* batch = nullptr; size_t position; }; @@ -131,76 +127,60 @@ struct RowRefList : RowRef { using RowRefType = RowRef; RowRefList() = default; - RowRefList(size_t row_num_, uint8_t block_offset_) : RowRef(row_num_, block_offset_) {} + RowRefList(size_t row_num_) : RowRef(row_num_) {} ForwardIterator begin() { return ForwardIterator(this); } /// insert element after current one - 
void insert(RowRefType&& row_ref, Arena& pool) { - if (!next) { - next = pool.alloc>(); - *next = Batch(nullptr); - } - next = next->insert(std::move(row_ref), pool); - } + void insert(RowRefType&& row_ref, Arena& pool) { next.emplace_back(std::move(row_ref)); } + + void clear() { next.clear(); } private: friend class ForwardIterator; - - Batch* next = nullptr; + std::vector next; }; struct RowRefListWithFlag : RowRef { using RowRefType = RowRef; RowRefListWithFlag() = default; - RowRefListWithFlag(size_t row_num_, uint8_t block_offset_) : RowRef(row_num_, block_offset_) {} + RowRefListWithFlag(size_t row_num_) : RowRef(row_num_) {} ForwardIterator const begin() { return ForwardIterator(this); } /// insert element after current one - void insert(RowRef&& row_ref, Arena& pool) { - if (!next) { - next = pool.alloc>(); - *next = Batch(nullptr); - } - next = next->insert(std::move(row_ref), pool); - } + void insert(RowRefType&& row_ref, Arena& pool) { next.emplace_back(row_ref); } + + void clear() { next.clear(); } bool visited = false; private: friend class ForwardIterator; - - Batch* next = nullptr; + std::vector next; }; struct RowRefListWithFlags : RowRefWithFlag { using RowRefType = RowRefWithFlag; RowRefListWithFlags() = default; - RowRefListWithFlags(size_t row_num_, uint8_t block_offset_) - : RowRefWithFlag(row_num_, block_offset_) {} + RowRefListWithFlags(size_t row_num_) : RowRefWithFlag(row_num_) {} ForwardIterator const begin() { return ForwardIterator(this); } /// insert element after current one - void insert(RowRefWithFlag&& row_ref, Arena& pool) { - if (!next) { - next = pool.alloc>(); - *next = Batch(nullptr); - } - next = next->insert(std::move(row_ref), pool); - } + void insert(RowRefType&& row_ref, Arena& pool) { next.emplace_back(row_ref); } + + void clear() { next.clear(); } private: friend class ForwardIterator; - - Batch* next = nullptr; + std::vector next; }; } // namespace doris::vectorized diff --git 
a/be/src/vec/exec/join/process_hash_table_probe.h b/be/src/vec/exec/join/process_hash_table_probe.h index df75952820..295317517d 100644 --- a/be/src/vec/exec/join/process_hash_table_probe.h +++ b/be/src/vec/exec/join/process_hash_table_probe.h @@ -68,23 +68,12 @@ struct ProcessHashTableProbe { // and output block may be different // The output result is determined by the other join conjunct result and same_to_prev struct Status do_other_join_conjuncts(Block* output_block, bool is_mark_join, - int multi_matched_output_row_count, bool is_the_last_sub_block); - - void _process_splited_equal_matched_tuples(int start_row_idx, int row_count, - const UInt8* __restrict other_hit_column, - UInt8* __restrict null_map_data, - UInt8* __restrict filter_map, Block* output_block); - - void _emplace_element(int8_t block_offset, int32_t block_row, int& current_offset); + std::vector& visited, bool has_null_in_build_side); template typename HashTableType::State _init_probe_side(HashTableType& hash_table_ctx, size_t probe_rows, bool with_other_join_conjuncts, - const uint8_t* null_map); - - template - ForwardIterator& _probe_row_match(int& current_offset, int& probe_index, - size_t& probe_size, bool& all_match_one); + const uint8_t* null_map, bool need_judge_null); // Process full outer join/ right join / right semi/anti join to output the join result // in hash table @@ -94,14 +83,14 @@ struct ProcessHashTableProbe { Parent* _parent = nullptr; const int _batch_size; - std::shared_ptr> _build_blocks; + const std::shared_ptr& _build_block; std::unique_ptr _arena; std::vector _probe_keys; std::vector _probe_indexs; - PaddedPODArray _build_block_offsets; - PaddedPODArray _build_block_rows; - std::vector> _build_blocks_locs; + bool _probe_visited = false; + std::vector _build_indexs; + std::vector _build_blocks_locs; // only need set the tuple is null in RIGHT_OUTER_JOIN and FULL_OUTER_JOIN ColumnUInt8::Container* _tuple_is_null_left_flags = nullptr; // only need set the tuple is null in 
LEFT_OUTER_JOIN and FULL_OUTER_JOIN @@ -112,13 +101,6 @@ struct ProcessHashTableProbe { std::unique_ptr _serialize_key_arena; std::vector _probe_side_find_result; - std::vector _visited_map; - std::vector _same_to_prev; - - int _right_col_idx; - int _right_col_len; - int _row_count_from_last_probe; - bool _have_other_join_conjunct; bool _is_right_semi_anti; std::vector* _left_output_slot_flags = nullptr; @@ -130,7 +112,9 @@ struct ProcessHashTableProbe { RuntimeProfile::Counter* _build_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_process_hashtable_timer = nullptr; - static constexpr int PROBE_SIDE_EXPLODE_RATE = 1; + + int _right_col_idx; + int _right_col_len; }; } // namespace vectorized diff --git a/be/src/vec/exec/join/process_hash_table_probe_impl.h b/be/src/vec/exec/join/process_hash_table_probe_impl.h index 704e5dc2b5..38f8b3a558 100644 --- a/be/src/vec/exec/join/process_hash_table_probe_impl.h +++ b/be/src/vec/exec/join/process_hash_table_probe_impl.h @@ -32,7 +32,7 @@ template ProcessHashTableProbe::ProcessHashTableProbe(Parent* parent, int batch_size) : _parent(parent), _batch_size(batch_size), - _build_blocks(parent->build_blocks()), + _build_block(parent->build_block()), _tuple_is_null_left_flags(parent->is_outer_join() ? &(reinterpret_cast( *parent->_tuple_is_null_left_flag_column) @@ -52,7 +52,11 @@ ProcessHashTableProbe::ProcessHashTableProbe(Parent* parent, _search_hashtable_timer(parent->_search_hashtable_timer), _build_side_output_timer(parent->_build_side_output_timer), _probe_side_output_timer(parent->_probe_side_output_timer), - _probe_process_hashtable_timer(parent->_probe_process_hashtable_timer) {} + _probe_process_hashtable_timer(parent->_probe_process_hashtable_timer), + _right_col_idx((_is_right_semi_anti && !_have_other_join_conjunct) + ? 
0 + : _parent->left_table_data_types().size()), + _right_col_len(_parent->right_table_data_types().size()) {} template void ProcessHashTableProbe::build_side_output_column( @@ -68,52 +72,14 @@ void ProcessHashTableProbe::build_side_output_column( constexpr auto probe_all = JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN; - if (!is_semi_anti_join || have_other_join_conjunct) { - if (_build_blocks->size() == 1) { - for (int i = 0; i < _right_col_len; i++) { - auto& column = *(*_build_blocks)[0].get_by_position(i).column; - if (output_slot_flags[i]) { - mcol[i + _right_col_idx]->insert_indices_from(column, _build_block_rows.data(), - _build_block_rows.data() + size); - } else { - mcol[i + _right_col_idx]->insert_many_defaults(size); - } - } - } else { - for (int i = 0; i < _right_col_len; i++) { - if (output_slot_flags[i]) { - for (int j = 0; j < size; j++) { - if constexpr (probe_all) { - if (_build_block_offsets[j] == -1) { - DCHECK(mcol[i + _right_col_idx]->is_nullable()); - assert_cast(mcol[i + _right_col_idx].get()) - ->insert_default(); - } else { - auto& column = *(*_build_blocks)[_build_block_offsets[j]] - .get_by_position(i) - .column; - mcol[i + _right_col_idx]->insert_from(column, _build_block_rows[j]); - } - } else { - if (_build_block_offsets[j] == -1) { - // the only case to reach here: - // 1. left anti join with other conjuncts, and - // 2. 
equal conjuncts does not match - // since nullptr is emplaced back to visited_map, - // the output value of the build side does not matter, - // just insert default value - mcol[i + _right_col_idx]->insert_default(); - } else { - auto& column = *(*_build_blocks)[_build_block_offsets[j]] - .get_by_position(i) - .column; - mcol[i + _right_col_idx]->insert_from(column, _build_block_rows[j]); - } - } - } - } else { - mcol[i + _right_col_idx]->insert_many_defaults(size); - } + if ((!is_semi_anti_join || have_other_join_conjunct) && size) { + for (int i = 0; i < _right_col_len; i++) { + const auto& column = *_build_block->safe_get_by_position(i).column; + if (output_slot_flags[i]) { + mcol[i + _right_col_idx]->insert_indices_from_join(column, _build_indexs.data(), + _build_indexs.data() + size); + } else { + mcol[i + _right_col_idx]->insert_many_defaults(size); } } } @@ -123,7 +89,7 @@ void ProcessHashTableProbe::build_side_output_column( _tuple_is_null_right_flags->resize(size); auto* __restrict null_data = _tuple_is_null_right_flags->data(); for (int i = 0; i < size; ++i) { - null_data[i] = _build_block_rows[i] == -1; + null_data[i] = _build_indexs[i] == 0; } } } @@ -159,73 +125,22 @@ template template typename HashTableType::State ProcessHashTableProbe::_init_probe_side( HashTableType& hash_table_ctx, size_t probe_rows, bool with_other_join_conjuncts, - const uint8_t* null_map) { - _right_col_idx = _is_right_semi_anti && !with_other_join_conjuncts - ? 
0 - : _parent->left_table_data_types().size(); - _right_col_len = _parent->right_table_data_types().size(); - _row_count_from_last_probe = 0; - - _build_block_rows.clear(); - _build_block_offsets.clear(); - _probe_indexs.clear(); - if (with_other_join_conjuncts) { - // use in right join to change visited state after exec the vother join conjunct - _visited_map.clear(); - _same_to_prev.clear(); - _visited_map.reserve(_batch_size * PROBE_SIDE_EXPLODE_RATE); - _same_to_prev.reserve(_batch_size * PROBE_SIDE_EXPLODE_RATE); - } - _probe_indexs.reserve(_batch_size * PROBE_SIDE_EXPLODE_RATE); - _build_block_rows.reserve(_batch_size * PROBE_SIDE_EXPLODE_RATE); - _build_block_offsets.reserve(_batch_size * PROBE_SIDE_EXPLODE_RATE); + const uint8_t* null_map, bool need_judge_null) { + // may over batch size 1 for some outer join case + _probe_indexs.resize(_batch_size + 1); + _build_indexs.resize(_batch_size + 1); if (!_parent->_ready_probe) { _parent->_ready_probe = true; hash_table_ctx.reset(); - hash_table_ctx.init_serialized_keys(_parent->_probe_columns, probe_rows, null_map); + hash_table_ctx.init_serialized_keys(_parent->_probe_columns, probe_rows, null_map, true, + false, hash_table_ctx.hash_table->get_bucket_size()); + hash_table_ctx.hash_table->pre_build_idxs(hash_table_ctx.bucket_nums, + need_judge_null ? 
null_map : nullptr); } return typename HashTableType::State(_parent->_probe_columns); } -template -template -ForwardIterator& ProcessHashTableProbe::_probe_row_match( - int& current_offset, int& probe_index, size_t& probe_size, bool& all_match_one) { - auto& probe_row_match_iter = std::get>(_parent->_probe_row_match_iter); - if (!probe_row_match_iter.ok()) { - return probe_row_match_iter; - } - - SCOPED_TIMER(_search_hashtable_timer); - for (; probe_row_match_iter.ok() && current_offset < _batch_size; ++probe_row_match_iter) { - _emplace_element(probe_row_match_iter->block_offset, probe_row_match_iter->row_num, - current_offset); - _probe_indexs.emplace_back(probe_index); - if constexpr (with_other_join_conjuncts) { - _visited_map.emplace_back(&probe_row_match_iter->visited); - } - } - - _row_count_from_last_probe = current_offset; - all_match_one &= (current_offset == 1); - if (!probe_row_match_iter.ok()) { - ++probe_index; - } - probe_size = 1; - - return probe_row_match_iter; -} - -template -void ProcessHashTableProbe::_emplace_element(int8_t block_offset, - int32_t block_row, - int& current_offset) { - _build_block_offsets.emplace_back(block_offset); - _build_block_rows.emplace_back(block_row); - current_offset++; -} - template template @@ -234,48 +149,28 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash MutableBlock& mutable_block, Block* output_block, size_t probe_rows) { + if (_right_col_len && !_build_block) { + return Status::InternalError("build block is nullptr"); + } + auto& probe_index = _parent->_probe_index; + auto& build_index = _parent->_build_index; + auto last_probe_index = probe_index; - using KeyGetter = typename HashTableType::State; - using Mapped = typename HashTableType::Mapped; - - KeyGetter key_getter = - _init_probe_side(hash_table_ctx, probe_rows, with_other_conjuncts, - need_null_map_for_probe ? null_map->data() : nullptr); + _init_probe_side( + hash_table_ctx, probe_rows, with_other_conjuncts, + need_null_map_for_probe ? 
null_map->data() : nullptr, + need_null_map_for_probe && ignore_null && + (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN || + JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || is_mark_join)); auto& mcol = mutable_block.mutable_columns(); - constexpr auto is_right_semi_anti_join = - JoinOpType == TJoinOp::RIGHT_ANTI_JOIN || JoinOpType == TJoinOp::RIGHT_SEMI_JOIN; - - constexpr auto probe_all = - JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN; - - int last_probe_index = probe_index; - int current_offset = 0; - bool all_match_one = true; + bool all_match_one = false; size_t probe_size = 0; - auto& probe_row_match_iter = _probe_row_match( - current_offset, probe_index, probe_size, all_match_one); - - // If not(which means it excceed batch size), probe_index is not increased and - // remaining matched rows for the current probe row will be - // handled in the next call of this function - int multi_matched_output_row_count = 0; - - // Is the last sub block of splitted block - bool is_the_last_sub_block = false; - - if (with_other_conjuncts && probe_size != 0) { - is_the_last_sub_block = !probe_row_match_iter.ok(); - _same_to_prev.emplace_back(false); - for (int i = 0; i < current_offset - 1; ++i) { - _same_to_prev.emplace_back(true); - } - } - std::unique_ptr mark_column; if (is_mark_join) { mark_column = std::make_unique(*mcol[mcol.size() - 1]); @@ -283,121 +178,17 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash { SCOPED_TIMER(_search_hashtable_timer); - using FindResult = KeyGetter::FindResult; - FindResult empty = {nullptr, false}; - while (current_offset < _batch_size && probe_index < probe_rows) { - if constexpr (ignore_null && need_null_map_for_probe) { - if ((*null_map)[probe_index]) { - if constexpr (probe_all) { - // only full outer / left outer need insert the data of right table - _emplace_element(-1, -1, current_offset); - 
_probe_indexs.emplace_back(probe_index); - - if constexpr (with_other_conjuncts) { - _same_to_prev.emplace_back(false); - _visited_map.emplace_back(nullptr); - } - } else { - all_match_one = false; - } - probe_index++; - continue; - } - } - - const auto& find_result = need_null_map_for_probe && (*null_map)[probe_index] - ? empty - : hash_table_ctx.find(key_getter, probe_index); - - auto current_probe_index = probe_index; - if constexpr (!with_other_conjuncts && - (JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::LEFT_SEMI_JOIN)) { - bool need_go_ahead = - (JoinOpType != TJoinOp::LEFT_SEMI_JOIN) ^ find_result.is_found(); - if constexpr (is_mark_join) { - ++current_offset; - bool null_result = (need_null_map_for_probe && (*null_map)[probe_index]) || - (!need_go_ahead && *_has_null_in_build_side); - if (null_result) { - mark_column->insert_null(); - } else { - mark_column->insert_value(need_go_ahead); - } - } else { - current_offset += need_go_ahead; - } - ++probe_index; - } else { - if (find_result.is_found()) { - auto& mapped = find_result.get_mapped(); - auto origin_offset = current_offset; - - // For mark join, if euqual-matched tuple count for one probe row - // excceeds batch size, it's difficult to implement the logic to - // split them into multiple sub blocks and handle them, keep the original - // logic for now. 
- if constexpr (is_mark_join && with_other_conjuncts) { - for (auto it = mapped.begin(); it.ok(); ++it) { - _emplace_element(it->block_offset, it->row_num, current_offset); - _visited_map.emplace_back(&it->visited); - } - ++probe_index; - } else if constexpr (with_other_conjuncts || !is_right_semi_anti_join) { - auto multi_match_last_offset = current_offset; - auto it = mapped.begin(); - for (; it.ok() && current_offset < _batch_size; ++it) { - _emplace_element(it->block_offset, it->row_num, current_offset); - - if constexpr (with_other_conjuncts) { - _visited_map.emplace_back(&it->visited); - } - } - probe_row_match_iter = it; - if (!it.ok()) { - // If all matched rows for the current probe row are handled, - // advance to next probe row. - // If not(which means it excceed batch size), probe_index is not increased and - // remaining matched rows for the current probe row will be - // handled in the next call of this function - ++probe_index; - } else if constexpr (with_other_conjuncts) { - multi_matched_output_row_count = - current_offset - multi_match_last_offset; - } - } else { - ++probe_index; - } - if constexpr (std::is_same_v) { - mapped.visited = true; - } - - if constexpr (with_other_conjuncts) { - _same_to_prev.emplace_back(false); - for (int i = 0; i < current_offset - origin_offset - 1; ++i) { - _same_to_prev.emplace_back(true); - } - } - } else if constexpr (probe_all || JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - (JoinOpType == TJoinOp::LEFT_SEMI_JOIN && is_mark_join)) { - // only full outer / left outer need insert the data of right table - _emplace_element(-1, -1, current_offset); - - if constexpr (with_other_conjuncts) { - _same_to_prev.emplace_back(false); - _visited_map.emplace_back(nullptr); - } - ++probe_index; - } else { - ++probe_index; - } - } - all_match_one &= (current_offset == _probe_indexs.size() + 1); - _probe_indexs.resize(current_offset, current_probe_index); - } - probe_size = 
probe_index - last_probe_index + probe_row_match_iter.ok(); + auto [new_probe_idx, new_build_idx, + new_current_offset] = hash_table_ctx.hash_table->template find_batch < JoinOpType, + with_other_conjuncts, is_mark_join, + need_null_map_for_probe && + ignore_null > (hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), + probe_index, build_index, probe_rows, _probe_indexs.data(), + _probe_visited, _build_indexs.data(), mark_column.get()); + probe_index = new_probe_idx; + build_index = new_build_idx; + current_offset = new_current_offset; + probe_size = probe_index - last_probe_index; } build_side_output_column(mcol, *_right_output_slot_flags, current_offset, with_other_conjuncts); @@ -412,8 +203,9 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash output_block->swap(mutable_block.to_block()); if constexpr (with_other_conjuncts) { - return do_other_join_conjuncts(output_block, is_mark_join, multi_matched_output_row_count, - is_the_last_sub_block); + return do_other_join_conjuncts(output_block, is_mark_join, + hash_table_ctx.hash_table->get_visited(), + hash_table_ctx.hash_table->has_null_key()); } return Status::OK(); @@ -421,8 +213,8 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash template Status ProcessHashTableProbe::do_other_join_conjuncts( - Block* output_block, bool is_mark_join, int multi_matched_output_row_count, - bool is_the_last_sub_block) { + Block* output_block, bool is_mark_join, std::vector& visited, + bool has_null_in_build_side) { // dispose the other join conjunct exec auto row_count = output_block->rows(); if (!row_count) { @@ -451,313 +243,90 @@ Status ProcessHashTableProbe::do_other_join_conjuncts( if constexpr (JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN) { - auto new_filter_column = ColumnVector::create(row_count); + auto new_filter_column = ColumnUInt8::create(row_count); auto* __restrict filter_map = new_filter_column->get_data().data(); - auto null_map_column = 
ColumnVector::create(row_count, 0); - auto* __restrict null_map_data = null_map_column->get_data().data(); - - // It contains non-first sub block of splited equal-conjuncts-matched tuples from last probe row - if (_row_count_from_last_probe > 0) { - _process_splited_equal_matched_tuples(0, _row_count_from_last_probe, filter_column_ptr, - null_map_data, filter_map, output_block); - // This is the last sub block of splitted block, and no equal-conjuncts-matched tuple - // is output in all sub blocks, need to output a tuple for this probe row - if (is_the_last_sub_block && !_parent->_is_any_probe_match_row_output) { - filter_map[0] = true; - null_map_data[0] = true; - } - } - int end_idx = row_count - multi_matched_output_row_count; // process equal-conjuncts-matched tuples that are newly generated // in this run if there are any. - bool has_no_match = true; /// If there was no any match in right table. - for (int i = _row_count_from_last_probe; i < end_idx; ++i) { - auto join_hit = _visited_map[i] != nullptr; - auto other_hit = filter_column_ptr[i]; - if (!_same_to_prev[i]) { - has_no_match = true; - } + for (int i = 0; i < row_count; ++i) { + bool join_hit = _build_indexs[i]; + bool other_hit = filter_column_ptr[i]; - if (!other_hit) { - for (size_t j = 0; j < _right_col_len; ++j) { - typeid_cast( - std::move(*output_block->get_by_position(j + _right_col_idx).column) - .assume_mutable() - .get()) - ->get_null_map_data()[i] = true; - } - } - null_map_data[i] = !join_hit || !other_hit; - - // For cases where one probe row matches multiple build rows for equal conjuncts, - // all the other-conjuncts-matched tuples should be output. - // - // Other-conjuncts-NOT-matched tuples fall into two categories: - // 1. The beginning consecutive one(s). - // For these tuples, only the last one is marked to output; - // If there are any following other-conjuncts-matched tuples, - // the last tuple is also marked NOT to output. - // 2. 
All the remaining other-conjuncts-NOT-matched tuples. - // All these tuples are marked not to output. - if (join_hit) { - *_visited_map[i] |= other_hit; - - /// Assuming that a row in the left table matches N rows in the right table after scanning the hash table, - /// which means `_same_to_prev[1]` ... `_same_to_prev[N - 1]` are all true. - /// If `other_hit` is true, it is outputted as a match. - /// However, if `other_hit` is false the current row needs to be outputted (`filter_map[i] = true`) - /// and the output of the previous row is cancelled(`filter_map[i - 1] = false`). - /// If a row in the left table matches at least one row in the right table (after filtering through other conjunctions, `has_no_match` is true), - /// the rows that do not satisfy the other conjunctions do not need to be output. - /** - * Assuming match 4 rows in right table: - * ________________________________ - * | row index | other conjuncts | - * +------------+-----------------| - * | 0 | 0 | - * +------------+-----------------| - * | 1 | 0 | - * +------------+-----------------| - * | 2 | 1 | - * +------------+-----------------| - * | 3 | 0 | - * -------------------------------- - * - * Scan row 0: `other_hit` is false, `!_same_to_prev[i]` is true - * set `filter_map[0]` = true - * Scan row 1: `other_hit` is false, !_same_to_prev[i] is false, has_no_match is true, filter_map[i - 1] is true - * set `filter_map[1] = true`, `filter_map[0] = false` - * Scan row 2: `other_hit` is true, !_same_to_prev[i] is false, has_no_match is true, filter_map[i - 1] is true - * set filter_map[2] = true, has_no_match = true - * `_same_to_prev[2] && filter_map[2] && !filter_column_ptr[2 - 1]` is true, - * so set `filter_map[2 - 1] = false` - * Scan row 3: `other_hit` is false, `!_same_to_prev[i]` is false, has_no_match is true - * set `filter_map[3]` = false - * - * After scanned the 4 rows, - * filter_map[0]: false - * filter_map[1]: false - * filter_map[2]: true - * filter_map[3]: false - */ - 
filter_map[i] = - other_hit || !_same_to_prev[i] || (has_no_match && filter_map[i - 1]); - has_no_match &= !other_hit; - // Here to keep only hit join conjunct and other join conjunt is true need to be output. - // if not, only some key must keep one row will output will null right table column - if (_same_to_prev[i] && filter_map[i] && !filter_column_ptr[i - 1]) { - filter_map[i - 1] = false; - } + if (!join_hit) { + filter_map[i] = _parent->_last_probe_match != _probe_indexs[i]; } else { - filter_map[i] = true; + filter_map[i] = other_hit; + } + if (filter_map[i]) { + _parent->_last_probe_match = _probe_indexs[i]; } - } - - // It contains the first sub block of splited equal-conjuncts-matched tuples of the current probe row - if (multi_matched_output_row_count > 0) { - _parent->_is_any_probe_match_row_output = false; - _process_splited_equal_matched_tuples(row_count - multi_matched_output_row_count, - multi_matched_output_row_count, filter_column_ptr, - null_map_data, filter_map, output_block); } for (size_t i = 0; i < row_count; ++i) { if (filter_map[i]) { - _tuple_is_null_right_flags->emplace_back(null_map_data[i]); + _tuple_is_null_right_flags->emplace_back(!_build_indexs[i] || + !filter_column_ptr[i]); + if constexpr (JoinOpType == TJoinOp::FULL_OUTER_JOIN) { + visited[_build_indexs[i]] = 1; + } } } output_block->get_by_position(result_column_id).column = std::move(new_filter_column); - } else if constexpr (JoinOpType == TJoinOp::LEFT_SEMI_JOIN) { - // TODO: resize in advance - auto new_filter_column = ColumnVector::create(); - auto& filter_map = new_filter_column->get_data(); + } else if constexpr (JoinOpType == TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::LEFT_SEMI_JOIN) { + auto new_filter_column = ColumnUInt8::create(row_count); + auto* __restrict filter_map = new_filter_column->get_data().data(); - size_t start_row_idx = 1; - // We are handling euqual-conjuncts matched tuples that are splitted 
into multiple blocks - if (_row_count_from_last_probe > 0) { - if (_parent->_is_any_probe_match_row_output) { - // if any matched tuple for this probe row is output, - // ignore all the following tuples for this probe row. - for (int row_idx = 0; row_idx < _row_count_from_last_probe; ++row_idx) { - filter_map.emplace_back(false); - } - start_row_idx += _row_count_from_last_probe; - if (_row_count_from_last_probe < row_count) { - filter_map.emplace_back(filter_column_ptr[_row_count_from_last_probe]); - } - } else { - filter_map.emplace_back(filter_column_ptr[0]); - } - } else { - filter_map.emplace_back(filter_column_ptr[0]); - } - for (size_t i = start_row_idx; i < row_count; ++i) { - if (filter_column_ptr[i] || (_same_to_prev[i] && filter_map[i - 1])) { - // Only last same element is true, output last one - filter_map.push_back(true); - filter_map[i - 1] = !_same_to_prev[i] && filter_map[i - 1]; - } else { - filter_map.push_back(false); - } - } - // It contains the first sub block of splited equal-conjuncts-matched tuples of the current probe row - if (multi_matched_output_row_count > 0) { - // If a matched row is output, all the equal-matched tuples in - // the following sub blocks should be ignored - _parent->_is_any_probe_match_row_output = filter_map[row_count - 1]; - } else if (_row_count_from_last_probe > 0 && !_parent->_is_any_probe_match_row_output) { - // We are handling euqual-conjuncts matched tuples that are splitted into multiple blocks, - // and no matched tuple has been output in all previous run. - // If a tuple is output in this run, all the following mathced tuples should be ignored - if (filter_map[_row_count_from_last_probe - 1]) { - _parent->_is_any_probe_match_row_output = true; - } - } - - /// FIXME: incorrect result of semi mark join with other conjuncts(null value missed). 
if (is_mark_join) { auto mark_column = output_block->get_by_position(orig_columns - 1).column->assume_mutable(); ColumnFilterHelper helper(*mark_column); - // For mark join, we only filter rows which have duplicate join keys. - // And then, we set matched_map to the join result to do the mark join's filtering. - for (size_t i = 1; i < row_count; ++i) { - if (!_same_to_prev[i]) { - helper.insert_value(filter_map[i - 1]); - filter_map[i - 1] = true; - } - } - helper.insert_value(filter_map[filter_map.size() - 1]); - filter_map[filter_map.size() - 1] = true; - } - - output_block->get_by_position(result_column_id).column = std::move(new_filter_column); - } else if constexpr (JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { - auto new_filter_column = ColumnVector::create(row_count); - auto* __restrict filter_map = new_filter_column->get_data().data(); - - // for left anti join, the probe side is output only when - // there are no matched tuples for the probe row. - - // If multiple equal-conjuncts-matched tuples is splitted into several - // sub blocks, just filter out all the other-conjuncts-NOT-matched tuples at first, - // and when processing the last sub block, check whether there are any - // equal-conjuncts-matched tuple is output in all sub blocks, - // if there are none, just pick a tuple and output. - - size_t start_row_idx = 1; - // We are handling euqual-conjuncts matched tuples that are splitted into multiple blocks - if (_row_count_from_last_probe > 0 && _parent->_is_any_probe_match_row_output) { - // if any matched tuple for this probe row is output, - // ignore all the following tuples for this probe row. 
- for (int row_idx = 0; row_idx < _row_count_from_last_probe; ++row_idx) { - filter_map[row_idx] = false; - } - start_row_idx += _row_count_from_last_probe; - if (_row_count_from_last_probe < row_count) { - filter_map[_row_count_from_last_probe] = - filter_column_ptr[_row_count_from_last_probe] && - _visited_map[_row_count_from_last_probe]; - } - } else { - // Both equal conjuncts and other conjuncts are true - filter_map[0] = filter_column_ptr[0] && _visited_map[0]; - } - - for (size_t i = start_row_idx; i < row_count; ++i) { - if ((_visited_map[i] && filter_column_ptr[i]) || - (_same_to_prev[i] && filter_map[i - 1])) { - // When either of two conditions is meet: - // 1. Both equal conjuncts and other conjuncts are true or same_to_prev - // 2. This row is joined from the same build side row as the previous row - // Set filter_map[i] to true and filter_map[i - 1] to false if same_to_prev[i] - // is true. + for (size_t i = 0; i < row_count; ++i) { filter_map[i] = true; - filter_map[i - 1] = !_same_to_prev[i] && filter_map[i - 1]; - } else { - filter_map[i] = false; - } - } - - if (is_mark_join) { - auto& matched_map = assert_cast&>( - *(output_block->get_by_position(orig_columns - 1) - .column->assume_mutable())) - .get_data(); - for (int i = 1; i < row_count; ++i) { - if (!_same_to_prev[i]) { - matched_map.push_back(!filter_map[i - 1]); - filter_map[i - 1] = true; + if (has_null_in_build_side && + (_build_indexs[i] != 0) ^ (JoinOpType == TJoinOp::LEFT_SEMI_JOIN)) { + helper.insert_null(); + } else { + helper.insert_value(filter_column_ptr[i]); } } - matched_map.push_back(!filter_map[row_count - 1]); - filter_map[row_count - 1] = true; } else { - int end_row_idx = 0; - if (_row_count_from_last_probe > 0) { - end_row_idx = row_count - multi_matched_output_row_count; - if (!_parent->_is_any_probe_match_row_output) { - // We are handling euqual-conjuncts matched tuples that are splitted into multiple blocks, - // and no matched tuple has been output in all previous 
run. - // If a tuple is output in this run, all the following mathced tuples should be ignored - if (filter_map[_row_count_from_last_probe - 1]) { - _parent->_is_any_probe_match_row_output = true; - filter_map[_row_count_from_last_probe - 1] = false; - } - if (is_the_last_sub_block && !_parent->_is_any_probe_match_row_output) { - // This is the last sub block of splitted block, and no equal-conjuncts-matched tuple - // is output in all sub blocks, output a tuple for this probe row - filter_map[0] = true; + if constexpr (JoinOpType == TJoinOp::LEFT_SEMI_JOIN) { + for (size_t i = 0; i < row_count; ++i) { + if (filter_column_ptr[i]) { + filter_map[i] = _parent->_last_probe_match != _probe_indexs[i]; + _parent->_last_probe_match = _probe_indexs[i]; + } else { + filter_map[i] = false; } } - if (multi_matched_output_row_count > 0) { - // It contains the first sub block of splited equal-conjuncts-matched tuples of the current probe row - // If a matched row is output, all the equal-matched tuples in - // the following sub blocks should be ignored - _parent->_is_any_probe_match_row_output = filter_map[row_count - 1]; - filter_map[row_count - 1] = false; - } - } else if (multi_matched_output_row_count > 0) { - end_row_idx = row_count - multi_matched_output_row_count; - // It contains the first sub block of splited equal-conjuncts-matched tuples of the current probe row - // If a matched row is output, all the equal-matched tuples in - // the following sub blocks should be ignored - _parent->_is_any_probe_match_row_output = filter_map[row_count - 1]; - filter_map[row_count - 1] = false; } else { - end_row_idx = row_count; - } - - // Same to the semi join, but change the last value to opposite value - for (int i = 1 + _row_count_from_last_probe; i < end_row_idx; ++i) { - if (!_same_to_prev[i]) { - filter_map[i - 1] = !filter_map[i - 1]; + for (size_t i = 0; i < row_count; ++i) { + if (_build_indexs[i]) { + filter_map[i] = false; + if (filter_column_ptr[i]) { + 
_parent->_last_probe_match = _probe_indexs[i]; + } + } else { + filter_map[i] = _parent->_last_probe_match != _probe_indexs[i]; + } } } - auto non_sub_blocks_matched_row_count = - row_count - _row_count_from_last_probe - multi_matched_output_row_count; - if (non_sub_blocks_matched_row_count > 0) { - filter_map[end_row_idx - 1] = !filter_map[end_row_idx - 1]; - } } output_block->get_by_position(result_column_id).column = std::move(new_filter_column); } else if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || JoinOpType == TJoinOp::RIGHT_ANTI_JOIN) { for (int i = 0; i < row_count; ++i) { - DCHECK(_visited_map[i]); - *_visited_map[i] |= filter_column_ptr[i]; + visited[_build_indexs[i]] |= filter_column_ptr[i]; } } else if constexpr (JoinOpType == TJoinOp::RIGHT_OUTER_JOIN) { auto filter_size = 0; for (int i = 0; i < row_count; ++i) { - DCHECK(_visited_map[i]); - auto result = filter_column_ptr[i]; - *_visited_map[i] |= result; - filter_size += result; + visited[_build_indexs[i]] |= filter_column_ptr[i]; + filter_size += filter_column_ptr[i]; } _tuple_is_null_left_flags->resize_fill(filter_size, 0); } @@ -771,205 +340,54 @@ Status ProcessHashTableProbe::do_other_join_conjuncts( JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { orig_columns = _right_col_idx; } - static_cast( - Block::filter_block(output_block, result_column_id, - is_mark_join ? output_block->columns() : orig_columns)); + RETURN_IF_ERROR(Block::filter_block(output_block, result_column_id, + is_mark_join ? output_block->columns() : orig_columns)); } return Status::OK(); } -// For left or full outer join with other conjuncts. -// If multiple equal-conjuncts-matched tuples is splitted into several -// sub blocks, just filter out all the other-conjuncts-NOT-matched tuples at first, -// and when processing the last sub block, check whether there are any -// equal-conjuncts-matched tuple is output in all sub blocks, -// if not, just pick a tuple and output. 
-template -void ProcessHashTableProbe::_process_splited_equal_matched_tuples( - int start_row_idx, int row_count, const UInt8* __restrict other_hit_column, - UInt8* __restrict null_map_data, UInt8* __restrict filter_map, Block* output_block) { - int end_row_idx = start_row_idx + row_count; - for (int i = start_row_idx; i < end_row_idx; ++i) { - auto join_hit = _visited_map[i] != nullptr; - auto other_hit = other_hit_column[i]; - - if (!other_hit) { - for (size_t j = 0; j < _right_col_len; ++j) { - typeid_cast( - std::move(*output_block->get_by_position(j + _right_col_idx).column) - .assume_mutable() - .get()) - ->get_null_map_data()[i] = true; - } - } - - null_map_data[i] = !join_hit || !other_hit; - filter_map[i] = other_hit; - - if (join_hit) { - *_visited_map[i] |= other_hit; - } - } - _parent->_is_any_probe_match_row_output |= - simd::contain_byte(filter_map + start_row_idx, row_count, 1); -} - template template Status ProcessHashTableProbe::process_data_in_hashtable( HashTableType& hash_table_ctx, MutableBlock& mutable_block, Block* output_block, bool* eos) { - using Mapped = typename HashTableType::Mapped; SCOPED_TIMER(_probe_process_hashtable_timer); - if constexpr (std::is_same_v || - std::is_same_v) { - hash_table_ctx.init_iterator(); - auto& mcol = mutable_block.mutable_columns(); + auto& mcol = mutable_block.mutable_columns(); + *eos = hash_table_ctx.hash_table->template iterate_map(_build_indexs); + auto block_size = _build_indexs.size(); - bool right_semi_anti_without_other = _is_right_semi_anti && !_have_other_join_conjunct; - int right_col_idx = - right_semi_anti_without_other ? 
0 : _parent->left_table_data_types().size(); - int right_col_len = _parent->right_table_data_types().size(); - - auto& iter = hash_table_ctx.iterator; - auto block_size = 0; - auto& visited_iter = - std::get>(_parent->_outer_join_pull_visited_iter); - _build_blocks_locs.resize(_batch_size); - auto register_build_loc = [&](int8_t offset, int32_t row_nums) { - _build_blocks_locs[block_size++] = std::pair(offset, row_nums); - }; - - if (visited_iter.ok()) { - if constexpr (std::is_same_v) { - for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - } else { - for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) { - if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { - if (visited_iter->visited) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - } else { - if (!visited_iter->visited) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - } - } - } - if (!visited_iter.ok()) { - ++iter; - } + if (block_size) { + if (mcol.size() < _right_col_len + _right_col_idx) { + return Status::InternalError( + "output block invalid, mcol.size()={}, _right_col_len={}, _right_col_idx={}", + mcol.size(), _right_col_len, _right_col_idx); } - - for (; iter != hash_table_ctx.hash_table->end() && block_size < _batch_size; ++iter) { - auto& mapped = iter->get_second(); - if constexpr (std::is_same_v) { - if (mapped.visited) { - if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { - visited_iter = mapped.begin(); - for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - if (visited_iter.ok()) { - // block_size >= _batch_size, quit for loop - break; - } - } - } else { - if constexpr (JoinOpType != TJoinOp::RIGHT_SEMI_JOIN) { - visited_iter = mapped.begin(); - for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) { 
- register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - if (visited_iter.ok()) { - // block_size >= _batch_size, quit for loop - break; - } - } - } - } else { - visited_iter = mapped.begin(); - for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) { - if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { - if (visited_iter->visited) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - } else { - if (!visited_iter->visited) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - } - } - if (visited_iter.ok()) { - // block_size >= _batch_size, quit for loop - break; - } - } - } - _build_blocks_locs.resize(block_size); - - auto insert_build_rows = [&](int8_t offset) { - for (size_t j = 0; j < right_col_len; ++j) { - auto& column = *(*_build_blocks)[offset].get_by_position(j).column; - mcol[j + right_col_idx]->insert_indices_from( - column, _build_block_rows.data(), - _build_block_rows.data() + _build_block_rows.size()); - } - }; - if (_build_blocks->size() > 1) { - std::sort(_build_blocks_locs.begin(), _build_blocks_locs.end(), - [](const auto a, const auto b) { return a.first > b.first; }); - auto start = 0, end = 0; - while (start < _build_blocks_locs.size()) { - while (end < _build_blocks_locs.size() && - _build_blocks_locs[start].first == _build_blocks_locs[end].first) { - end++; - } - auto offset = _build_blocks_locs[start].first; - _build_block_rows.resize(end - start); - for (int i = 0; start + i < end; i++) { - _build_block_rows[i] = _build_blocks_locs[start + i].second; - } - start = end; - insert_build_rows(offset); - } - } else if (_build_blocks->size() == 1) { - const auto size = _build_blocks_locs.size(); - _build_block_rows.resize(_build_blocks_locs.size()); - for (int i = 0; i < size; i++) { - _build_block_rows[i] = _build_blocks_locs[i].second; - } - insert_build_rows(0); + for (size_t j = 0; j < _right_col_len; ++j) { + const auto& column = 
*_build_block->safe_get_by_position(j).column; + mcol[j + _right_col_idx]->insert_indices_from_join(column, _build_indexs.data(), + _build_indexs.data() + block_size); } // just resize the left table column in case with other conjunct to make block size is not zero if (_is_right_semi_anti && _have_other_join_conjunct) { - auto target_size = mcol[right_col_idx]->size(); - for (int i = 0; i < right_col_idx; ++i) { - mcol[i]->resize(target_size); + for (int i = 0; i < _right_col_idx; ++i) { + mcol[i]->resize(block_size); } } // right outer join / full join need insert data of left table if constexpr (JoinOpType == TJoinOp::RIGHT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN) { - for (int i = 0; i < right_col_idx; ++i) { + for (int i = 0; i < _right_col_idx; ++i) { assert_cast(mcol[i].get())->insert_many_defaults(block_size); } _tuple_is_null_left_flags->resize_fill(block_size, 1); } - *eos = iter == hash_table_ctx.hash_table->end(); - output_block->swap( - mutable_block.to_block(right_semi_anti_without_other ? 
right_col_idx : 0)); + output_block->swap(mutable_block.to_block(0)); DCHECK(block_size <= _batch_size); - return Status::OK(); - } else { - LOG(FATAL) << "Invalid RowRefList"; - return Status::InvalidArgument("Invalid RowRefList"); } + return Status::OK(); } template diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp index f422f7919b..aaa591c110 100644 --- a/be/src/vec/exec/join/vhash_join_node.cpp +++ b/be/src/vec/exec/join/vhash_join_node.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,7 @@ #include "vec/common/assert_cast.h" #include "vec/common/hash_table/hash_map.h" #include "vec/common/uint128.h" +#include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_nullable.h" @@ -71,6 +73,8 @@ namespace doris::vectorized { +constexpr uint32_t JOIN_BUILD_SIZE_LIMIT = std::numeric_limits::max(); + template Status HashJoinNode::_extract_join_column( Block&, COW::mutable_ptr>&, std::vector>&, @@ -88,19 +92,12 @@ HashJoinNode::HashJoinNode(ObjectPool* pool, const TPlanNode& tnode, const Descr _hash_output_slot_ids(tnode.hash_join_node.__isset.hash_output_slot_ids ? tnode.hash_join_node.hash_output_slot_ids : std::vector {}), - _build_block_idx(0), _build_side_mem_used(0), _build_side_last_mem_used(0) { _runtime_filter_descs = tnode.runtime_filters; _arena = std::make_shared(); _hash_table_variants = std::make_shared(); _process_hashtable_ctx_variants = std::make_unique(); - _build_blocks.reset(new std::vector()); - - // avoid vector expand change block address. - // one block can store 4g data, _build_blocks can store 128*4g data. - // if probe data bigger than 512g, runtime filter maybe will core dump when insert data. 
- _build_blocks->reserve(HASH_JOIN_MAX_BUILD_BLOCK_COUNT); } Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { @@ -228,10 +225,7 @@ Status HashJoinNode::prepare(RuntimeState* state) { ADD_CHILD_TIMER(record_profile, "BuildSideMergeBlockTime", "BuildTime"); _build_table_insert_timer = ADD_TIMER(record_profile, "BuildTableInsertTime"); _build_expr_call_timer = ADD_TIMER(record_profile, "BuildExprCallTime"); - _build_table_expanse_timer = ADD_TIMER(record_profile, "BuildTableExpanseTime"); - _build_table_convert_timer = ADD_TIMER(record_profile, "BuildTableConvertToPartitionedTime"); _build_side_compute_hash_timer = ADD_TIMER(record_profile, "BuildSideHashComputingTime"); - _build_runtime_filter_timer = ADD_TIMER(record_profile, "BuildRuntimeFilterTime"); // Probe phase auto probe_phase_profile = _probe_phase_profile; @@ -249,11 +243,6 @@ Status HashJoinNode::prepare(RuntimeState* state) { _allocate_resource_timer = ADD_TIMER(runtime_profile(), "AllocateResourceTime"); _process_other_join_conjunct_timer = ADD_TIMER(runtime_profile(), "OtherJoinConjunctTime"); - _build_buckets_counter = ADD_COUNTER(runtime_profile(), "BuildBuckets", TUnit::UNIT); - _build_buckets_fill_counter = ADD_COUNTER(runtime_profile(), "FilledBuckets", TUnit::UNIT); - - _build_collisions_counter = ADD_COUNTER(runtime_profile(), "BuildCollisions", TUnit::UNIT); - RETURN_IF_ERROR(VExpr::prepare(_build_expr_ctxs, state, child(1)->row_desc())); RETURN_IF_ERROR(VExpr::prepare(_probe_expr_ctxs, state, child(0)->row_desc())); @@ -308,7 +297,9 @@ bool HashJoinNode::need_more_input_data() const { void HashJoinNode::prepare_for_next() { _probe_index = 0; + _build_index = 0; _ready_probe = false; + _last_probe_match = -1; _prepare_probe_block(); } @@ -462,7 +453,7 @@ Status HashJoinNode::pull(doris::RuntimeState* state, vectorized::Block* output_ if (!st) { return st; } - RETURN_IF_ERROR(_filter_data_and_build_output(state, output_block, eos, &temp_block)); + 
RETURN_IF_ERROR(_filter_data_and_build_output(state, output_block, eos, &temp_block, false)); // Here make _join_block release the columns' ptr _join_block.set_columns(_join_block.clone_empty_columns()); mutable_join_block.clear(); @@ -725,52 +716,36 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc SCOPED_TIMER(_exec_timer); SCOPED_TIMER(_build_timer); - // make one block for each 4 gigabytes - constexpr static auto BUILD_BLOCK_MAX_SIZE = 4 * 1024UL * 1024UL * 1024UL; - if (_should_build_hash_table) { // If eos or have already met a null value using short-circuit strategy, we do not need to pull // data from probe side. _build_side_mem_used += in_block->allocated_bytes(); + if (_build_side_mutable_block.empty()) { + auto tmp_build_block = + VectorizedUtils::create_empty_columnswithtypename(child(1)->row_desc()); + _build_side_mutable_block = MutableBlock::build_mutable_block(&tmp_build_block); + RETURN_IF_ERROR(_build_side_mutable_block.merge( + *(tmp_build_block.create_same_struct_block(1, false)))); + } + if (in_block->rows() != 0) { SCOPED_TIMER(_build_side_merge_block_timer); RETURN_IF_ERROR(_build_side_mutable_block.merge(*in_block)); - } - - if (UNLIKELY(_build_side_mem_used - _build_side_last_mem_used > BUILD_BLOCK_MAX_SIZE)) { - if (_build_blocks->size() == HASH_JOIN_MAX_BUILD_BLOCK_COUNT) { - return Status::NotSupported(strings::Substitute( - "data size of right table in hash join > $0", - BUILD_BLOCK_MAX_SIZE * HASH_JOIN_MAX_BUILD_BLOCK_COUNT)); + if (_build_side_mutable_block.rows() > JOIN_BUILD_SIZE_LIMIT) { + return Status::NotSupported( + "Hash join do not support build table rows" + " over:" + + std::to_string(JOIN_BUILD_SIZE_LIMIT)); } - _build_blocks->emplace_back(_build_side_mutable_block.to_block()); - - COUNTER_UPDATE(_build_blocks_memory_usage, (*_build_blocks)[_build_block_idx].bytes()); - - // TODO:: Rethink may we should do the process after we receive all build blocks ? - // which is better. 
- RETURN_IF_ERROR(_process_build_block(state, (*_build_blocks)[_build_block_idx], - _build_block_idx)); - - _build_side_mutable_block = MutableBlock(); - ++_build_block_idx; - _build_side_last_mem_used = _build_side_mem_used; } } if (_should_build_hash_table && eos) { - if (!_build_side_mutable_block.empty()) { - if (_build_blocks->size() == HASH_JOIN_MAX_BUILD_BLOCK_COUNT) { - return Status::NotSupported(strings::Substitute( - "data size of right table in hash join > $0", - BUILD_BLOCK_MAX_SIZE * HASH_JOIN_MAX_BUILD_BLOCK_COUNT)); - } - _build_blocks->emplace_back(_build_side_mutable_block.to_block()); - COUNTER_UPDATE(_build_blocks_memory_usage, (*_build_blocks)[_build_block_idx].bytes()); - RETURN_IF_ERROR(_process_build_block(state, (*_build_blocks)[_build_block_idx], - _build_block_idx)); - } + DCHECK(!_build_side_mutable_block.empty()); + _build_block = std::make_shared(_build_side_mutable_block.to_block()); + COUNTER_UPDATE(_build_blocks_memory_usage, _build_block->bytes()); + RETURN_IF_ERROR(_process_build_block(state, *_build_block)); auto ret = std::visit(Overload {[&](std::monostate&) -> Status { LOG(FATAL) << "FATAL: uninited hash table"; __builtin_unreachable(); @@ -791,7 +766,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc _shared_hash_table_context->status = Status::OK(); // arena will be shared with other instances. 
_shared_hash_table_context->arena = _arena; - _shared_hash_table_context->blocks = _build_blocks; + _shared_hash_table_context->block = _build_block; _shared_hash_table_context->hash_table_variants = _hash_table_variants; _shared_hash_table_context->short_circuit_for_null_in_probe_side = _has_null_in_build_side; @@ -823,7 +798,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc *_hash_table_variants, *std::static_pointer_cast( _shared_hash_table_context->hash_table_variants)); - _build_blocks = _shared_hash_table_context->blocks; + _build_block = _shared_hash_table_context->block; if (!_shared_hash_table_context->runtime_filters.empty()) { auto ret = std::visit( @@ -839,7 +814,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc _build_expr_ctxs, _runtime_filter_descs); RETURN_IF_ERROR(_runtime_filter_slots->init( - state, arg.hash_table->size(), 0)); + state, arg.hash_table->size())); RETURN_IF_ERROR(_runtime_filter_slots->copy_from_shared_context( _shared_hash_table_context)); RETURN_IF_ERROR(_runtime_filter_slots->publish()); @@ -856,7 +831,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc // Since the comparison of null values is meaningless, null aware left anti join should not output null // when the build side is not empty. 
- if (!_build_blocks->empty() && _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + if (_build_block && _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { _probe_ignore_null = true; } _init_short_circuit_for_probe(); @@ -955,7 +930,7 @@ void HashJoinNode::_set_build_ignore_flag(Block& block, const std::vector& } } -Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block, uint8_t offset) { +Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block) { SCOPED_TIMER(_build_table_timer); size_t rows = block.rows(); if (UNLIKELY(rows == 0)) { @@ -970,6 +945,14 @@ Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block, uin RETURN_IF_ERROR(_do_evaluate(block, _build_expr_ctxs, *_build_expr_call_timer, res_col_ids)); if (_join_op == TJoinOp::LEFT_OUTER_JOIN || _join_op == TJoinOp::FULL_OUTER_JOIN) { _convert_block_to_null(block); + // first row is mocked + for (int i = 0; i < block.columns(); i++) { + auto [column, is_const] = unpack_if_const(block.safe_get_by_position(i).column); + assert_cast(column->assume_mutable().get()) + ->get_null_map_column() + .get_data() + .data()[0] = 1; + } } // TODO: Now we are not sure whether a column is nullable only by ExecNode's `row_desc` // so we have to initialize this flag by the first build block. 
@@ -986,28 +969,30 @@ Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block, uin Status st = _extract_join_column(block, null_map_val, raw_ptrs, res_col_ids); st = std::visit( - Overload { - [&](std::monostate& arg, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - return Status::OK(); - }, - [&](auto&& arg, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { - using HashTableCtxType = std::decay_t; - ProcessHashTableBuild - hash_table_build_process(rows, block, raw_ptrs, this, - state->batch_size(), offset, state); - return hash_table_build_process - .template run( - arg, - has_null_value || short_circuit_for_null_in_build_side - ? &null_map_val->get_data() - : nullptr, - &_has_null_in_build_side); - }}, - *_hash_table_variants, make_bool_variant(_build_side_ignore_null), + Overload {[&](std::monostate& arg, auto join_op, auto has_null_value, + auto short_circuit_for_null_in_build_side) -> Status { + LOG(FATAL) << "FATAL: uninited hash table"; + __builtin_unreachable(); + return Status::OK(); + }, + [&](auto&& arg, auto&& join_op, auto has_null_value, + auto short_circuit_for_null_in_build_side) -> Status { + using HashTableCtxType = std::decay_t; + using JoinOpType = std::decay_t; + + ProcessHashTableBuild + hash_table_build_process(rows, block, raw_ptrs, this, + state->batch_size(), state); + return hash_table_build_process + .template run( + arg, + has_null_value || short_circuit_for_null_in_build_side + ? 
&null_map_val->get_data() + : nullptr, + &_has_null_in_build_side); + }}, + *_hash_table_variants, _join_op_variants, make_bool_variant(_build_side_ignore_null), make_bool_variant(_short_circuit_for_null_in_build_side)); return st; @@ -1076,7 +1061,7 @@ void HashJoinNode::_hash_table_init(RuntimeState* state) { return; } - if (!try_get_hash_map_context_fixed( + if (!try_get_hash_map_context_fixed( *_hash_table_variants, _build_expr_ctxs)) { _hash_table_variants->emplace>(); } @@ -1084,16 +1069,6 @@ void HashJoinNode::_hash_table_init(RuntimeState* state) { _join_op_variants, make_bool_variant(_have_other_join_conjunct)); DCHECK(!std::holds_alternative(*_hash_table_variants)); - - std::visit(Overload {[&](std::monostate& arg) { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - }, - [&](auto&& arg) { - arg.hash_table->set_partitioned_threshold( - state->partitioned_hash_join_rows_threshold()); - }}, - *_hash_table_variants); } void HashJoinNode::_process_hashtable_ctx_variants_init(RuntimeState* state) { diff --git a/be/src/vec/exec/join/vhash_join_node.h b/be/src/vec/exec/join/vhash_join_node.h index 53cb247d0a..7913c49b0c 100644 --- a/be/src/vec/exec/join/vhash_join_node.h +++ b/be/src/vec/exec/join/vhash_join_node.h @@ -83,17 +83,17 @@ struct ProcessRuntimeFilterBuild { parent->_runtime_filter_slots = std::make_shared( parent->_build_expr_ctxs, parent->runtime_filter_descs()); - RETURN_IF_ERROR(parent->_runtime_filter_slots->init( - state, hash_table_ctx.hash_table->size(), parent->_build_rf_cardinality)); + RETURN_IF_ERROR( + parent->_runtime_filter_slots->init(state, hash_table_ctx.hash_table->size())); - if (!parent->_runtime_filter_slots->empty() && !parent->_inserted_rows.empty()) { + if (!parent->_runtime_filter_slots->empty() && !parent->_inserted_blocks.empty()) { { - SCOPED_TIMER(parent->_push_compute_timer); - parent->_runtime_filter_slots->insert(parent->_inserted_rows); + SCOPED_TIMER(parent->_runtime_filter_compute_timer); + 
parent->_runtime_filter_slots->insert(parent->_inserted_blocks); } } { - SCOPED_TIMER(parent->_push_down_timer); + SCOPED_TIMER(parent->_publish_runtime_filter_timer); RETURN_IF_ERROR(parent->_runtime_filter_slots->publish()); } @@ -106,143 +106,56 @@ using ProfileCounter = RuntimeProfile::Counter; template struct ProcessHashTableBuild { ProcessHashTableBuild(int rows, Block& acquired_block, ColumnRawPtrs& build_raw_ptrs, - Parent* parent, int batch_size, uint8_t offset, RuntimeState* state) + Parent* parent, int batch_size, RuntimeState* state) : _rows(rows), - _skip_rows(0), _acquired_block(acquired_block), _build_raw_ptrs(build_raw_ptrs), _parent(parent), _batch_size(batch_size), - _offset(offset), - _state(state), - _build_side_compute_hash_timer(parent->_build_side_compute_hash_timer) {} + _state(state) {} - template + template Status run(HashTableContext& hash_table_ctx, ConstNullMapPtr null_map, bool* has_null_key) { - using KeyGetter = typename HashTableContext::State; - using Mapped = typename HashTableContext::Mapped; - - Defer defer {[&]() { - int64_t bucket_size = hash_table_ctx.hash_table->get_buffer_size_in_cells(); - int64_t filled_bucket_size = hash_table_ctx.hash_table->size(); - int64_t bucket_bytes = hash_table_ctx.hash_table->get_buffer_size_in_bytes(); - COUNTER_SET(_parent->_hash_table_memory_usage, bucket_bytes); - COUNTER_SET(_parent->_build_buckets_counter, bucket_size); - COUNTER_SET(_parent->_build_collisions_counter, - hash_table_ctx.hash_table->get_collisions()); - COUNTER_SET(_parent->_build_buckets_fill_counter, filled_bucket_size); - - auto hash_table_buckets = hash_table_ctx.hash_table->get_buffer_sizes_in_cells(); - std::string hash_table_buckets_info; - for (auto bucket_count : hash_table_buckets) { - hash_table_buckets_info += std::to_string(bucket_count) + ", "; + if (short_circuit_for_null || ignore_null) { + // first row is mocked and is null + for (uint32_t i = 1; i < _rows; i++) { + if ((*null_map)[i]) { + *has_null_key = 
true; + } } - _parent->add_hash_buckets_info(hash_table_buckets_info); - - auto hash_table_sizes = hash_table_ctx.hash_table->sizes(); - hash_table_buckets_info.clear(); - for (auto table_size : hash_table_sizes) { - hash_table_buckets_info += std::to_string(table_size) + ", "; + if (short_circuit_for_null && *has_null_key) { + return Status::OK(); } - _parent->add_hash_buckets_filled_info(hash_table_buckets_info); - }}; + } - KeyGetter key_getter(_build_raw_ptrs); + if (!_parent->runtime_filter_descs().empty()) { + _parent->_inserted_blocks.insert(&_acquired_block); + } SCOPED_TIMER(_parent->_build_table_insert_timer); - hash_table_ctx.hash_table->reset_resize_timer(); - - // only not build_unique, we need expanse hash table before insert data - // 1. There are fewer duplicate keys, reducing the number of resize hash tables - // can improve performance to a certain extent, about 2%-5% - // 2. There are many duplicate keys, and the hash table filled bucket is far less than - // the hash table build bucket, which may waste a lot of memory. - // TODO, use the NDV expansion of the key column in the optimizer statistics - if (!_parent->build_unique()) { - RETURN_IF_CATCH_EXCEPTION(hash_table_ctx.hash_table->expanse_for_add_elem( - std::min(_rows, config::hash_table_pre_expanse_max_rows))); - } - - vector& inserted_rows = _parent->_inserted_rows[&_acquired_block]; - bool has_runtime_filter = !_parent->runtime_filter_descs().empty(); - if (has_runtime_filter) { - inserted_rows.reserve(_batch_size); - } + hash_table_ctx.hash_table->template prepare_build(_rows, _batch_size, + *has_null_key); hash_table_ctx.init_serialized_keys(_build_raw_ptrs, _rows, - null_map ? 
null_map->data() : nullptr); - - auto& arena = *_parent->arena(); - auto old_build_arena_memory = arena.size(); - - size_t k = 0; - bool inserted = false; - auto creator = [&](const auto& ctor, auto& key, auto& origin) { - HashTableContext::try_presis_key(key, origin, arena); - inserted = true; - ctor(key, Mapped {k, _offset}); - }; - - bool build_unique = _parent->build_unique(); -#define EMPLACE_IMPL(stmt) \ - for (; k < _rows; ++k) { \ - if (k % CHECK_FRECUENCY == 0) { \ - RETURN_IF_CANCELLED(_state); \ - } \ - if constexpr (short_circuit_for_null) { \ - if ((*null_map)[k]) { \ - *has_null_key = true; \ - return Status::OK(); \ - } \ - } else if constexpr (ignore_null) { \ - if ((*null_map)[k]) { \ - *has_null_key = true; \ - continue; \ - } \ - } \ - inserted = false; \ - [[maybe_unused]] auto& mapped = \ - hash_table_ctx.lazy_emplace(key_getter, k, creator, nullptr); \ - stmt; \ - } - - if (has_runtime_filter && build_unique) { - EMPLACE_IMPL( - if (inserted) { inserted_rows.push_back(k); } else { _skip_rows++; }); - } else if (has_runtime_filter && !build_unique) { - EMPLACE_IMPL( - if (inserted) { inserted_rows.push_back(k); } else { - mapped.insert({k, _offset}, *_parent->arena()); - inserted_rows.push_back(k); - }); - } else if (!has_runtime_filter && build_unique) { - EMPLACE_IMPL(if (!inserted) { _skip_rows++; }); - } else { - EMPLACE_IMPL(if (!inserted) { mapped.insert({k, _offset}, *_parent->arena()); }); - } - _parent->_build_rf_cardinality += inserted_rows.size(); - - _parent->_build_arena_memory_usage->add(arena.size() - old_build_arena_memory); - - COUNTER_UPDATE(_parent->_build_table_expanse_timer, - hash_table_ctx.hash_table->get_resize_timer_value()); - COUNTER_UPDATE(_parent->_build_table_convert_timer, - hash_table_ctx.hash_table->get_convert_timer_value()); + null_map ? 
null_map->data() : nullptr, true, true, + hash_table_ctx.hash_table->get_bucket_size()); + hash_table_ctx.hash_table->build(hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), + _rows); + hash_table_ctx.bucket_nums.resize(_batch_size); + hash_table_ctx.bucket_nums.shrink_to_fit(); + COUNTER_UPDATE(_parent->_hash_table_memory_usage, + hash_table_ctx.hash_table->get_byte_size()); return Status::OK(); } private: - const int _rows; - int _skip_rows; + const uint32_t _rows; Block& _acquired_block; ColumnRawPtrs& _build_raw_ptrs; Parent* _parent = nullptr; int _batch_size; - uint8_t _offset; RuntimeState* _state = nullptr; - - ProfileCounter* _build_side_compute_hash_timer = nullptr; }; template @@ -325,8 +238,6 @@ using HashTableIteratorVariants = std::variant, ForwardIterator, ForwardIterator>; -static constexpr auto HASH_JOIN_MAX_BUILD_BLOCK_COUNT = 128; - class HashJoinNode final : public VJoinNodeBase { public: HashJoinNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); @@ -369,7 +280,7 @@ public: bool have_other_join_conjunct() const { return _have_other_join_conjunct; } bool is_right_semi_anti() const { return _is_right_semi_anti; } bool is_outer_join() const { return _is_outer_join; } - std::shared_ptr> build_blocks() const { return _build_blocks; } + const std::shared_ptr& build_block() const { return _build_block; } std::vector* left_output_slot_flags() { return &_left_output_slot_flags; } std::vector* right_output_slot_flags() { return &_right_output_slot_flags; } bool* has_null_in_build_side() { return &_has_null_in_build_side; } @@ -387,19 +298,20 @@ private: friend struct ProcessHashTableProbe; void _init_short_circuit_for_probe() { + bool empty_block = !_build_block; _short_circuit_for_probe = (_has_null_in_build_side && _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && !_is_mark_join) || - (_build_blocks->empty() && _join_op == TJoinOp::INNER_JOIN && !_is_mark_join) || - (_build_blocks->empty() && _join_op == 
TJoinOp::LEFT_SEMI_JOIN && !_is_mark_join) || - (_build_blocks->empty() && _join_op == TJoinOp::RIGHT_OUTER_JOIN) || - (_build_blocks->empty() && _join_op == TJoinOp::RIGHT_SEMI_JOIN) || - (_build_blocks->empty() && _join_op == TJoinOp::RIGHT_ANTI_JOIN); + (empty_block && _join_op == TJoinOp::INNER_JOIN && !_is_mark_join) || + (empty_block && _join_op == TJoinOp::LEFT_SEMI_JOIN && !_is_mark_join) || + (empty_block && _join_op == TJoinOp::RIGHT_OUTER_JOIN) || + (empty_block && _join_op == TJoinOp::RIGHT_SEMI_JOIN) || + (empty_block && _join_op == TJoinOp::RIGHT_ANTI_JOIN); //when build table rows is 0 and not have other_join_conjunct and not _is_mark_join and join type is one of LEFT_OUTER_JOIN/FULL_OUTER_JOIN/LEFT_ANTI_JOIN //we could get the result is probe table + null-column(if need output) _empty_right_table_need_probe_dispose = - (_build_blocks->empty() && !_have_other_join_conjunct && !_is_mark_join) && + (empty_block && !_have_other_join_conjunct && !_is_mark_join) && (_join_op == TJoinOp::LEFT_OUTER_JOIN || _join_op == TJoinOp::FULL_OUTER_JOIN || _join_op == TJoinOp::LEFT_ANTI_JOIN); } @@ -430,21 +342,14 @@ private: RuntimeProfile::Counter* _build_table_timer = nullptr; RuntimeProfile::Counter* _build_expr_call_timer = nullptr; RuntimeProfile::Counter* _build_table_insert_timer = nullptr; - RuntimeProfile::Counter* _build_table_expanse_timer = nullptr; - RuntimeProfile::Counter* _build_table_convert_timer = nullptr; RuntimeProfile::Counter* _probe_expr_call_timer = nullptr; RuntimeProfile::Counter* _probe_next_timer = nullptr; - RuntimeProfile::Counter* _build_buckets_counter = nullptr; - RuntimeProfile::Counter* _build_buckets_fill_counter = nullptr; RuntimeProfile::Counter* _search_hashtable_timer = nullptr; RuntimeProfile::Counter* _build_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_process_hashtable_timer = nullptr; RuntimeProfile::Counter* _build_side_compute_hash_timer = 
nullptr; RuntimeProfile::Counter* _build_side_merge_block_timer = nullptr; - RuntimeProfile::Counter* _build_runtime_filter_timer = nullptr; - - RuntimeProfile::Counter* _build_collisions_counter = nullptr; RuntimeProfile::Counter* _open_timer = nullptr; RuntimeProfile::Counter* _allocate_resource_timer = nullptr; @@ -467,7 +372,7 @@ private: HashTableIteratorVariants _outer_join_pull_visited_iter; HashTableIteratorVariants _probe_row_match_iter; - std::shared_ptr> _build_blocks; + std::shared_ptr _build_block; Block _probe_block; ColumnRawPtrs _probe_columns; ColumnUInt8::MutablePtr _null_map_column; @@ -476,8 +381,10 @@ private: bool _has_set_need_null_map_for_build = false; bool _probe_ignore_null = false; int _probe_index = -1; + uint32_t _build_index = 0; bool _ready_probe = false; bool _probe_eos = false; + int _last_probe_match; bool _build_side_ignore_null = false; @@ -490,9 +397,6 @@ private: std::vector _left_output_slot_flags; std::vector _right_output_slot_flags; - // for cases when a probe row matches more than batch size build rows. 
- bool _is_any_probe_match_row_output = false; - uint8_t _build_block_idx = 0; int64_t _build_side_mem_used = 0; int64_t _build_side_last_mem_used = 0; MutableBlock _build_side_mutable_block; @@ -501,7 +405,7 @@ private: Status _materialize_build_side(RuntimeState* state) override; - Status _process_build_block(RuntimeState* state, Block& block, uint8_t offset); + Status _process_build_block(RuntimeState* state, Block& block); Status _do_evaluate(Block& block, VExprContextSPtrs& exprs, RuntimeProfile::Counter& expr_call_timer, std::vector& res_col_ids); @@ -539,10 +443,9 @@ private: friend struct ProcessRuntimeFilterBuild; std::vector _runtime_filter_descs; - std::unordered_map> _inserted_rows; + std::unordered_set _inserted_blocks; std::vector _runtime_filters; - size_t _build_rf_cardinality = 0; std::atomic_bool _probe_open_finish = false; }; } // namespace vectorized diff --git a/be/src/vec/exec/join/vjoin_node_base.cpp b/be/src/vec/exec/join/vjoin_node_base.cpp index 2401993563..0077fe2a7b 100644 --- a/be/src/vec/exec/join/vjoin_node_base.cpp +++ b/be/src/vec/exec/join/vjoin_node_base.cpp @@ -120,8 +120,8 @@ Status VJoinNodeBase::prepare(RuntimeState* state) { ADD_CHILD_TIMER(_probe_phase_profile, "BuildOutputBlock", "ProbeTime"); _probe_rows_counter = ADD_COUNTER_WITH_LEVEL(_probe_phase_profile, "ProbeRows", TUnit::UNIT, 1); - _push_down_timer = ADD_TIMER(runtime_profile(), "PublishRuntimeFilterTime"); - _push_compute_timer = ADD_TIMER(runtime_profile(), "PushDownComputeTime"); + _publish_runtime_filter_timer = ADD_TIMER(runtime_profile(), "PublishRuntimeFilterTime"); + _runtime_filter_compute_timer = ADD_TIMER(runtime_profile(), "RuntimeFilterComputeTime"); return Status::OK(); } diff --git a/be/src/vec/exec/join/vjoin_node_base.h b/be/src/vec/exec/join/vjoin_node_base.h index 8d26db22ea..a44bc5513a 100644 --- a/be/src/vec/exec/join/vjoin_node_base.h +++ b/be/src/vec/exec/join/vjoin_node_base.h @@ -142,8 +142,8 @@ protected: RuntimeProfile* 
_probe_phase_profile = nullptr; RuntimeProfile::Counter* _probe_timer = nullptr; RuntimeProfile::Counter* _probe_rows_counter = nullptr; - RuntimeProfile::Counter* _push_down_timer = nullptr; - RuntimeProfile::Counter* _push_compute_timer = nullptr; + RuntimeProfile::Counter* _publish_runtime_filter_timer = nullptr; + RuntimeProfile::Counter* _runtime_filter_compute_timer = nullptr; RuntimeProfile::Counter* _join_filter_timer = nullptr; RuntimeProfile::Counter* _build_output_block_timer = nullptr; }; diff --git a/be/src/vec/exec/join/vnested_loop_join_node.cpp b/be/src/vec/exec/join/vnested_loop_join_node.cpp index 04d0b6a2c3..a5305a4b53 100644 --- a/be/src/vec/exec/join/vnested_loop_join_node.cpp +++ b/be/src/vec/exec/join/vnested_loop_join_node.cpp @@ -74,13 +74,13 @@ Status RuntimeFilterBuild::operator()(RuntimeState* state) { RETURN_IF_ERROR(runtime_filter_slots.init(state)); if (!runtime_filter_slots.empty() && !_parent->build_blocks().empty()) { - SCOPED_TIMER(_parent->push_compute_timer()); + SCOPED_TIMER(_parent->runtime_filter_compute_timer()); for (auto& build_block : _parent->build_blocks()) { RETURN_IF_ERROR(runtime_filter_slots.insert(&build_block)); } } { - SCOPED_TIMER(_parent->push_down_timer()); + SCOPED_TIMER(_parent->publish_runtime_filter_timer()); RETURN_IF_ERROR(runtime_filter_slots.publish()); } diff --git a/be/src/vec/exec/join/vnested_loop_join_node.h b/be/src/vec/exec/join/vnested_loop_join_node.h index 7dd63b498f..b309485db5 100644 --- a/be/src/vec/exec/join/vnested_loop_join_node.h +++ b/be/src/vec/exec/join/vnested_loop_join_node.h @@ -99,9 +99,13 @@ public: std::vector& runtime_filter_descs() { return _runtime_filter_descs; } VExprContextSPtrs& filter_src_expr_ctxs() { return _filter_src_expr_ctxs; } - RuntimeProfile::Counter* push_compute_timer() { return _push_compute_timer; } + RuntimeProfile::Counter* runtime_filter_compute_timer() { + return _runtime_filter_compute_timer; + } Blocks& build_blocks() { return _build_blocks; } - 
RuntimeProfile::Counter* push_down_timer() { return _push_down_timer; } + RuntimeProfile::Counter* publish_runtime_filter_timer() { + return _publish_runtime_filter_timer; + } private: template diff --git a/be/src/vec/exec/vset_operation_node.cpp b/be/src/vec/exec/vset_operation_node.cpp index 7cc025b607..28dfd23ec7 100644 --- a/be/src/vec/exec/vset_operation_node.cpp +++ b/be/src/vec/exec/vset_operation_node.cpp @@ -58,7 +58,6 @@ VSetOperationNode::VSetOperationNode(ObjectPool* pool, const TPlan : ExecNode(pool, tnode, descs), _valid_element_in_hash_tbl(0), _mem_used(0), - _build_block_index(0), _build_finished(false) { _hash_table_variants = std::make_unique(); } @@ -219,7 +218,7 @@ void VSetOperationNode::hash_table_init() { } return; } - if (!try_get_hash_map_context_fixed( + if (!try_get_hash_map_context_fixed( *_hash_table_variants, _child_expr_lists[0])) { _hash_table_variants->emplace>(); } @@ -228,36 +227,46 @@ void VSetOperationNode::hash_table_init() { template Status VSetOperationNode::sink(RuntimeState* state, Block* block, bool eos) { SCOPED_TIMER(_exec_timer); - constexpr static auto BUILD_BLOCK_MAX_SIZE = 4 * 1024UL * 1024UL * 1024UL; if (block->rows() != 0) { _mem_used += block->allocated_bytes(); RETURN_IF_ERROR(_mutable_block.merge(*block)); } - if (eos || _mutable_block.allocated_bytes() >= BUILD_BLOCK_MAX_SIZE) { - _build_blocks.emplace_back(_mutable_block.to_block()); - RETURN_IF_ERROR( - process_build_block(_build_blocks[_build_block_index], _build_block_index, state)); - _mutable_block.clear(); - ++_build_block_index; - - if (eos) { - if constexpr (is_intersect) { - _valid_element_in_hash_tbl = 0; - } else { - std::visit( - [&](auto&& arg) { - using HashTableCtxType = std::decay_t; - if constexpr (!std::is_same_v) { - _valid_element_in_hash_tbl = arg.hash_table->size(); - } - }, - *_hash_table_variants); - } - _build_finished = true; - _can_read = _children.size() == 1; + if (block->rows() != 0) { + if (_build_block.empty()) { + 
RETURN_IF_ERROR(_mutable_block.merge(*(block->create_same_struct_block(0, false)))); } + RETURN_IF_ERROR(_mutable_block.merge(*block)); + if (_mutable_block.rows() > std::numeric_limits::max()) { + return Status::NotSupported( + "Hash join does not support build table rows" + " over:" + + std::to_string(std::numeric_limits::max())); + } + } + + if (eos) { + if (!_mutable_block.empty()) { + _build_block = _mutable_block.to_block(); + } + RETURN_IF_ERROR(process_build_block(_build_block, state)); + _mutable_block.clear(); + + if constexpr (is_intersect) { + _valid_element_in_hash_tbl = 0; + } else { + std::visit( + [&](auto&& arg) { + using HashTableCtxType = std::decay_t; + if constexpr (!std::is_same_v) { + _valid_element_in_hash_tbl = arg.hash_table->size(); + } + }, + *_hash_table_variants); + } + _build_finished = true; + _can_read = _children.size() == 1; } return Status::OK(); } @@ -310,8 +319,7 @@ Status VSetOperationNode::hash_table_build(RuntimeState* state) { } template -Status VSetOperationNode::process_build_block(Block& block, uint8_t offset, - RuntimeState* state) { +Status VSetOperationNode::process_build_block(Block& block, RuntimeState* state) { size_t rows = block.rows(); if (rows == 0) { return Status::OK(); } @@ -326,7 +334,7 @@ Status VSetOperationNode::process_build_block(Block& block, uint8_ using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { HashTableBuild hash_table_build_process( - this, rows, raw_ptrs, offset, state); + this, rows, raw_ptrs, state); st = hash_table_build_process(arg, _arena); } else { LOG(FATAL) << "FATAL: uninited hash table"; @@ -342,8 +350,8 @@ void VSetOperationNode::add_result_columns(RowRefListWithFlags& va int& block_size) { auto it = value.begin(); for (auto idx = _build_col_idx.begin(); idx != _build_col_idx.end(); ++idx) { - auto& column = *_build_blocks[it->block_offset].get_by_position(idx->first).column; - if (_mutable_cols[idx->second]->is_nullable() xor column.is_nullable()) { + const auto& 
column = *_build_block.get_by_position(idx->first).column; + if (_mutable_cols[idx->second]->is_nullable() ^ column.is_nullable()) { DCHECK(_mutable_cols[idx->second]->is_nullable()); ((ColumnNullable*)(_mutable_cols[idx->second].get())) ->insert_from_not_nullable(column, it->row_num); @@ -512,10 +520,6 @@ void VSetOperationNode::debug_string(int indentation_level, template void VSetOperationNode::release_mem() { _hash_table_variants = nullptr; - - std::vector tmp_build_blocks; - _build_blocks.swap(tmp_build_blocks); - _probe_block.clear(); } diff --git a/be/src/vec/exec/vset_operation_node.h b/be/src/vec/exec/vset_operation_node.h index 4e68965a04..070ad381f4 100644 --- a/be/src/vec/exec/vset_operation_node.h +++ b/be/src/vec/exec/vset_operation_node.h @@ -82,7 +82,7 @@ private: //It's time to abstract out the same methods and provide them directly to others; void hash_table_init(); Status hash_table_build(RuntimeState* state); - Status process_build_block(Block& block, uint8_t offset, RuntimeState* state); + Status process_build_block(Block& block, RuntimeState* state); Status extract_build_column(Block& block, ColumnRawPtrs& raw_ptrs); Status extract_probe_column(Block& block, ColumnRawPtrs& raw_ptrs, int child_id); void refresh_hash_table(); @@ -115,11 +115,10 @@ private: //record insert column id during probe std::vector _probe_column_inserted_id; - std::vector _build_blocks; + Block _build_block; Block _probe_block; ColumnRawPtrs _probe_columns; std::vector _mutable_cols; - int _build_block_index; bool _build_finished; std::vector _probe_finished_children_index; MutableBlock _mutable_block; diff --git a/be/src/vec/exprs/vbloom_predicate.cpp b/be/src/vec/exprs/vbloom_predicate.cpp index 06bd21a6eb..176ecb219c 100644 --- a/be/src/vec/exprs/vbloom_predicate.cpp +++ b/be/src/vec/exprs/vbloom_predicate.cpp @@ -88,41 +88,16 @@ Status VBloomPredicate::execute(VExprContext* context, Block* block, int* result 
block->get_by_position(arguments[0]).column->convert_to_full_column_if_const(); size_t sz = argument_column->size(); res_data_column->resize(sz); - auto ptr = ((ColumnVector*)res_data_column.get())->get_data().data(); + auto* ptr = ((ColumnVector*)res_data_column.get())->get_data().data(); auto type = WhichDataType(remove_nullable(block->get_by_position(arguments[0]).type)); if (type.is_string_or_fixed_string()) { - // When _be_exec_version is equal to or greater than 2, we use the new hash method. - // This is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - if (_be_exec_version >= 2) { - for (size_t i = 0; i < sz; i++) { - /// TODO: remove virtual function call in get_data_at to improve performance - auto ele = argument_column->get_data_at(i); - const StringRef v(ele.data, ele.size); - ptr[i] = _filter->find_crc32_hash(reinterpret_cast(&v)); - } - } else { - for (size_t i = 0; i < sz; i++) { - auto ele = argument_column->get_data_at(i); - const StringRef v(ele.data, ele.size); - ptr[i] = _filter->find(reinterpret_cast(&v)); - } - } - } else if (_be_exec_version > 0 && (type.is_int_or_uint() || type.is_float())) { - if (argument_column->is_nullable()) { - auto column_nested = reinterpret_cast(argument_column.get()) - ->get_nested_column_ptr(); - auto column_nullmap = reinterpret_cast(argument_column.get()) - ->get_null_map_column_ptr(); - _filter->find_fixed_len(column_nested->get_raw_data().data, - (uint8*)column_nullmap->get_raw_data().data, sz, ptr); - } else { - _filter->find_fixed_len(argument_column->get_raw_data().data, nullptr, sz, ptr); + for (size_t i = 0; i < sz; i++) { + auto ele = argument_column->get_data_at(i); + const StringRef v(ele.data, ele.size); + ptr[i] = _filter->find(reinterpret_cast(&v)); } } else { - for (size_t i = 0; i < sz; i++) { - ptr[i] = _filter->find( - reinterpret_cast(argument_column->get_data_at(i).data)); - } + _filter->find_fixed_len(argument_column, ptr); } if 
(_data_type->is_nullable()) { diff --git a/be/src/vec/runtime/shared_hash_table_controller.h b/be/src/vec/runtime/shared_hash_table_controller.h index 6b31cf07ec..e1c0170904 100644 --- a/be/src/vec/runtime/shared_hash_table_controller.h +++ b/be/src/vec/runtime/shared_hash_table_controller.h @@ -53,18 +53,15 @@ struct SharedRuntimeFilterContext { struct SharedHashTableContext { SharedHashTableContext() - : hash_table_variants(nullptr), - blocks(new std::vector()), - signaled(false), - short_circuit_for_null_in_probe_side(false) {} + : hash_table_variants(nullptr), block(std::make_shared()) {} Status status; std::shared_ptr arena; std::shared_ptr hash_table_variants; - std::shared_ptr> blocks; + std::shared_ptr block; std::map runtime_filters; - bool signaled; - bool short_circuit_for_null_in_probe_side; + bool signaled {}; + bool short_circuit_for_null_in_probe_side {}; }; using SharedHashTableContextPtr = std::shared_ptr; diff --git a/be/test/exprs/bloom_filter_predicate_test.cpp b/be/test/exprs/bloom_filter_predicate_test.cpp index 4f4ecd7c87..8c33ed13a6 100644 --- a/be/test/exprs/bloom_filter_predicate_test.cpp +++ b/be/test/exprs/bloom_filter_predicate_test.cpp @@ -53,9 +53,6 @@ TEST_F(BloomFilterPredicateTest, bloom_filter_func_int_test) { // test not exist val int not_exist_val = 0x3355ff; EXPECT_FALSE(func->find((const void*)¬_exist_val)); - // TEST null value - func->insert(nullptr); - func->find(nullptr); } TEST_F(BloomFilterPredicateTest, bloom_filter_func_stringval_test) { diff --git a/regression-test/data/query_p0/join/mark_join/mark_join.out b/regression-test/data/query_p0/join/mark_join/mark_join.out new file mode 100644 index 0000000000..ed3575d0e1 --- /dev/null +++ b/regression-test/data/query_p0/join/mark_join/mark_join.out @@ -0,0 +1,19 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !test -- +1 1 1 a +2 2 2 b +3 -3 \N c +3 3 \N c + +-- !test -- +1 1 1 a +2 2 2 b +3 -3 \N c +3 3 \N c + +-- !test -- +1 1 1 a +2 2 2 b +3 -3 \N c +3 3 \N c + diff --git a/regression-test/suites/nereids_syntax_p0/sub_query_alias.groovy b/regression-test/suites/nereids_syntax_p0/sub_query_alias.groovy index 5a5987ec43..ada61beccb 100644 --- a/regression-test/suites/nereids_syntax_p0/sub_query_alias.groovy +++ b/regression-test/suites/nereids_syntax_p0/sub_query_alias.groovy @@ -77,7 +77,7 @@ suite("sub_query_alias") { select * from lineorder l ) t on c.c_custkey = t.lo_custkey - order by c.c_custkey + order by c.c_custkey,lo_tax """ } diff --git a/regression-test/suites/nereids_syntax_p0/sub_query_correlated.groovy b/regression-test/suites/nereids_syntax_p0/sub_query_correlated.groovy index 0e98510e96..ac87bbc813 100644 --- a/regression-test/suites/nereids_syntax_p0/sub_query_correlated.groovy +++ b/regression-test/suites/nereids_syntax_p0/sub_query_correlated.groovy @@ -428,27 +428,27 @@ suite ("sub_query_correlated") { """ order_qt_hash_join_with_other_conjuncts1 """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 > sub_query_correlated_subquery3.k3) OR k1 < 10 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 > sub_query_correlated_subquery3.k3) OR k1 < 10 ORDER BY k1,k2; """ order_qt_hash_join_with_other_conjuncts2 """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 < sub_query_correlated_subquery3.k3) OR k1 < 10 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 < sub_query_correlated_subquery3.k3) 
OR k1 < 10 ORDER BY k1,k2; """ order_qt_hash_join_with_other_conjuncts3 """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 > sub_query_correlated_subquery3.k3) OR k1 < 11 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 > sub_query_correlated_subquery3.k3) OR k1 < 11 ORDER BY k1,k2; """ order_qt_hash_join_with_other_conjuncts4 """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 < sub_query_correlated_subquery3.k3) OR k1 < 11 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 < sub_query_correlated_subquery3.k3) OR k1 < 11 ORDER BY k1,k2; """ order_qt_same_subquery_in_conjuncts """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 < 10 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 < 10 ORDER BY k1,k2; """ order_qt_two_subquery_in_one_conjuncts """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 IN (SELECT k3 FROM sub_query_correlated_subquery3) OR k1 < 10 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 IN (SELECT k3 FROM sub_query_correlated_subquery3) OR k1 < 10 ORDER BY k1,k2; """ order_qt_multi_subquery_in_and_scalry """ diff --git a/regression-test/suites/nereids_syntax_p0/view.groovy b/regression-test/suites/nereids_syntax_p0/view.groovy index 
c694c37bbe..48e0ca3752 100644 --- a/regression-test/suites/nereids_syntax_p0/view.groovy +++ b/regression-test/suites/nereids_syntax_p0/view.groovy @@ -63,7 +63,7 @@ suite("view") { qt_select_3 """ select * from v3 - order by v3.c_custkey, v3.lo_orderkey + order by v3.c_custkey, v3.lo_orderkey,lo_tax """ qt_select_4 """ @@ -83,7 +83,7 @@ suite("view") { from v2 ) t on l.lo_custkey = t.lo_custkey - order by l.lo_custkey, t.lo_custkey, l.lo_linenumber, l.lo_tax + order by l.lo_custkey, t.lo_custkey, l.lo_linenumber, t.lo_linenumber, t.lo_shipmode,t.lo_tax """ qt_select_6 """ diff --git a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy new file mode 100644 index 0000000000..9759a0e9b4 --- /dev/null +++ b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("mark_join") { + sql "drop table if exists t1;" + sql "drop table if exists t2;" + sql """ + create table t1 ( + k1 int null, + k2 int null, + k3 bigint null, + k4 varchar(100) null + ) + duplicate key (k1,k2,k3) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1"); + """ + + sql """ + create table t2 ( + k1 int null, + k2 int null, + k3 bigint null, + k4 varchar(100) null + ) + duplicate key (k1,k2,k3) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1"); + """ + + sql "insert into t1 select 1,1,1,'a';" + sql "insert into t1 select 2,2,2,'b';" + sql "insert into t1 select 3,-3,null,'c';" + sql "insert into t1 select 3,3,null,'c';" + + sql "insert into t2 select 1,1,1,'a';" + sql "insert into t2 select 2,2,2,'b';" + sql "insert into t2 select 3,-3,null,'c';" + sql "insert into t2 select 3,3,null,'c';" + + qt_test """ + select * from t1 where exists (select t2.k3 from t2 where t1.k2 = t2.k2) or k1 < 10 order by k1, k2; + """ + qt_test """ + select * from t1 where not exists (select t2.k3 from t2 where t1.k2 = t2.k2) or k1 < 10 order by k1, k2; + """ + qt_test """ + select * from t1 where t1.k1 not in (select t2.k3 from t2 where t2.k2 = t1.k2) or k1 < 10 order by k1, k2; + """ +}