From d0eb4d7da359116e6e582648c1409e84479cdc3b Mon Sep 17 00:00:00 2001 From: amory Date: Tue, 11 Jul 2023 14:40:40 +0800 Subject: [PATCH] [Improve](hash-fun)improve nested hash with range #21699 Issue Number: close #xxx 1. When calculating an array hash, the element count no longer needs to be hashed into the seed, i.e. the call hash = HashUtil::zlib_crc_hash(reinterpret_cast(&elem_size), sizeof(elem_size), hash); is not needed. However, we must take care to distinguish [[], [1]] from [[1], []]: when an array nests another array and the nested array is empty, the element count is still hashed into the seed so that the two values produce different hashes. 2. Hash a whole row range into one hash value to avoid a virtual function call per element inside the loop, which doubles the performance. This was measured in a unit test on an array[int64] column with 50 rows, where each array has 10w (i.e. 100,000) elements. --- be/src/vec/columns/column.h | 8 +- be/src/vec/columns/column_array.cpp | 77 +++++++++++++----- be/src/vec/columns/column_array.h | 6 +- be/src/vec/columns/column_const.h | 8 +- be/src/vec/columns/column_decimal.cpp | 50 +++++++++--- be/src/vec/columns/column_decimal.h | 14 +++- be/src/vec/columns/column_map.cpp | 80 ++++++++++++++----- be/src/vec/columns/column_map.h | 6 +- be/src/vec/columns/column_nullable.cpp | 34 +++++--- be/src/vec/columns/column_nullable.h | 6 +- be/src/vec/columns/column_string.h | 42 ++++++++-- be/src/vec/columns/column_struct.cpp | 10 ++- be/src/vec/columns/column_struct.h | 6 +- be/src/vec/columns/column_vector.h | 42 ++++++++-- be/test/vec/columns/column_hash_func_test.cpp | 79 ++++++++++++++++++ 15 files changed, 371 insertions(+), 97 deletions(-) diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index b291ba2443..0ff14a1653 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -386,7 +386,9 @@ public: LOG(FATAL) << get_name() << " update_hashes_with_value xxhash not supported"; } - virtual void update_xxHash_with_value(size_t n, uint64_t& hash) const { + // use range for one hash value to avoid virtual function call in loop + virtual void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + 
const uint8_t* __restrict null_data) const { LOG(FATAL) << get_name() << " update_hash_with_value xxhash not supported"; } @@ -398,7 +400,9 @@ public: LOG(FATAL) << get_name() << "update_crcs_with_value not supported"; } - virtual void update_crc_with_value(size_t n, uint64_t& hash) const { + // use range for one hash value to avoid virtual function call in loop + virtual void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { LOG(FATAL) << get_name() << " update_crc_with_value not supported"; } diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index c5e35e5bb9..b575f7bf15 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -277,25 +277,64 @@ void ColumnArray::update_hashes_with_value(std::vector& hashes, } // for every array row calculate xxHash -void ColumnArray::update_xxHash_with_value(size_t n, uint64_t& hash) const { - size_t elem_size = size_at(n); - size_t offset = offset_at(n); - hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&elem_size), sizeof(elem_size), - hash); - for (auto i = 0; i < elem_size; ++i) { - get_data().update_xxHash_with_value(offset + i, hash); +void ColumnArray::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + auto& offsets_column = get_offsets(); + if (null_data) { + for (size_t i = start; i < end; ++i) { + if (null_data[i] == 0) { + size_t elem_size = offsets_column[i] - offsets_column[i - 1]; + if (elem_size == 0) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&elem_size), + sizeof(elem_size), hash); + } else { + get_data().update_crc_with_value(offsets_column[i - 1], offsets_column[i], hash, + nullptr); + } + } + } + } else { + for (size_t i = start; i < end; ++i) { + size_t elem_size = offsets_column[i] - offsets_column[i - 1]; + if (elem_size == 0) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&elem_size), + 
sizeof(elem_size), hash); + } else { + get_data().update_crc_with_value(offsets_column[i - 1], offsets_column[i], hash, + nullptr); + } + } } } // for every array row calculate crcHash -void ColumnArray::update_crc_with_value(size_t n, uint64_t& crc) const { - size_t elem_size = size_at(n); - size_t offset = offset_at(n); - - crc = HashUtil::zlib_crc_hash(reinterpret_cast(&elem_size), sizeof(elem_size), - crc); - for (auto i = 0; i < elem_size; ++i) { - get_data().update_crc_with_value(offset + i, crc); +void ColumnArray::update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + auto& offsets_column = get_offsets(); + if (null_data) { + for (size_t i = start; i < end; ++i) { + if (null_data[i] == 0) { + size_t elem_size = offsets_column[i] - offsets_column[i - 1]; + if (elem_size == 0) { + hash = HashUtil::zlib_crc_hash(reinterpret_cast(&elem_size), + sizeof(elem_size), hash); + } else { + get_data().update_crc_with_value(offsets_column[i - 1], offsets_column[i], hash, + nullptr); + } + } + } + } else { + for (size_t i = start; i < end; ++i) { + size_t elem_size = offsets_column[i] - offsets_column[i - 1]; + if (elem_size == 0) { + hash = HashUtil::zlib_crc_hash(reinterpret_cast(&elem_size), + sizeof(elem_size), hash); + } else { + get_data().update_crc_with_value(offsets_column[i - 1], offsets_column[i], hash, + nullptr); + } + } } } @@ -305,12 +344,12 @@ void ColumnArray::update_hashes_with_value(uint64_t* __restrict hashes, if (null_data) { for (size_t i = 0; i < s; ++i) { if (null_data[i] == 0) { - update_xxHash_with_value(i, hashes[i]); + update_xxHash_with_value(i, i + 1, hashes[i], nullptr); } } } else { for (size_t i = 0; i < s; ++i) { - update_xxHash_with_value(i, hashes[i]); + update_xxHash_with_value(i, i + 1, hashes[i], nullptr); } } } @@ -324,12 +363,12 @@ void ColumnArray::update_crcs_with_value(std::vector& hash, PrimitiveT for (size_t i = 0; i < s; ++i) { // every row if (null_data[i] == 0) { - 
update_crc_with_value(i, hash[i]); + update_crc_with_value(i, i + 1, hash[i], nullptr); } } } else { for (size_t i = 0; i < s; ++i) { - update_crc_with_value(i, hash[i]); + update_crc_with_value(i, i + 1, hash[i], nullptr); } } } diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h index 2e1c96a2c5..4fe1827e17 100644 --- a/be/src/vec/columns/column_array.h +++ b/be/src/vec/columns/column_array.h @@ -139,8 +139,10 @@ public: StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const override; const char* deserialize_and_insert_from_arena(const char* pos) override; void update_hash_with_value(size_t n, SipHash& hash) const override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override; - void update_crc_with_value(size_t n, uint64_t& crc) const override; + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; void update_hashes_with_value(std::vector& hashes, const uint8_t* __restrict null_data) const override; diff --git a/be/src/vec/columns/column_const.h b/be/src/vec/columns/column_const.h index feeb0608a2..7554e773b9 100644 --- a/be/src/vec/columns/column_const.h +++ b/be/src/vec/columns/column_const.h @@ -152,7 +152,8 @@ public: data->serialize_vec(keys, num_rows, max_row_byte_size); } - void update_xxHash_with_value(size_t n, uint64_t& hash) const override { + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { auto real_data = data->get_data_at(0); if (real_data.data == nullptr) { hash = HashUtil::xxHash64NullWithSeed(hash); @@ -161,8 +162,9 @@ public: } } - void update_crc_with_value(size_t n, uint64_t& crc) const override { - get_data_column_ptr()->update_crc_with_value(n, crc); + void 
update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { + get_data_column_ptr()->update_crc_with_value(start, end, hash, nullptr); } void serialize_vec_with_null_map(std::vector& keys, size_t num_rows, diff --git a/be/src/vec/columns/column_decimal.cpp b/be/src/vec/columns/column_decimal.cpp index e0b8fef056..a73be249eb 100644 --- a/be/src/vec/columns/column_decimal.cpp +++ b/be/src/vec/columns/column_decimal.cpp @@ -138,16 +138,27 @@ void ColumnDecimal::update_hashes_with_value(std::vector& hashes, } template -void ColumnDecimal::update_crc_with_value(size_t n, uint64_t& crc) const { - if constexpr (!IsDecimalV2) { - crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc); +void ColumnDecimal::update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + if (null_data == nullptr) { + for (size_t i = start; i < end; i++) { + if constexpr (!IsDecimalV2) { + hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash); + } else { + decimalv2_do_crc(i, hash); + } + } } else { - const DecimalV2Value& dec_val = (const DecimalV2Value&)data[n]; - int64_t int_val = dec_val.int_value(); - int32_t frac_val = dec_val.frac_value(); - crc = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), crc); - crc = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), crc); - }; + for (size_t i = start; i < end; i++) { + if (null_data[i] == 0) { + if constexpr (!IsDecimalV2) { + hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash); + } else { + decimalv2_do_crc(i, hash); + } + } + } + } } template @@ -161,19 +172,32 @@ void ColumnDecimal::update_crcs_with_value(std::vector& hashes, Pri } else { if (null_data == nullptr) { for (size_t i = 0; i < s; i++) { - update_crc_with_value(i, hashes[i]); + decimalv2_do_crc(i, hashes[i]); } } else { for (size_t i = 0; i < s; i++) { - if (null_data[i] == 0) update_crc_with_value(i, hashes[i]); + if (null_data[i] == 0) decimalv2_do_crc(i, 
hashes[i]); } } } } template -void ColumnDecimal::update_xxHash_with_value(size_t n, uint64_t& hash) const { - hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&data[n]), sizeof(T), hash); +void ColumnDecimal::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + if (null_data) { + for (size_t i = start; i < end; i++) { + if (null_data[i] == 0) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&data[i]), + sizeof(T), hash); + } + } + } else { + for (size_t i = start; i < end; i++) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&data[i]), sizeof(T), + hash); + } + } } template diff --git a/be/src/vec/columns/column_decimal.h b/be/src/vec/columns/column_decimal.h index 0aa0879f6b..404c8d7019 100644 --- a/be/src/vec/columns/column_decimal.h +++ b/be/src/vec/columns/column_decimal.h @@ -181,8 +181,10 @@ public: void update_crcs_with_value(std::vector& hashes, PrimitiveType type, const uint8_t* __restrict null_data) const override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override; - void update_crc_with_value(size_t n, uint64_t& crc) const override; + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; int compare_at(size_t n, size_t m, const IColumn& rhs_, int nan_direction_hint) const override; void get_permutation(bool reverse, size_t limit, int nan_direction_hint, @@ -299,6 +301,14 @@ protected: std::partial_sort(res.begin(), sort_end, res.end(), [this](size_t a, size_t b) { return data[a] < data[b]; }); } + + void ALWAYS_INLINE decimalv2_do_crc(size_t i, uint64_t& hash) const { + const DecimalV2Value& dec_val = (const DecimalV2Value&)data[i]; + int64_t int_val = dec_val.int_value(); + int32_t frac_val = dec_val.frac_value(); + hash = HashUtil::zlib_crc_hash(&int_val, 
sizeof(int_val), hash); + hash = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), hash); + }; }; template diff --git a/be/src/vec/columns/column_map.cpp b/be/src/vec/columns/column_map.cpp index 1924e2ba46..ac7c5da1a9 100644 --- a/be/src/vec/columns/column_map.cpp +++ b/be/src/vec/columns/column_map.cpp @@ -253,26 +253,64 @@ void ColumnMap::update_hashes_with_value(std::vector& hashes, SIP_HASHES_FUNCTION_COLUMN_IMPL(); } -void ColumnMap::update_xxHash_with_value(size_t n, uint64_t& hash) const { - size_t kv_size = size_at(n); - size_t offset = offset_at(n); - - hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&kv_size), sizeof(kv_size), - hash); - for (auto i = 0; i < kv_size; ++i) { - get_keys().update_xxHash_with_value(offset + i, hash); - get_values().update_xxHash_with_value(offset + i, hash); +void ColumnMap::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + auto& offsets = get_offsets(); + if (null_data) { + for (size_t i = start; i < end; ++i) { + if (null_data[i] == 0) { + size_t kv_size = offsets[i] - offsets[i - 1]; + if (kv_size == 0) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&kv_size), + sizeof(kv_size), hash); + } else { + get_keys().update_xxHash_with_value(offsets[i - 1], offsets[i], hash, nullptr); + get_values().update_xxHash_with_value(offsets[i - 1], offsets[i], hash, + nullptr); + } + } + } + } else { + for (size_t i = start; i < end; ++i) { + size_t kv_size = offsets[i] - offsets[i - 1]; + if (kv_size == 0) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&kv_size), + sizeof(kv_size), hash); + } else { + get_keys().update_xxHash_with_value(offsets[i - 1], offsets[i], hash, nullptr); + get_values().update_xxHash_with_value(offsets[i - 1], offsets[i], hash, nullptr); + } + } } } -void ColumnMap::update_crc_with_value(size_t n, uint64_t& crc) const { - size_t kv_size = size_at(n); - size_t offset = offset_at(n); - - crc = 
HashUtil::zlib_crc_hash(reinterpret_cast(&kv_size), sizeof(kv_size), crc); - for (size_t i = 0; i < kv_size; ++i) { - get_keys().update_crc_with_value(offset + i, crc); - get_values().update_crc_with_value(offset + i, crc); +void ColumnMap::update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + auto& offsets = get_offsets(); + if (null_data) { + for (size_t i = start; i < end; ++i) { + if (null_data[i] == 0) { + size_t kv_size = offsets[i] - offsets[i - 1]; + if (kv_size == 0) { + hash = HashUtil::zlib_crc_hash(reinterpret_cast(&kv_size), + sizeof(kv_size), hash); + } else { + get_keys().update_crc_with_value(offsets[i - 1], offsets[i], hash, nullptr); + get_values().update_crc_with_value(offsets[i - 1], offsets[i], hash, nullptr); + } + } + } + } else { + for (size_t i = start; i < end; ++i) { + size_t kv_size = offsets[i] - offsets[i - 1]; + if (kv_size == 0) { + hash = HashUtil::zlib_crc_hash(reinterpret_cast(&kv_size), + sizeof(kv_size), hash); + } else { + get_keys().update_crc_with_value(offsets[i - 1], offsets[i], hash, nullptr); + get_values().update_crc_with_value(offsets[i - 1], offsets[i], hash, nullptr); + } + } } } @@ -282,12 +320,12 @@ void ColumnMap::update_hashes_with_value(uint64_t* hashes, const uint8_t* null_d for (size_t i = 0; i < s; ++i) { // every row if (null_data[i] == 0) { - update_xxHash_with_value(i, hashes[i]); + update_xxHash_with_value(i, i + 1, hashes[i], nullptr); } } } else { for (size_t i = 0; i < s; ++i) { - update_xxHash_with_value(i, hashes[i]); + update_xxHash_with_value(i, i + 1, hashes[i], nullptr); } } } @@ -301,12 +339,12 @@ void ColumnMap::update_crcs_with_value(std::vector& hash, PrimitiveTyp for (size_t i = 0; i < s; ++i) { // every row if (null_data[i] == 0) { - update_crc_with_value(i, hash[i]); + update_crc_with_value(i, i + 1, hash[i], nullptr); } } } else { for (size_t i = 0; i < s; ++i) { - update_crc_with_value(i, hash[i]); + update_crc_with_value(i, i + 
1, hash[i], nullptr); } } } diff --git a/be/src/vec/columns/column_map.h b/be/src/vec/columns/column_map.h index 0d7bb2d0a7..91c9eb0177 100644 --- a/be/src/vec/columns/column_map.h +++ b/be/src/vec/columns/column_map.h @@ -167,8 +167,10 @@ public: size_t allocated_bytes() const override; void protect() override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override; - void update_crc_with_value(size_t n, uint64_t& crc) const override; + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; void update_hashes_with_value(std::vector& hashes, const uint8_t* __restrict null_data) const override; diff --git a/be/src/vec/columns/column_nullable.cpp b/be/src/vec/columns/column_nullable.cpp index ce5b68f3fb..538bcd27a6 100644 --- a/be/src/vec/columns/column_nullable.cpp +++ b/be/src/vec/columns/column_nullable.cpp @@ -65,21 +65,35 @@ MutableColumnPtr ColumnNullable::get_shrinked_column() { get_null_map_column_ptr()); } -void ColumnNullable::update_xxHash_with_value(size_t n, uint64_t& hash) const { - auto* __restrict real_null_data = assert_cast(*null_map).get_data().data(); - if (real_null_data[n] != 0) { - hash = HashUtil::xxHash64NullWithSeed(hash); +void ColumnNullable::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + if (!has_null()) { + nested_column->update_xxHash_with_value(start, end, hash, nullptr); } else { - nested_column->update_xxHash_with_value(n, hash); + auto* __restrict real_null_data = + assert_cast(*null_map).get_data().data(); + for (int i = start; i < end; ++i) { + if (real_null_data[i] != 0) { + hash = HashUtil::xxHash64NullWithSeed(hash); + } + } + nested_column->update_xxHash_with_value(start, end, hash, real_null_data); } } -void 
ColumnNullable::update_crc_with_value(size_t n, uint64_t& crc) const { - auto* __restrict real_null_data = assert_cast(*null_map).get_data().data(); - if (real_null_data[n] != 0) { - crc = HashUtil::zlib_crc_hash_null(crc); +void ColumnNullable::update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + if (!has_null()) { + nested_column->update_crc_with_value(start, end, hash, nullptr); } else { - nested_column->update_xxHash_with_value(n, crc); + auto* __restrict real_null_data = + assert_cast(*null_map).get_data().data(); + for (int i = start; i < end; ++i) { + if (real_null_data[i] != 0) { + hash = HashUtil::zlib_crc_hash_null(hash); + } + } + nested_column->update_crc_with_value(start, end, hash, real_null_data); } } diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index be9ba72399..11c24be294 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -215,8 +215,10 @@ public: ColumnPtr replicate(const Offsets& replicate_offsets) const override; void replicate(const uint32_t* counts, size_t target_size, IColumn& column, size_t begin = 0, int count_sz = -1) const override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override; - void update_crc_with_value(size_t n, uint64_t& crc) const override; + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; void update_hash_with_value(size_t n, SipHash& hash) const override; void update_hashes_with_value(std::vector& hashes, diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index a8bf06c469..ac95f78037 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -393,16 +393,42 @@ public: void 
deserialize_vec_with_null_map(std::vector& keys, const size_t num_rows, const uint8_t* null_map) override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override { - size_t string_size = size_at(n); - size_t offset = offset_at(n); - hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&chars[offset]), - string_size, hash); + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { + if (null_data) { + for (size_t i = start; i < end; ++i) { + if (null_data[i] == 0) { + size_t string_size = size_at(i); + size_t offset = offset_at(i); + hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&chars[offset]), + string_size, hash); + } + } + } else { + for (size_t i = start; i < end; ++i) { + size_t string_size = size_at(i); + size_t offset = offset_at(i); + hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&chars[offset]), + string_size, hash); + } + } } - void update_crc_with_value(size_t n, uint64_t& crc) const override { - auto data_ref = get_data_at(n); - crc = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, crc); + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { + if (null_data) { + for (size_t i = start; i < end; ++i) { + if (null_data[i] == 0) { + auto data_ref = get_data_at(i); + hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, hash); + } + } + } else { + for (size_t i = start; i < end; ++i) { + auto data_ref = get_data_at(i); + hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, hash); + } + } } void update_hash_with_value(size_t n, SipHash& hash) const override { diff --git a/be/src/vec/columns/column_struct.cpp b/be/src/vec/columns/column_struct.cpp index 58f5a4abaf..0b3bcb24e8 100644 --- a/be/src/vec/columns/column_struct.cpp +++ b/be/src/vec/columns/column_struct.cpp @@ -196,15 +196,17 @@ void ColumnStruct::update_hashes_with_value(std::vector& hashes, 
SIP_HASHES_FUNCTION_COLUMN_IMPL(); } -void ColumnStruct::update_xxHash_with_value(size_t n, uint64_t& hash) const { +void ColumnStruct::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { for (const auto& column : columns) { - column->update_xxHash_with_value(n, hash); + column->update_xxHash_with_value(start, end, hash, nullptr); } } -void ColumnStruct::update_crc_with_value(size_t n, uint64_t& crc) const { +void ColumnStruct::update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { for (const auto& column : columns) { - column->update_crc_with_value(n, crc); + column->update_crc_with_value(start, end, hash, nullptr); } } diff --git a/be/src/vec/columns/column_struct.h b/be/src/vec/columns/column_struct.h index 3771d29e48..9073725e81 100644 --- a/be/src/vec/columns/column_struct.h +++ b/be/src/vec/columns/column_struct.h @@ -106,8 +106,10 @@ public: const char* deserialize_and_insert_from_arena(const char* pos) override; void update_hash_with_value(size_t n, SipHash& hash) const override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override; - void update_crc_with_value(size_t n, uint64_t& crc) const override; + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; void update_hashes_with_value(std::vector& hashes, const uint8_t* __restrict null_data) const override; diff --git a/be/src/vec/columns/column_vector.h b/be/src/vec/columns/column_vector.h index 67f2827c92..48228822f3 100644 --- a/be/src/vec/columns/column_vector.h +++ b/be/src/vec/columns/column_vector.h @@ -274,21 +274,49 @@ public: const uint8_t* null_map, size_t max_row_byte_size) const override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override { 
- hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&data[n]), sizeof(T), hash); + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { + if (null_data) { + for (size_t i = start; i < end; i++) { + if (null_data[i] == 0) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&data[i]), + sizeof(T), hash); + } + } + } else { + for (size_t i = start; i < end; i++) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast(&data[i]), + sizeof(T), hash); + } + } } - void update_crc_with_value(size_t n, uint64_t& crc) const override { + void ALWAYS_INLINE update_crc_with_value_without_null(size_t idx, uint64_t& hash) const { if constexpr (!std::is_same_v) { - crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc); + hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash); } else { if (this->is_date_type() || this->is_datetime_type()) { char buf[64]; - const VecDateTimeValue& date_val = (const VecDateTimeValue&)data[n]; + const VecDateTimeValue& date_val = (const VecDateTimeValue&)data[idx]; auto len = date_val.to_buffer(buf); - crc = HashUtil::zlib_crc_hash(buf, len, crc); + hash = HashUtil::zlib_crc_hash(buf, len, hash); } else { - crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc); + hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash); + } + } + } + + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { + if (null_data) { + for (size_t i = start; i < end; i++) { + if (null_data[i] == 0) { + update_crc_with_value_without_null(i, hash); + } + } + } else { + for (size_t i = start; i < end; i++) { + update_crc_with_value_without_null(i, hash); } } } diff --git a/be/test/vec/columns/column_hash_func_test.cpp b/be/test/vec/columns/column_hash_func_test.cpp index 0e409b4640..f80edb035f 100644 --- a/be/test/vec/columns/column_hash_func_test.cpp +++ b/be/test/vec/columns/column_hash_func_test.cpp @@ -20,6 +20,8 @@ 
#include #include "gtest/gtest_pred_impl.h" +#include "util/runtime_profile.h" +#include "vec/columns/column_array.h" #include "vec/columns/column_const.h" #include "vec/core/field.h" #include "vec/data_types/data_type.h" @@ -86,6 +88,83 @@ TEST(HashFuncTest, ArrayTypeTest) { } } +TEST(HashFuncTest, ArraySimpleBenchmarkTest) { + DataTypes dataTypes = create_scala_data_types(); + + DataTypePtr d = std::make_shared(); + DataTypePtr array_ptr = std::make_shared(d); + MutableColumnPtr array_mutable_col = array_ptr->create_column(); + + int r_num = 50; + for (int r = 0; r < r_num; ++r) { + Array a; + for (int i = 0; i < 10000; ++i) { + a.push_back(Int64(i)); + } + array_mutable_col->insert(a); + } + std::vector crc_hash_vals(r_num); + int64_t time_t = 0; + { + SCOPED_RAW_TIMER(&time_t); + EXPECT_NO_FATAL_FAILURE(array_mutable_col->update_crcs_with_value( + crc_hash_vals, PrimitiveType::TYPE_ARRAY)); + } + std::cout << time_t << "ns" << std::endl; +} + +TEST(HashFuncTest, ArrayNestedArrayTest) { + DataTypes dataTypes = create_scala_data_types(); + + DataTypePtr d = std::make_shared(); + MutableColumnPtr scala_mutable_col = d->create_column(); + DataTypePtr nested_array_ptr = std::make_shared(d); + DataTypePtr array_ptr = std::make_shared(nested_array_ptr); + MutableColumnPtr array_mutable_col = array_ptr->create_column(); + + Array a, a1, a2, a3, nested, nested1; + nested.push_back(Int64(1)); + nested1.push_back(Int64(2)); + + // a: [[1], [2]] + a.push_back(nested); + a.push_back(nested1); + // a1: [[2], [1]] + a1.push_back(nested1); + a1.push_back(nested); + + // a2: [[], [1]] + a2.push_back(Array()); + a2.push_back(nested); + // a3: [[1], []] + a3.push_back(nested); + a3.push_back(Array()); + + array_mutable_col->insert(a); + array_mutable_col->insert(a1); + array_mutable_col->insert(a2); + array_mutable_col->insert(a3); + + auto nested_col = + reinterpret_cast(array_mutable_col.get())->get_data_ptr(); + EXPECT_EQ(nested_col->size(), 8); + + std::vector 
xx_hash_vals(4); + std::vector crc_hash_vals(4); + auto* __restrict xx_hashes = xx_hash_vals.data(); + auto* __restrict crc_hashes = crc_hash_vals.data(); + + // xxHash + EXPECT_NO_FATAL_FAILURE(array_mutable_col->update_hashes_with_value(xx_hashes)); + EXPECT_TRUE(xx_hashes[0] != xx_hashes[1]); + EXPECT_TRUE(xx_hashes[2] != xx_hashes[3]); + // crcHash + EXPECT_NO_FATAL_FAILURE( + array_mutable_col->update_crcs_with_value(crc_hash_vals, PrimitiveType::TYPE_ARRAY)); + EXPECT_TRUE(crc_hashes[0] != crc_hashes[1]); + EXPECT_TRUE(crc_hashes[2] != crc_hashes[3]); +} + TEST(HashFuncTest, ArrayCornerCaseTest) { DataTypes dataTypes = create_scala_data_types();