Range-based xxHash/CRC column hashing.

update_xxHash_with_value() and update_crc_with_value() take a row range
[start, end) plus an optional null map instead of a single row index, so one
virtual call folds a whole range of rows into a single hash value.

ColumnArray and ColumnMap hash the element count only for empty rows and
otherwise delegate the row's offset range to their nested columns, using the
nested entry point of the same hash family (xxHash -> xxHash, CRC -> CRC).
ColumnNullable first folds a null marker for every null row in the range and
then lets the nested column hash the non-null rows.

diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index b291ba2443..0ff14a1653 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -386,7 +386,9 @@ public:
         LOG(FATAL) << get_name() << " update_hashes_with_value xxhash not supported";
     }
 
-    virtual void update_xxHash_with_value(size_t n, uint64_t& hash) const {
+    // fold rows [start, end) into one hash value; a range avoids a virtual call per row
+    virtual void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                          const uint8_t* __restrict null_data) const {
         LOG(FATAL) << get_name() << " update_hash_with_value xxhash not supported";
     }
 
@@ -398,7 +400,9 @@ public:
         LOG(FATAL) << get_name() << "update_crcs_with_value not supported";
     }
 
-    virtual void update_crc_with_value(size_t n, uint64_t& hash) const {
+    // fold rows [start, end) into one hash value; a range avoids a virtual call per row
+    virtual void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                                       const uint8_t* __restrict null_data) const {
         LOG(FATAL) << get_name() << " update_crc_with_value not supported";
     }
 
diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp
index c5e35e5bb9..b575f7bf15 100644
--- a/be/src/vec/columns/column_array.cpp
+++ b/be/src/vec/columns/column_array.cpp
@@ -277,25 +277,64 @@ void ColumnArray::update_hashes_with_value(std::vector<SipHash>& hashes,
 }
 
 // for every array row calculate xxHash
-void ColumnArray::update_xxHash_with_value(size_t n, uint64_t& hash) const {
-    size_t elem_size = size_at(n);
-    size_t offset = offset_at(n);
-    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&elem_size), sizeof(elem_size),
-                                      hash);
-    for (auto i = 0; i < elem_size; ++i) {
-        get_data().update_xxHash_with_value(offset + i, hash);
+void ColumnArray::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                           const uint8_t* __restrict null_data) const {
+    auto& offsets_column = get_offsets();
+    if (null_data) {
+        for (size_t i = start; i < end; ++i) {
+            if (null_data[i] == 0) {
+                size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+                if (elem_size == 0) {
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&elem_size),
+                                                      sizeof(elem_size), hash);
+                } else {
+                    get_data().update_xxHash_with_value(offsets_column[i - 1], offsets_column[i],
+                                                        hash, nullptr);
+                }
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; ++i) {
+            size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+            if (elem_size == 0) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&elem_size),
+                                                  sizeof(elem_size), hash);
+            } else {
+                get_data().update_xxHash_with_value(offsets_column[i - 1], offsets_column[i], hash,
+                                                    nullptr);
+            }
+        }
     }
 }
 
 // for every array row calculate crcHash
-void ColumnArray::update_crc_with_value(size_t n, uint64_t& crc) const {
-    size_t elem_size = size_at(n);
-    size_t offset = offset_at(n);
-
-    crc = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&elem_size), sizeof(elem_size),
-                                  crc);
-    for (auto i = 0; i < elem_size; ++i) {
-        get_data().update_crc_with_value(offset + i, crc);
+void ColumnArray::update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                                        const uint8_t* __restrict null_data) const {
+    auto& offsets_column = get_offsets();
+    if (null_data) {
+        for (size_t i = start; i < end; ++i) {
+            if (null_data[i] == 0) {
+                size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+                if (elem_size == 0) {
+                    hash = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&elem_size),
+                                                   sizeof(elem_size), hash);
+                } else {
+                    get_data().update_crc_with_value(offsets_column[i - 1], offsets_column[i], hash,
+                                                     nullptr);
+                }
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; ++i) {
+            size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+            if (elem_size == 0) {
+                hash = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&elem_size),
+                                               sizeof(elem_size), hash);
+            } else {
+                get_data().update_crc_with_value(offsets_column[i - 1], offsets_column[i], hash,
+                                                 nullptr);
+            }
+        }
     }
 }
 
@@ -305,12 +344,12 @@ void ColumnArray::update_hashes_with_value(uint64_t* __restrict hashes,
     if (null_data) {
         for (size_t i = 0; i < s; ++i) {
             if (null_data[i] == 0) {
-                update_xxHash_with_value(i, hashes[i]);
+                update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
             }
         }
     } else {
         for (size_t i = 0; i < s; ++i) {
-            update_xxHash_with_value(i, hashes[i]);
+            update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
         }
     }
 }
@@ -324,12 +363,12 @@ void ColumnArray::update_crcs_with_value(std::vector<uint64_t>& hash, PrimitiveT
         for (size_t i = 0; i < s; ++i) {
             // every row
             if (null_data[i] == 0) {
-                update_crc_with_value(i, hash[i]);
+                update_crc_with_value(i, i + 1, hash[i], nullptr);
             }
         }
     } else {
         for (size_t i = 0; i < s; ++i) {
-            update_crc_with_value(i, hash[i]);
+            update_crc_with_value(i, i + 1, hash[i], nullptr);
         }
     }
 }
diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h
index 2e1c96a2c5..4fe1827e17 100644
--- a/be/src/vec/columns/column_array.h
+++ b/be/src/vec/columns/column_array.h
@@ -139,8 +139,10 @@ public:
     StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const override;
     const char* deserialize_and_insert_from_arena(const char* pos) override;
     void update_hash_with_value(size_t n, SipHash& hash) const override;
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const override;
 
     void update_hashes_with_value(std::vector<SipHash>& hashes,
                                   const uint8_t* __restrict null_data) const override;
diff --git a/be/src/vec/columns/column_const.h b/be/src/vec/columns/column_const.h
index feeb0608a2..7554e773b9 100644
--- a/be/src/vec/columns/column_const.h
+++ b/be/src/vec/columns/column_const.h
@@ -152,7 +152,8 @@ public:
         data->serialize_vec(keys, num_rows, max_row_byte_size);
     }
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const override {
         auto real_data = data->get_data_at(0);
         if (real_data.data == nullptr) {
             hash = HashUtil::xxHash64NullWithSeed(hash);
@@ -161,8 +162,9 @@ public:
         }
     }
 
-    void update_crc_with_value(size_t n, uint64_t& crc) const override {
-        get_data_column_ptr()->update_crc_with_value(n, crc);
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const override {
+        get_data_column_ptr()->update_crc_with_value(start, end, hash, nullptr);
     }
 
     void serialize_vec_with_null_map(std::vector<StringRef>& keys, size_t num_rows,
diff --git a/be/src/vec/columns/column_decimal.cpp b/be/src/vec/columns/column_decimal.cpp
index e0b8fef056..a73be249eb 100644
--- a/be/src/vec/columns/column_decimal.cpp
+++ b/be/src/vec/columns/column_decimal.cpp
@@ -138,16 +138,27 @@ void ColumnDecimal<T>::update_hashes_with_value(std::vector<SipHash>& hashes,
 }
 
 template <typename T>
-void ColumnDecimal<T>::update_crc_with_value(size_t n, uint64_t& crc) const {
-    if constexpr (!IsDecimalV2<T>) {
-        crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+void ColumnDecimal<T>::update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                                             const uint8_t* __restrict null_data) const {
+    if (null_data == nullptr) {
+        for (size_t i = start; i < end; i++) {
+            if constexpr (!IsDecimalV2<T>) {
+                hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash);
+            } else {
+                decimalv2_do_crc(i, hash);
+            }
+        }
     } else {
-        const DecimalV2Value& dec_val = (const DecimalV2Value&)data[n];
-        int64_t int_val = dec_val.int_value();
-        int32_t frac_val = dec_val.frac_value();
-        crc = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), crc);
-        crc = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), crc);
-    };
+        for (size_t i = start; i < end; i++) {
+            if (null_data[i] == 0) {
+                if constexpr (!IsDecimalV2<T>) {
+                    hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash);
+                } else {
+                    decimalv2_do_crc(i, hash);
+                }
+            }
+        }
+    }
 }
 
 template <typename T>
@@ -161,19 +172,32 @@ void ColumnDecimal<T>::update_crcs_with_value(std::vector<uint64_t>& hashes, Pri
     } else {
         if (null_data == nullptr) {
             for (size_t i = 0; i < s; i++) {
-                update_crc_with_value(i, hashes[i]);
+                decimalv2_do_crc(i, hashes[i]);
             }
         } else {
             for (size_t i = 0; i < s; i++) {
-                if (null_data[i] == 0) update_crc_with_value(i, hashes[i]);
+                if (null_data[i] == 0) decimalv2_do_crc(i, hashes[i]);
             }
         }
     }
 }
 
 template <typename T>
-void ColumnDecimal<T>::update_xxHash_with_value(size_t n, uint64_t& hash) const {
-    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[n]), sizeof(T), hash);
+void ColumnDecimal<T>::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                                const uint8_t* __restrict null_data) const {
+    if (null_data) {
+        for (size_t i = start; i < end; i++) {
+            if (null_data[i] == 0) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[i]),
+                                                  sizeof(T), hash);
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; i++) {
+            hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[i]), sizeof(T),
+                                              hash);
+        }
+    }
 }
 
 template <typename T>
diff --git a/be/src/vec/columns/column_decimal.h b/be/src/vec/columns/column_decimal.h
index 0aa0879f6b..404c8d7019 100644
--- a/be/src/vec/columns/column_decimal.h
+++ b/be/src/vec/columns/column_decimal.h
@@ -181,8 +181,10 @@ public:
     void update_crcs_with_value(std::vector<uint64_t>& hashes, PrimitiveType type,
                                 const uint8_t* __restrict null_data) const override;
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const override;
 
     int compare_at(size_t n, size_t m, const IColumn& rhs_, int nan_direction_hint) const override;
     void get_permutation(bool reverse, size_t limit, int nan_direction_hint,
@@ -299,6 +301,14 @@ protected:
         std::partial_sort(res.begin(), sort_end, res.end(),
                           [this](size_t a, size_t b) { return data[a] < data[b]; });
     }
+
+    void ALWAYS_INLINE decimalv2_do_crc(size_t i, uint64_t& hash) const {
+        const DecimalV2Value& dec_val = (const DecimalV2Value&)data[i];
+        int64_t int_val = dec_val.int_value();
+        int32_t frac_val = dec_val.frac_value();
+        hash = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), hash);
+        hash = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), hash);
+    }
 };
 
 template <typename T>
diff --git a/be/src/vec/columns/column_map.cpp b/be/src/vec/columns/column_map.cpp
index 1924e2ba46..ac7c5da1a9 100644
--- a/be/src/vec/columns/column_map.cpp
+++ b/be/src/vec/columns/column_map.cpp
@@ -253,26 +253,64 @@ void ColumnMap::update_hashes_with_value(std::vector<SipHash>& hashes,
     SIP_HASHES_FUNCTION_COLUMN_IMPL();
 }
 
-void ColumnMap::update_xxHash_with_value(size_t n, uint64_t& hash) const {
-    size_t kv_size = size_at(n);
-    size_t offset = offset_at(n);
-
-    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&kv_size), sizeof(kv_size),
-                                      hash);
-    for (auto i = 0; i < kv_size; ++i) {
-        get_keys().update_xxHash_with_value(offset + i, hash);
-        get_values().update_xxHash_with_value(offset + i, hash);
+void ColumnMap::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                         const uint8_t* __restrict null_data) const {
+    auto& offsets = get_offsets();
+    if (null_data) {
+        for (size_t i = start; i < end; ++i) {
+            if (null_data[i] == 0) {
+                size_t kv_size = offsets[i] - offsets[i - 1];
+                if (kv_size == 0) {
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&kv_size),
+                                                      sizeof(kv_size), hash);
+                } else {
+                    get_keys().update_xxHash_with_value(offsets[i - 1], offsets[i], hash, nullptr);
+                    get_values().update_xxHash_with_value(offsets[i - 1], offsets[i], hash,
+                                                          nullptr);
+                }
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; ++i) {
+            size_t kv_size = offsets[i] - offsets[i - 1];
+            if (kv_size == 0) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&kv_size),
+                                                  sizeof(kv_size), hash);
+            } else {
+                get_keys().update_xxHash_with_value(offsets[i - 1], offsets[i], hash, nullptr);
+                get_values().update_xxHash_with_value(offsets[i - 1], offsets[i], hash, nullptr);
+            }
+        }
     }
 }
 
-void ColumnMap::update_crc_with_value(size_t n, uint64_t& crc) const {
-    size_t kv_size = size_at(n);
-    size_t offset = offset_at(n);
-
-    crc = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&kv_size), sizeof(kv_size), crc);
-    for (size_t i = 0; i < kv_size; ++i) {
-        get_keys().update_crc_with_value(offset + i, crc);
-        get_values().update_crc_with_value(offset + i, crc);
+void ColumnMap::update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                                      const uint8_t* __restrict null_data) const {
+    auto& offsets = get_offsets();
+    if (null_data) {
+        for (size_t i = start; i < end; ++i) {
+            if (null_data[i] == 0) {
+                size_t kv_size = offsets[i] - offsets[i - 1];
+                if (kv_size == 0) {
+                    hash = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&kv_size),
+                                                   sizeof(kv_size), hash);
+                } else {
+                    get_keys().update_crc_with_value(offsets[i - 1], offsets[i], hash, nullptr);
+                    get_values().update_crc_with_value(offsets[i - 1], offsets[i], hash, nullptr);
+                }
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; ++i) {
+            size_t kv_size = offsets[i] - offsets[i - 1];
+            if (kv_size == 0) {
+                hash = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&kv_size),
+                                               sizeof(kv_size), hash);
+            } else {
+                get_keys().update_crc_with_value(offsets[i - 1], offsets[i], hash, nullptr);
+                get_values().update_crc_with_value(offsets[i - 1], offsets[i], hash, nullptr);
+            }
+        }
     }
 }
 
@@ -282,12 +320,12 @@ void ColumnMap::update_hashes_with_value(uint64_t* hashes, const uint8_t* null_d
         for (size_t i = 0; i < s; ++i) {
             // every row
             if (null_data[i] == 0) {
-                update_xxHash_with_value(i, hashes[i]);
+                update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
             }
         }
     } else {
         for (size_t i = 0; i < s; ++i) {
-            update_xxHash_with_value(i, hashes[i]);
+            update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
         }
     }
 }
@@ -301,12 +339,12 @@ void ColumnMap::update_crcs_with_value(std::vector<uint64_t>& hash, PrimitiveTyp
         for (size_t i = 0; i < s; ++i) {
             // every row
             if (null_data[i] == 0) {
-                update_crc_with_value(i, hash[i]);
+                update_crc_with_value(i, i + 1, hash[i], nullptr);
             }
         }
     } else {
         for (size_t i = 0; i < s; ++i) {
-            update_crc_with_value(i, hash[i]);
+            update_crc_with_value(i, i + 1, hash[i], nullptr);
         }
     }
 }
diff --git a/be/src/vec/columns/column_map.h b/be/src/vec/columns/column_map.h
index 0d7bb2d0a7..91c9eb0177 100644
--- a/be/src/vec/columns/column_map.h
+++ b/be/src/vec/columns/column_map.h
@@ -167,8 +167,10 @@ public:
     size_t allocated_bytes() const override;
     void protect() override;
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const override;
 
     void update_hashes_with_value(std::vector<SipHash>& hashes,
                                   const uint8_t* __restrict null_data) const override;
diff --git a/be/src/vec/columns/column_nullable.cpp b/be/src/vec/columns/column_nullable.cpp
index ce5b68f3fb..538bcd27a6 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -65,21 +65,35 @@ MutableColumnPtr ColumnNullable::get_shrinked_column() {
                                   get_null_map_column_ptr());
 }
 
-void ColumnNullable::update_xxHash_with_value(size_t n, uint64_t& hash) const {
-    auto* __restrict real_null_data = assert_cast<const ColumnUInt8&>(*null_map).get_data().data();
-    if (real_null_data[n] != 0) {
-        hash = HashUtil::xxHash64NullWithSeed(hash);
+void ColumnNullable::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                              const uint8_t* __restrict null_data) const {
+    if (!has_null()) {
+        nested_column->update_xxHash_with_value(start, end, hash, nullptr);
     } else {
-        nested_column->update_xxHash_with_value(n, hash);
+        auto* __restrict real_null_data =
+                assert_cast<const ColumnUInt8&>(*null_map).get_data().data();
+        for (size_t i = start; i < end; ++i) {
+            if (real_null_data[i] != 0) {
+                hash = HashUtil::xxHash64NullWithSeed(hash);
+            }
+        }
+        nested_column->update_xxHash_with_value(start, end, hash, real_null_data);
     }
 }
 
-void ColumnNullable::update_crc_with_value(size_t n, uint64_t& crc) const {
-    auto* __restrict real_null_data = assert_cast<const ColumnUInt8&>(*null_map).get_data().data();
-    if (real_null_data[n] != 0) {
-        crc = HashUtil::zlib_crc_hash_null(crc);
+void ColumnNullable::update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                                           const uint8_t* __restrict null_data) const {
+    if (!has_null()) {
+        nested_column->update_crc_with_value(start, end, hash, nullptr);
     } else {
-        nested_column->update_xxHash_with_value(n, crc);
+        auto* __restrict real_null_data =
+                assert_cast<const ColumnUInt8&>(*null_map).get_data().data();
+        for (size_t i = start; i < end; ++i) {
+            if (real_null_data[i] != 0) {
+                hash = HashUtil::zlib_crc_hash_null(hash);
+            }
+        }
+        nested_column->update_crc_with_value(start, end, hash, real_null_data);
     }
 }
 
diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h
index be9ba72399..11c24be294 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -215,8 +215,10 @@ public:
     ColumnPtr replicate(const Offsets& replicate_offsets) const override;
     void replicate(const uint32_t* counts, size_t target_size, IColumn& column, size_t begin = 0,
                    int count_sz = -1) const override;
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const override;
     void update_hash_with_value(size_t n, SipHash& hash) const override;
 
     void update_hashes_with_value(std::vector<SipHash>& hashes,
diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h
index a8bf06c469..ac95f78037 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -393,16 +393,42 @@ public:
     void deserialize_vec_with_null_map(std::vector<StringRef>& keys, const size_t num_rows,
                                        const uint8_t* null_map) override;
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
-        size_t string_size = size_at(n);
-        size_t offset = offset_at(n);
-        hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&chars[offset]),
-                                          string_size, hash);
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const override {
+        if (null_data) {
+            for (size_t i = start; i < end; ++i) {
+                if (null_data[i] == 0) {
+                    size_t string_size = size_at(i);
+                    size_t offset = offset_at(i);
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&chars[offset]),
+                                                      string_size, hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; ++i) {
+                size_t string_size = size_at(i);
+                size_t offset = offset_at(i);
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&chars[offset]),
+                                                  string_size, hash);
+            }
+        }
     }
 
-    void update_crc_with_value(size_t n, uint64_t& crc) const override {
-        auto data_ref = get_data_at(n);
-        crc = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, crc);
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const override {
+        if (null_data) {
+            for (size_t i = start; i < end; ++i) {
+                if (null_data[i] == 0) {
+                    auto data_ref = get_data_at(i);
+                    hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; ++i) {
+                auto data_ref = get_data_at(i);
+                hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, hash);
+            }
+        }
     }
 
     void update_hash_with_value(size_t n, SipHash& hash) const override {
diff --git a/be/src/vec/columns/column_struct.cpp b/be/src/vec/columns/column_struct.cpp
index 58f5a4abaf..0b3bcb24e8 100644
--- a/be/src/vec/columns/column_struct.cpp
+++ b/be/src/vec/columns/column_struct.cpp
@@ -196,15 +196,17 @@ void ColumnStruct::update_hashes_with_value(std::vector<SipHash>& hashes,
     SIP_HASHES_FUNCTION_COLUMN_IMPL();
 }
 
-void ColumnStruct::update_xxHash_with_value(size_t n, uint64_t& hash) const {
+void ColumnStruct::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                            const uint8_t* __restrict null_data) const {
     for (const auto& column : columns) {
-        column->update_xxHash_with_value(n, hash);
+        column->update_xxHash_with_value(start, end, hash, nullptr);
     }
 }
 
-void ColumnStruct::update_crc_with_value(size_t n, uint64_t& crc) const {
+void ColumnStruct::update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                                         const uint8_t* __restrict null_data) const {
     for (const auto& column : columns) {
-        column->update_crc_with_value(n, crc);
+        column->update_crc_with_value(start, end, hash, nullptr);
     }
 }
 
diff --git a/be/src/vec/columns/column_struct.h b/be/src/vec/columns/column_struct.h
index 3771d29e48..9073725e81 100644
--- a/be/src/vec/columns/column_struct.h
+++ b/be/src/vec/columns/column_struct.h
@@ -106,8 +106,10 @@ public:
     const char* deserialize_and_insert_from_arena(const char* pos) override;
 
     void update_hash_with_value(size_t n, SipHash& hash) const override;
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const override;
 
     void update_hashes_with_value(std::vector<SipHash>& hashes,
                                   const uint8_t* __restrict null_data) const override;
diff --git a/be/src/vec/columns/column_vector.h b/be/src/vec/columns/column_vector.h
index 67f2827c92..48228822f3 100644
--- a/be/src/vec/columns/column_vector.h
+++ b/be/src/vec/columns/column_vector.h
@@ -274,21 +274,49 @@ public:
                                      const uint8_t* null_map,
                                      size_t max_row_byte_size) const override;
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
-        hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[n]), sizeof(T), hash);
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const override {
+        if (null_data) {
+            for (size_t i = start; i < end; i++) {
+                if (null_data[i] == 0) {
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[i]),
+                                                      sizeof(T), hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; i++) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[i]),
+                                                  sizeof(T), hash);
+            }
+        }
     }
 
-    void update_crc_with_value(size_t n, uint64_t& crc) const override {
+    void ALWAYS_INLINE update_crc_with_value_without_null(size_t idx, uint64_t& hash) const {
         if constexpr (!std::is_same_v<T, Int64>) {
-            crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+            hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash);
         } else {
             if (this->is_date_type() || this->is_datetime_type()) {
                 char buf[64];
-                const VecDateTimeValue& date_val = (const VecDateTimeValue&)data[n];
+                const VecDateTimeValue& date_val = (const VecDateTimeValue&)data[idx];
                 auto len = date_val.to_buffer(buf);
-                crc = HashUtil::zlib_crc_hash(buf, len, crc);
+                hash = HashUtil::zlib_crc_hash(buf, len, hash);
             } else {
-                crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+                hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash);
+            }
+        }
+    }
+
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const override {
+        if (null_data) {
+            for (size_t i = start; i < end; i++) {
+                if (null_data[i] == 0) {
+                    update_crc_with_value_without_null(i, hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; i++) {
+                update_crc_with_value_without_null(i, hash);
             }
         }
     }
diff --git a/be/test/vec/columns/column_hash_func_test.cpp b/be/test/vec/columns/column_hash_func_test.cpp
index 0e409b4640..f80edb035f 100644
--- a/be/test/vec/columns/column_hash_func_test.cpp
+++ b/be/test/vec/columns/column_hash_func_test.cpp
@@ -20,6 +20,8 @@
 #include <stdint.h>
 
 #include "gtest/gtest_pred_impl.h"
+#include "util/runtime_profile.h"
+#include "vec/columns/column_array.h"
 #include "vec/columns/column_const.h"
 #include "vec/core/field.h"
 #include "vec/data_types/data_type.h"
@@ -86,6 +88,83 @@ TEST(HashFuncTest, ArrayTypeTest) {
     }
 }
 
+TEST(HashFuncTest, ArraySimpleBenchmarkTest) {
+    DataTypes dataTypes = create_scala_data_types();
+
+    DataTypePtr d = std::make_shared<DataTypeInt64>();
+    DataTypePtr array_ptr = std::make_shared<DataTypeArray>(d);
+    MutableColumnPtr array_mutable_col = array_ptr->create_column();
+
+    int r_num = 50;
+    for (int r = 0; r < r_num; ++r) {
+        Array a;
+        for (int i = 0; i < 10000; ++i) {
+            a.push_back(Int64(i));
+        }
+        array_mutable_col->insert(a);
+    }
+    std::vector<uint64_t> crc_hash_vals(r_num);
+    int64_t time_t = 0;
+    {
+        SCOPED_RAW_TIMER(&time_t);
+        EXPECT_NO_FATAL_FAILURE(array_mutable_col->update_crcs_with_value(
+                crc_hash_vals, PrimitiveType::TYPE_ARRAY));
+    }
+    std::cout << time_t << "ns" << std::endl;
+}
+
+TEST(HashFuncTest, ArrayNestedArrayTest) {
+    DataTypes dataTypes = create_scala_data_types();
+
+    DataTypePtr d = std::make_shared<DataTypeInt64>();
+    MutableColumnPtr scala_mutable_col = d->create_column();
+    DataTypePtr nested_array_ptr = std::make_shared<DataTypeArray>(d);
+    DataTypePtr array_ptr = std::make_shared<DataTypeArray>(nested_array_ptr);
+    MutableColumnPtr array_mutable_col = array_ptr->create_column();
+
+    Array a, a1, a2, a3, nested, nested1;
+    nested.push_back(Int64(1));
+    nested1.push_back(Int64(2));
+
+    // a: [[1], [2]]
+    a.push_back(nested);
+    a.push_back(nested1);
+    // a1: [[2], [1]]
+    a1.push_back(nested1);
+    a1.push_back(nested);
+
+    // a2: [[], [1]]
+    a2.push_back(Array());
+    a2.push_back(nested);
+    // a3: [[1], []]
+    a3.push_back(nested);
+    a3.push_back(Array());
+
+    array_mutable_col->insert(a);
+    array_mutable_col->insert(a1);
+    array_mutable_col->insert(a2);
+    array_mutable_col->insert(a3);
+
+    auto nested_col =
+            reinterpret_cast<vectorized::ColumnArray*>(array_mutable_col.get())->get_data_ptr();
+    EXPECT_EQ(nested_col->size(), 8);
+
+    std::vector<uint64_t> xx_hash_vals(4);
+    std::vector<uint64_t> crc_hash_vals(4);
+    auto* __restrict xx_hashes = xx_hash_vals.data();
+    auto* __restrict crc_hashes = crc_hash_vals.data();
+
+    // xxHash
+    EXPECT_NO_FATAL_FAILURE(array_mutable_col->update_hashes_with_value(xx_hashes));
+    EXPECT_TRUE(xx_hashes[0] != xx_hashes[1]);
+    EXPECT_TRUE(xx_hashes[2] != xx_hashes[3]);
+    // crcHash
+    EXPECT_NO_FATAL_FAILURE(
+            array_mutable_col->update_crcs_with_value(crc_hash_vals, PrimitiveType::TYPE_ARRAY));
+    EXPECT_TRUE(crc_hashes[0] != crc_hashes[1]);
+    EXPECT_TRUE(crc_hashes[2] != crc_hashes[3]);
+}
+
 TEST(HashFuncTest, ArrayCornerCaseTest) {
     DataTypes dataTypes = create_scala_data_types();
 