Files
doris/be/src/vec/common/columns_hashing_impl.h

469 lines
16 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/ColumnsHashingImpl.h
// and modified by Doris
#pragma once
#include "vec/columns/column.h"
#include "vec/columns/column_nullable.h"
#include "vec/common/aggregation_common.h"
#include "vec/common/assert_cast.h"
#include "vec/common/hash_table/hash_table_key_holder.h"
#include "vec/common/hash_table/ph_hash_map.h"
// #include <Interpreters/AggregationCommon.h>
namespace doris::vectorized {
namespace ColumnsHashing {
/// Polymorphic base for a context object associated with a HashMethod.
/// The context is meant to be shared between multiple threads, so every method added to it
/// must be thread-safe. It is intended to be used for caching.
class HashMethodContext {
public:
    /// Parameters passed to a hash method's createContext().
    struct Settings {
        size_t max_threads;
    };

    /// Virtual so that a concrete context can be destroyed through a base pointer.
    virtual ~HashMethodContext() = default;
};
/// Shared-ownership handle to a HashMethodContext; this is the type returned by createContext().
using HashMethodContextPtr = std::shared_ptr<HashMethodContext>;
namespace columns_hashing_impl {
/// Remembers the most recently processed value so that a repeated key can be recognised
/// without another hash table lookup.
///  - `value`: the remembered element (either a bare key or a key/mapped pair).
///  - `empty`: true until something has been stored in `value`.
///  - `found`: whether the remembered key was present in the hash table.
template <typename Value, bool consecutive_keys_optimization_>
struct LastElementCache {
    static constexpr bool consecutive_keys_optimization = consecutive_keys_optimization_;

    Value value;
    bool empty = true;
    bool found = false;

    /// Compares the whole remembered value with `candidate`; never matches while the cache is empty.
    bool check(const Value& candidate) {
        if (empty) {
            return false;
        }
        return value == candidate;
    }

    /// Compares only the key part (`value.first`) with `key`; never matches while the cache is empty.
    template <typename Key>
    bool check(const Key& key) {
        if (empty) {
            return false;
        }
        return value.first == key;
    }
};

/// Variant used when the consecutive-keys optimization is switched off: it stores nothing and
/// only exposes the compile-time flag.
template <typename Data>
struct LastElementCache<Data, false> {
    static constexpr bool consecutive_keys_optimization = false;
};
/// Outcome of an emplace into a hash table that has a mapped type.
/// It keeps two references to mapped storage: the primary one returned by get_mapped(), and a
/// second "cached" one that set_mapped() keeps in sync with the first.
template <typename Mapped>
class EmplaceResultImpl {
public:
    EmplaceResultImpl(Mapped& mapped, Mapped& cached_mapped, bool was_inserted)
            : mapped_ref(mapped), cached_ref(cached_mapped), inserted(was_inserted) {}

    /// True when the emplace created a new entry.
    bool is_inserted() const { return inserted; }

    /// Reference to the primary mapped storage.
    Mapped& get_mapped() const { return mapped_ref; }

    /// Writes `mapped` to the cached copy first and then to the primary storage.
    void set_mapped(const Mapped& mapped) {
        cached_ref = mapped;
        mapped_ref = mapped;
    }

private:
    Mapped& mapped_ref;
    Mapped& cached_ref;
    bool inserted;
};

/// Outcome of an emplace into a hash table without a mapped type: only the inserted flag.
template <>
class EmplaceResultImpl<void> {
public:
    explicit EmplaceResultImpl(bool was_inserted) : inserted(was_inserted) {}

    /// True when the emplace created a new entry.
    bool is_inserted() const { return inserted; }

private:
    bool inserted;
};
/// Outcome of a lookup in a hash table that has a mapped type: a pointer to the mapped value
/// together with a flag telling whether the key was found.
template <typename Mapped>
class FindResultImpl {
public:
    FindResultImpl(Mapped* mapped, bool was_found) : mapped_ptr(mapped), found(was_found) {}

    /// True when the key was present.
    bool is_found() const { return found; }

    /// Dereferences the stored pointer without checking it for null.
    Mapped& get_mapped() const { return *mapped_ptr; }

private:
    Mapped* mapped_ptr;
    bool found;
};

/// Outcome of a lookup in a hash table without a mapped type: only the found flag.
template <>
class FindResultImpl<void> {
public:
    explicit FindResultImpl(bool was_found) : found(was_found) {}

    /// True when the key was present.
    bool is_found() const { return found; }

private:
    bool found;
};
/// CRTP base class of the hash methods. `Derived` supplies get_key_holder(row, pool); this class
/// builds emplace / lazy-emplace / find / hash / prefetch operations on top of it and forwards
/// them to the hash table passed in as `data`.
///
/// Template parameters:
///  - Derived: the concrete hash method (CRTP).
///  - Value: type stored in the last-element cache.
///  - Mapped: mapped type of the hash table, or void when there is no mapped value.
///  - consecutive_keys_optimization: when true, the last processed entry is remembered in
///    `cache` and a repeated key is answered from it without consulting the hash table.
template <typename Derived, typename Value, typename Mapped, bool consecutive_keys_optimization>
class HashMethodBase {
public:
    using EmplaceResult = EmplaceResultImpl<Mapped>;
    using FindResult = FindResultImpl<Mapped>;
    using Cache = LastElementCache<Value, consecutive_keys_optimization>;

    static constexpr bool has_mapped = !std::is_same_v<Mapped, void>;

    /// Always returns a null context.
    static HashMethodContextPtr createContext(const HashMethodContext::Settings&) {
        return nullptr;
    }

    /// Emplaces the key of `row` into `data`.
    template <typename Data>
    ALWAYS_INLINE EmplaceResult emplace_key(Data& data, size_t row, Arena& pool) {
        auto& self = static_cast<Derived&>(*this);
        auto key_holder = self.get_key_holder(row, pool);
        return emplaceImpl(key_holder, data);
    }

    /// Emplaces the key of `row` into `data`, passing the caller-supplied `hash_value` along.
    template <typename Data>
    ALWAYS_INLINE EmplaceResult emplace_key(Data& data, size_t hash_value, size_t row,
                                            Arena& pool) {
        auto& self = static_cast<Derived&>(*this);
        auto key_holder = self.get_key_holder(row, pool);
        return emplaceImpl(key_holder, hash_value, data);
    }

    /// Lazily emplaces the key of `row`; `f` is forwarded to data.lazy_emplace().
    /// Returns a reference to the mapped value inside the hash table.
    template <typename Data, typename Func>
    ALWAYS_INLINE typename std::enable_if_t<has_mapped, Mapped>& lazy_emplace_key(Data& data,
                                                                                  size_t row,
                                                                                  Arena& pool,
                                                                                  Func&& f) {
        auto& self = static_cast<Derived&>(*this);
        auto key_holder = self.get_key_holder(row, pool);
        return lazy_emplace_impl(key_holder, data, std::forward<Func>(f));
    }

    /// Same as above, passing the caller-supplied `hash_value` along.
    template <typename Data, typename Func>
    ALWAYS_INLINE typename std::enable_if_t<has_mapped, Mapped>& lazy_emplace_key(
            Data& data, size_t hash_value, size_t row, Arena& pool, Func&& f) {
        auto& self = static_cast<Derived&>(*this);
        auto key_holder = self.get_key_holder(row, pool);
        return lazy_emplace_impl(key_holder, hash_value, data, std::forward<Func>(f));
    }

    /// Looks the key of `row` up in `data`.
    template <typename Data>
    ALWAYS_INLINE FindResult find_key(Data& data, size_t row, Arena& pool) {
        auto& self = static_cast<Derived&>(*this);
        auto key_holder = self.get_key_holder(row, pool);
        return find_key_impl(key_holder_get_key(key_holder), data);
    }

    /// Looks the key of `row` up in `data`, passing the caller-supplied `hash_value` along.
    template <typename Data>
    ALWAYS_INLINE FindResult find_key_with_hash(Data& data, size_t hash_value, size_t row,
                                                Arena& pool) {
        auto& self = static_cast<Derived&>(*this);
        auto key_holder = self.get_key_holder(row, pool);
        return find_key_impl(key_holder_get_key(key_holder), hash_value, data);
    }

    /// Returns data.hash() of the key of `row`.
    template <typename Data>
    ALWAYS_INLINE size_t get_hash(const Data& data, size_t row, Arena& pool) {
        auto& self = static_cast<Derived&>(*this);
        auto key_holder = self.get_key_holder(row, pool);
        return data.hash(key_holder_get_key(key_holder));
    }

    /// Asks `data` to prefetch for the key holder of `row`.
    template <typename Data>
    ALWAYS_INLINE void prefetch(Data& data, size_t row, Arena& pool) {
        auto& self = static_cast<Derived&>(*this);
        auto key_holder = self.get_key_holder(row, pool);
        data.prefetch(key_holder);
    }

    /// Asks `data` to prefetch for the key holder of `row`, forwarding the READ flag.
    template <bool READ, typename Data>
    ALWAYS_INLINE void prefetch(Data& data, size_t row, Arena& pool) {
        auto& self = static_cast<Derived&>(*this);
        auto key_holder = self.get_key_holder(row, pool);
        data.template prefetch<READ>(key_holder);
    }

    /// Asks `data` to prefetch for an already computed hash value, forwarding the READ flag.
    template <bool READ, typename Data>
    ALWAYS_INLINE void prefetch_by_hash(Data& data, size_t hash_value) {
        data.template prefetch_by_hash<READ>(hash_value);
    }

    /// Forwards to Derived::get_key_holder().
    ALWAYS_INLINE auto get_key_holder(size_t row, Arena& pool) {
        auto& self = static_cast<Derived&>(*this);
        return self.get_key_holder(row, pool);
    }

    /// Emplaces a key holder that the caller has already built, together with its hash value.
    template <typename Data, typename KeyHolder>
    ALWAYS_INLINE EmplaceResult emplace_key(Data& data, size_t hash_value, KeyHolder key_holder) {
        return emplaceImpl(key_holder, hash_value, data);
    }

protected:
    Cache cache;

    HashMethodBase() {
        if constexpr (consecutive_keys_optimization && has_mapped) {
            /// The cached value is a key/mapped pair here; assign both halves explicitly
            /// (the original note says the pair type does not initialize its elements).
            cache.value.second = Mapped();
            cache.value.first = {};
        } else if constexpr (consecutive_keys_optimization) {
            cache.value = Value();
        }
    }

    /// Emplace without a precomputed hash. A key equal to the cached one that is known to be in
    /// the table is answered from the cache; otherwise the table is consulted.
    template <typename Data, typename KeyHolder>
    ALWAYS_INLINE EmplaceResult emplaceImpl(KeyHolder& key_holder, Data& data) {
        if constexpr (Cache::consecutive_keys_optimization) {
            if (cache.found && cache.check(key_holder_get_key(key_holder))) {
                if constexpr (has_mapped) {
                    return EmplaceResult(cache.value.second, cache.value.second, false);
                } else {
                    return EmplaceResult(false);
                }
            }
        }

        typename Data::LookupResult slot;
        bool is_new = false;
        data.emplace(key_holder, slot, is_new);
        return finish_emplace(slot, is_new);
    }

    /// Emplace with a precomputed hash; otherwise identical to the overload above.
    template <typename Data, typename KeyHolder>
    ALWAYS_INLINE EmplaceResult emplaceImpl(KeyHolder& key_holder, size_t hash_value, Data& data) {
        if constexpr (Cache::consecutive_keys_optimization) {
            if (cache.found && cache.check(key_holder_get_key(key_holder))) {
                if constexpr (has_mapped) {
                    return EmplaceResult(cache.value.second, cache.value.second, false);
                } else {
                    return EmplaceResult(false);
                }
            }
        }

        typename Data::LookupResult slot;
        bool is_new = false;
        data.emplace(key_holder, slot, hash_value, is_new);
        return finish_emplace(slot, is_new);
    }

    /// Lazy emplace without a precomputed hash; returns the mapped value stored in the table.
    template <typename Data, typename KeyHolder, typename Func>
    ALWAYS_INLINE typename std::enable_if_t<has_mapped, Mapped>& lazy_emplace_impl(
            KeyHolder& key_holder, Data& data, Func&& f) {
        typename Data::LookupResult slot;
        data.lazy_emplace(key_holder, slot, std::forward<Func>(f));
        return *lookup_result_get_mapped(slot);
    }

    /// Lazy emplace with a precomputed hash; returns the mapped value stored in the table.
    template <typename Data, typename KeyHolder, typename Func>
    ALWAYS_INLINE typename std::enable_if_t<has_mapped, Mapped>& lazy_emplace_impl(
            KeyHolder& key_holder, size_t hash_value, Data& data, Func&& f) {
        typename Data::LookupResult slot;
        data.lazy_emplace(key_holder, slot, hash_value, std::forward<Func>(f));
        return *lookup_result_get_mapped(slot);
    }

    /// Find without a precomputed hash. A key equal to the cached one is answered from the cache
    /// (including a remembered "not found"); otherwise the table is consulted.
    template <typename Data, typename Key>
    ALWAYS_INLINE FindResult find_key_impl(Key key, Data& data) {
        if constexpr (Cache::consecutive_keys_optimization) {
            if (cache.check(key)) {
                if constexpr (has_mapped) {
                    return FindResult(&cache.value.second, cache.found);
                } else {
                    return FindResult(cache.found);
                }
            }
        }

        auto slot = data.find(key);
        return finish_find(key, slot);
    }

    /// Find with a precomputed hash; otherwise identical to the overload above.
    template <typename Data, typename Key>
    ALWAYS_INLINE FindResult find_key_impl(Key key, size_t hash_value, Data& data) {
        if constexpr (Cache::consecutive_keys_optimization) {
            if (cache.check(key)) {
                if constexpr (has_mapped) {
                    return FindResult(&cache.value.second, cache.found);
                } else {
                    return FindResult(cache.found);
                }
            }
        }

        auto slot = data.find(key, hash_value);
        return finish_find(key, slot);
    }

private:
    /// Shared tail of both emplaceImpl() overloads: default-constructs the mapped value of a
    /// freshly inserted entry, refreshes the last-element cache and builds the result.
    template <typename LookupResult>
    ALWAYS_INLINE EmplaceResult finish_emplace(LookupResult& it, bool inserted) {
        if constexpr (has_mapped) {
            Mapped* cached = lookup_result_get_mapped(it);
            if (inserted) {
                new (lookup_result_get_mapped(it)) Mapped();
            }
            if constexpr (consecutive_keys_optimization) {
                cache.found = true;
                cache.empty = false;
                cache.value.first = *lookup_result_get_key(it);
                cache.value.second = *lookup_result_get_mapped(it);
                /// From now on the "cached" side of the result is the cache's own copy.
                cached = &cache.value.second;
            }
            return EmplaceResult(*lookup_result_get_mapped(it), *cached, inserted);
        } else {
            if constexpr (consecutive_keys_optimization) {
                cache.found = true;
                cache.empty = false;
                cache.value = *lookup_result_get_key(it);
            }
            return EmplaceResult(inserted);
        }
    }

    /// Shared tail of both find_key_impl() overloads: records the looked-up key (and whether it
    /// was found) in the last-element cache and builds the result.
    template <typename Key, typename LookupResult>
    ALWAYS_INLINE FindResult finish_find(Key& key, LookupResult& it) {
        if constexpr (has_mapped) {
            if constexpr (consecutive_keys_optimization) {
                cache.found = it != nullptr;
                cache.empty = false;
                cache.value.first = key;
                if (it) {
                    cache.value.second = *lookup_result_get_mapped(it);
                }
            }
            return FindResult(it ? lookup_result_get_mapped(it) : nullptr, it != nullptr);
        } else {
            if constexpr (consecutive_keys_optimization) {
                cache.found = it != nullptr;
                cache.empty = false;
                cache.value = key;
            }
            return FindResult(it != nullptr);
        }
    }
};
/// Array of mapped values of type T; adds nothing on top of PaddedPODArray<T>.
template <typename T>
struct MappedCache : public PaddedPODArray<T> {};
/// With no mapped type (T = void) there is nothing to store, so the cache is an empty struct.
template <>
struct MappedCache<void> {};
/// This class provides the functionality that is required for supporting nullable keys in
/// HashMethodKeysFixed. It is only declared here; the two specializations below are selected by
/// `has_nullable_keys`. With nullable keys it separates every key column into its nested column
/// and its null map and can build a per-row null bitmap. Without nullable keys it is a thin
/// wrapper that only stores the key columns, and calling create_bitmap() on it is a fatal error.
/// `Key` parameterizes the KeysNullMap<Key> type returned by create_bitmap().
template <typename Key, bool has_nullable_keys>
class BaseStateKeysFixed;
/// Case where nullable keys are supported.
template <typename Key>
class BaseStateKeysFixed<Key, true> {
protected:
    /// Splits every key column into its value column and its null map.
    /// A column that is not a ColumnNullable is kept as is and gets a null entry in `null_maps`.
    BaseStateKeysFixed(const ColumnRawPtrs& key_columns) {
        const size_t column_count = key_columns.size();
        null_maps.reserve(column_count);
        actual_columns.reserve(column_count);

        for (const auto& column : key_columns) {
            auto* nullable = check_and_get_column<ColumnNullable>(column);
            if (nullable == nullptr) {
                actual_columns.push_back(column);
                null_maps.push_back(nullptr);
                continue;
            }
            actual_columns.push_back(&nullable->get_nested_column());
            null_maps.push_back(&nullable->get_null_map_column());
        }
    }

    /// Returns the columns that actually hold the key values: the nested column for a nullable
    /// key column, the key column itself otherwise.
    const ColumnRawPtrs& get_actual_columns() const { return actual_columns; }

    /// Builds the null bitmap of one row: bit (k % 8) of byte (k / 8) is set when key column k
    /// has a null map whose entry for `row` equals 1.
    KeysNullMap<Key> create_bitmap(size_t row) const {
        KeysNullMap<Key> bitmap {};
        for (size_t k = 0; k < null_maps.size(); ++k) {
            if (null_maps[k] == nullptr) {
                /// Not a nullable column: its bit stays cleared.
                continue;
            }
            const auto& null_map = assert_cast<const ColumnUInt8&>(*null_maps[k]).get_data();
            if (null_map[row] != 1) {
                continue;
            }
            bitmap[k / 8] |= UInt8(1) << (k % 8);
        }
        return bitmap;
    }

private:
    ColumnRawPtrs actual_columns;
    ColumnRawPtrs null_maps;
};
/// Case where nullable keys are not supported.
/// This specialization only stores the key columns; there are no null maps, so asking it for
/// a null bitmap is treated as an internal error.
template <typename Key>
class BaseStateKeysFixed<Key, false> {
protected:
    /// Keeps a copy of the given column pointers; they are used as the actual key columns as is.
    BaseStateKeysFixed(const ColumnRawPtrs& columns) : actual_columns(columns) {}

    /// Returns the key columns passed to the constructor.
    const ColumnRawPtrs& get_actual_columns() const { return actual_columns; }

    /// Must never be called for non-nullable keys: logs a fatal error.
    KeysNullMap<Key> create_bitmap(size_t) const {
        LOG(FATAL) << "Internal error: calling create_bitmap() for non-nullable keys is forbidden";
        /// LOG(FATAL) is expected to terminate the process. The explicit return makes sure this
        /// value-returning function never flows off its end (which would be undefined
        /// behavior) even if the logging call were ever to return.
        return KeysNullMap<Key> {};
    }

private:
    ColumnRawPtrs actual_columns;
};
} // namespace columns_hashing_impl
} // namespace ColumnsHashing
} // namespace doris::vectorized