243 lines
10 KiB
C++
243 lines
10 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
// This file is copied from
|
|
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/ColumnsHashing.h
|
|
// and modified by Doris
|
|
|
|
#pragma once
|
|
|
|
#include <memory>
|
|
|
|
#include "vec/columns/column_string.h"
|
|
#include "vec/common/arena.h"
|
|
#include "vec/common/assert_cast.h"
|
|
#include "vec/common/columns_hashing_impl.h"
|
|
#include "vec/common/hash_table/hash_table.h"
|
|
#include "vec/common/hash_table/hash_table_key_holder.h"
|
|
#include "vec/common/unaligned.h"
|
|
|
|
namespace doris::vectorized {
|
|
|
|
namespace ColumnsHashing {
|
|
|
|
/// For the case when there is one numeric key.
|
|
/// UInt8/16/32/64 for any type with corresponding bit width.
|
|
template <typename Value, typename Mapped, typename FieldType, bool use_cache = true>
|
|
struct HashMethodOneNumber : public columns_hashing_impl::HashMethodBase<
|
|
HashMethodOneNumber<Value, Mapped, FieldType, use_cache>,
|
|
Value, Mapped, use_cache> {
|
|
using Self = HashMethodOneNumber<Value, Mapped, FieldType, use_cache>;
|
|
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
|
|
|
|
const char* vec;
|
|
|
|
/// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise.
|
|
HashMethodOneNumber(const ColumnRawPtrs& key_columns, const Sizes& /*key_sizes*/,
|
|
const HashMethodContextPtr&) {
|
|
vec = key_columns[0]->get_raw_data().data;
|
|
}
|
|
|
|
HashMethodOneNumber(const IColumn* column) { vec = column->get_raw_data().data; }
|
|
|
|
/// Creates context. Method is called once and result context is used in all threads.
|
|
using Base::createContext; /// (const HashMethodContext::Settings &) -> HashMethodContextPtr
|
|
|
|
/// Emplace key into HashTable or HashMap. If Data is HashMap, returns ptr to value, otherwise nullptr.
|
|
/// Data is a HashTable where to insert key from column's row.
|
|
/// For Serialized method, key may be placed in pool.
|
|
using Base::emplace_key; /// (Data & data, size_t row, Arena & pool) -> EmplaceResult
|
|
|
|
/// Find key into HashTable or HashMap. If Data is HashMap and key was found, returns ptr to value, otherwise nullptr.
|
|
using Base::find_key; /// (Data & data, size_t row, Arena & pool) -> FindResult
|
|
|
|
/// Get hash value of row.
|
|
using Base::get_hash; /// (const Data & data, size_t row, Arena & pool) -> size_t
|
|
|
|
/// Is used for default implementation in HashMethodBase.
|
|
FieldType get_key_holder(size_t row, Arena&) const {
|
|
return unaligned_load<FieldType>(vec + row * sizeof(FieldType));
|
|
}
|
|
};
|
|
|
|
/// For the case when there is one string key.
|
|
template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true>
|
|
struct HashMethodString : public columns_hashing_impl::HashMethodBase<
|
|
HashMethodString<Value, Mapped, place_string_to_arena, use_cache>,
|
|
Value, Mapped, use_cache> {
|
|
using Self = HashMethodString<Value, Mapped, place_string_to_arena, use_cache>;
|
|
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
|
|
|
|
const IColumn::Offset* offsets;
|
|
const UInt8* chars;
|
|
|
|
HashMethodString(const ColumnRawPtrs& key_columns, const Sizes& /*key_sizes*/,
|
|
const HashMethodContextPtr&) {
|
|
const IColumn& column = *key_columns[0];
|
|
const ColumnString& column_string = assert_cast<const ColumnString&>(column);
|
|
offsets = column_string.get_offsets().data();
|
|
chars = column_string.get_chars().data();
|
|
}
|
|
|
|
auto get_key_holder(ssize_t row, [[maybe_unused]] Arena& pool) const {
|
|
StringRef key(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1);
|
|
|
|
if constexpr (place_string_to_arena) {
|
|
return ArenaKeyHolder {key, pool};
|
|
} else {
|
|
return key;
|
|
}
|
|
}
|
|
|
|
protected:
|
|
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
|
|
};
|
|
|
|
/** Hash by concatenating serialized key values.
|
|
* The serialized value differs in that it uniquely allows to deserialize it, having only the position with which it starts.
|
|
* That is, for example, for strings, it contains first the serialized length of the string, and then the bytes.
|
|
* Therefore, when aggregating by several strings, there is no ambiguity.
|
|
*/
|
|
template <typename Value, typename Mapped>
|
|
struct HashMethodSerialized
|
|
: public columns_hashing_impl::HashMethodBase<HashMethodSerialized<Value, Mapped>, Value,
|
|
Mapped, false> {
|
|
using Self = HashMethodSerialized<Value, Mapped>;
|
|
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
|
|
|
|
ColumnRawPtrs key_columns;
|
|
size_t keys_size;
|
|
|
|
HashMethodSerialized(const ColumnRawPtrs& key_columns_, const Sizes& /*key_sizes*/,
|
|
const HashMethodContextPtr&)
|
|
: key_columns(key_columns_), keys_size(key_columns_.size()) {}
|
|
|
|
protected:
|
|
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
|
|
|
|
ALWAYS_INLINE SerializedKeyHolder get_key_holder(size_t row, Arena& pool) const {
|
|
return SerializedKeyHolder {
|
|
serialize_keys_to_pool_contiguous(row, keys_size, key_columns, pool), pool};
|
|
}
|
|
};
|
|
|
|
/// For the case when there is one string key.
|
|
template <typename Value, typename Mapped, bool use_cache = true>
|
|
struct HashMethodHashed
|
|
: public columns_hashing_impl::HashMethodBase<HashMethodHashed<Value, Mapped, use_cache>,
|
|
Value, Mapped, use_cache> {
|
|
using Key = UInt128;
|
|
using Self = HashMethodHashed<Value, Mapped, use_cache>;
|
|
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
|
|
|
|
ColumnRawPtrs key_columns;
|
|
|
|
HashMethodHashed(ColumnRawPtrs key_columns_, const Sizes&, const HashMethodContextPtr&)
|
|
: key_columns(std::move(key_columns_)) {}
|
|
|
|
ALWAYS_INLINE Key get_key_holder(size_t row, Arena&) const {
|
|
return hash128(row, key_columns.size(), key_columns);
|
|
}
|
|
};
|
|
|
|
/// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
|
|
template <typename Value, typename Key, typename Mapped, bool has_nullable_keys_ = false,
|
|
bool use_cache = true>
|
|
struct HashMethodKeysFixed
|
|
: private columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>,
|
|
public columns_hashing_impl::HashMethodBase<
|
|
HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, use_cache>, Value,
|
|
Mapped, use_cache> {
|
|
using Self = HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, use_cache>;
|
|
using BaseHashed = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
|
|
using Base = columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>;
|
|
|
|
const Sizes& key_sizes;
|
|
size_t keys_size;
|
|
|
|
HashMethodKeysFixed(const ColumnRawPtrs& key_columns, const Sizes& key_sizes_,
|
|
const HashMethodContextPtr&)
|
|
: Base(key_columns), key_sizes(key_sizes_), keys_size(key_columns.size()) {}
|
|
|
|
ALWAYS_INLINE Key get_key_holder(size_t row, Arena&) const {
|
|
if constexpr (has_nullable_keys_) {
|
|
auto bitmap = Base::create_bitmap(row);
|
|
return pack_fixed<Key>(row, keys_size, Base::get_actual_columns(), key_sizes, bitmap);
|
|
} else {
|
|
return pack_fixed<Key>(row, keys_size, Base::get_actual_columns(), key_sizes);
|
|
}
|
|
}
|
|
};
|
|
|
|
template <typename SingleColumnMethod, typename Mapped, bool use_cache>
|
|
struct HashMethodSingleLowNullableColumn : public SingleColumnMethod {
|
|
using Base = SingleColumnMethod;
|
|
|
|
static constexpr bool has_mapped = !std::is_same<Mapped, void>::value;
|
|
using EmplaceResult = columns_hashing_impl::EmplaceResultImpl<Mapped>;
|
|
using FindResult = columns_hashing_impl::FindResultImpl<Mapped>;
|
|
|
|
static HashMethodContextPtr createContext(const HashMethodContext::Settings & settings) {
|
|
return nullptr;
|
|
}
|
|
|
|
ColumnRawPtrs key_columns;
|
|
|
|
static const ColumnRawPtrs get_nested_column(const IColumn* col) {
|
|
auto* nullable = check_and_get_column<ColumnNullable>(*col);
|
|
DCHECK(nullable != nullptr);
|
|
const auto nested_col = nullable->get_nested_column_ptr().get();
|
|
return {nested_col};
|
|
}
|
|
|
|
HashMethodSingleLowNullableColumn(
|
|
const ColumnRawPtrs & key_columns_nullable, const Sizes & key_sizes, const HashMethodContextPtr & context)
|
|
: Base(get_nested_column(key_columns_nullable[0]), key_sizes, context), key_columns(key_columns_nullable) {
|
|
}
|
|
|
|
template <typename Data>
|
|
ALWAYS_INLINE EmplaceResult emplace_key(Data & data, size_t row, Arena & pool) {
|
|
if (key_columns[0]->is_null_at(row)) {
|
|
bool has_null_key = data.has_null_key_data();
|
|
data.has_null_key_data() = true;
|
|
|
|
if constexpr (has_mapped)
|
|
return EmplaceResult(data.get_null_key_data(), data.get_null_key_data(), !has_null_key);
|
|
else
|
|
return EmplaceResult(!has_null_key);
|
|
}
|
|
|
|
auto key_holder = Base::get_key_holder(row, pool);
|
|
|
|
bool inserted = false;
|
|
typename Data::LookupResult it;
|
|
data.emplace(key_holder, it, inserted);
|
|
|
|
if constexpr (has_mapped) {
|
|
auto & mapped = *lookup_result_get_mapped(it);
|
|
if (inserted) {
|
|
new (&mapped) Mapped();
|
|
}
|
|
return EmplaceResult(mapped, mapped, inserted);
|
|
}
|
|
else
|
|
return EmplaceResult(inserted);
|
|
}
|
|
};
|
|
|
|
} // namespace ColumnsHashing
|
|
} // namespace doris::vectorized
|