// File: doris/be/src/vec/columns/predicate_column.h
// (493 lines, 20 KiB, C++)

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "olap/decimal12.h"
#include "olap/uint24.h"
#include "runtime/string_value.h"
#include "vec/columns/column.h"
#include "vec/columns/column_decimal.h"
#include "vec/columns/column_impl.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/core/types.h"
namespace doris::vectorized {
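/**
 * Column used by the storage layer to hold the values of a predicate column.
 *
 * Type = primitive type of the predicate column; the element type actually stored
 * is PredicatePrimitiveTypeTraits<Type>::PredicateFieldType.
 */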
template <PrimitiveType Type>
class PredicateColumnType final : public COWHelper<IColumn, PredicateColumnType<Type>> {
private:
// Creates an empty column. Declared in the private section; COWHelper is befriended below.
PredicateColumnType() {}
// Creates a column whose backing array is constructed with `n`.
PredicateColumnType(const size_t n) : data(n) {}
// Gives COWHelper access to the private constructors.
friend class COWHelper<IColumn, PredicateColumnType<Type>>;
// Element type stored for the primitive type `Type`.
using T = typename PredicatePrimitiveTypeTraits<Type>::PredicateFieldType;
// Copies only the element array; `_pool` is not copied and starts out empty in the new column.
// NOTE(review): for StringValue elements the copied entries presumably still point at the
// bytes owned by the source column's pool -- confirm lifetime expectations with callers.
PredicateColumnType(const PredicateColumnType& src) : data(src.data.begin(), src.data.end()) {}
uint64_t get_date_at(uint16_t idx) {
const T val = data[idx];
const char* val_ptr = reinterpret_cast<const char*>(&val);
uint64_t value = 0;
value = *(unsigned char*)(val_ptr + 2);
value <<= 8;
value |= *(unsigned char*)(val_ptr + 1);
value <<= 8;
value |= *(unsigned char*)(val_ptr);
return value;
}
void insert_date_to_res_column(const uint16_t* sel, size_t sel_size,
vectorized::ColumnVector<Int64>* res_ptr) {
for (size_t i = 0; i < sel_size; i++) {
VecDateTimeValue date = VecDateTimeValue::create_from_olap_date(get_date_at(sel[i]));
res_ptr->insert_data(reinterpret_cast<char*>(&date), 0);
}
}
// For every selected row, loads the stored date value, converts it with
// VecDateTimeValue::set_olap_date() and appends the result to `res_ptr` as an Int64.
//
// sel      - indices into `data` of the rows to emit
// sel_size - number of entries in `sel`
// res_ptr  - destination column; the converted values are appended to it
void insert_date32_to_res_column(const uint16_t* sel, size_t sel_size,
                                 vectorized::ColumnVector<Int64>* res_ptr) {
    // The loop appends with push_back_without_reserve(), so capacity has to cover the
    // elements already present in the destination plus the ones added here.
    res_ptr->reserve(res_ptr->size() + sel_size);
    auto& res_data = res_ptr->get_data();
    for (size_t i = 0; i < sel_size; ++i) {
        uint64_t val = data[sel[i]];
        VecDateTimeValue date;
        date.set_olap_date(val);
        res_data.push_back_without_reserve(
                unaligned_load<Int64>(reinterpret_cast<char*>(&date)));
    }
}
void insert_datetime_to_res_column(const uint16_t* sel, size_t sel_size,
vectorized::ColumnVector<Int64>* res_ptr) {
for (size_t i = 0; i < sel_size; i++) {
uint64_t value = data[sel[i]];
vectorized::VecDateTimeValue datetime =
VecDateTimeValue::create_from_olap_datetime(value);
res_ptr->insert_data(reinterpret_cast<char*>(&datetime), 0);
}
}
// Appends the string referenced by each selected StringValue element to `res_ptr`.
void insert_string_to_res_column(const uint16_t* sel, size_t sel_size,
                                 vectorized::ColumnString* res_ptr) {
    for (size_t pos = 0; pos != sel_size; ++pos) {
        const auto& str = reinterpret_cast<StringValue&>(data[sel[pos]]);
        res_ptr->insert_data(str.ptr, str.len);
    }
}
void insert_decimal_to_res_column(const uint16_t* sel, size_t sel_size,
vectorized::ColumnDecimal<Decimal128>* res_ptr) {
for (size_t i = 0; i < sel_size; i++) {
uint16_t n = sel[i];
auto& dv = reinterpret_cast<const decimal12_t&>(data[n]);
DecimalV2Value dv_data(dv.integer, dv.fraction);
res_ptr->insert_data(reinterpret_cast<char*>(&dv_data), 0);
}
}
// Copies the selected elements unchanged into `res_ptr`; the destination element type Y
// must be exactly the stored type T (enforced at compile time).
//
// sel      - indices into `data` of the rows to emit
// sel_size - number of entries in `sel`
// res_ptr  - destination column; expected to be empty (checked by DCHECK)
template <typename Y>
void insert_default_value_res_column(const uint16_t* sel, size_t sel_size,
                                     vectorized::ColumnVector<Y>* res_ptr) {
    static_assert(std::is_same_v<T, Y>);
    auto& res_data = res_ptr->get_data();
    DCHECK(res_data.empty());
    // Values are written straight past the current end pointer, so capacity must cover
    // the existing elements too. This matters when the DCHECK above is compiled out and
    // the destination is not actually empty.
    res_data.reserve(res_data.size() + sel_size);
    Y* out = reinterpret_cast<Y*>(res_data.get_end_ptr());
    for (size_t i = 0; i < sel_size; ++i) {
        out[i] = T(data[sel[i]]);
    }
    res_data.set_end_ptr(out + sel_size);
}
// Appends each selected element to `res_ptr` through the generic insert_data() call,
// passing a pointer to the stored element and a length of 0.
void insert_byte_to_res_column(const uint16_t* sel, size_t sel_size,
                               vectorized::IColumn* res_ptr) {
    for (size_t pos = 0; pos != sel_size; ++pos) {
        res_ptr->insert_data(reinterpret_cast<char*>(&data[sel[pos]]), 0);
    }
}
// note(wb): Write data one by one has a slight performance improvement than memcpy directly
void insert_many_default_type(const char* data_ptr, size_t num) {
T* input_val_ptr = (T*)data_ptr;
T* res_val_ptr = (T*)data.get_end_ptr();
for (int i = 0; i < num; i++) {
res_val_ptr[i] = input_val_ptr[i];
}
res_val_ptr += num;
data.set_end_ptr(res_val_ptr);
}
// Appends `num` elements by copying num * sizeof(T) raw bytes from `data_ptr` to the
// current end of `data`, then advances the end pointer by the same amount.
void insert_many_in_copy_way(const char* data_ptr, size_t num) {
    const size_t byte_count = num * sizeof(T);
    char* dst = reinterpret_cast<char*>(data.get_end_ptr());
    memcpy(dst, data_ptr, byte_count);
    data.set_end_ptr(dst + byte_count);
}
public:
using Self = PredicateColumnType;
// Element type stored in the column (same as T).
using value_type = T;
// Type of the backing array that holds the elements.
using Container = PaddedPODArray<value_type>;
// Always reports false, whatever the element type is.
bool is_numeric() const override { return false; }
// Marks this column as a predicate column.
bool is_predicate_column() const override { return true; }
// Number of elements currently held in `data`.
size_t size() const override { return data.size(); }
// Generic IColumn operations that this column type does not implement. Each one only
// emits a FATAL log message naming the unsupported call; the arguments are ignored.
[[noreturn]] StringRef get_data_at(size_t /*n*/) const override {
    LOG(FATAL) << "get_data_at not supported in PredicateColumnType";
}

void insert_from(const IColumn& /*src*/, size_t /*n*/) override {
    LOG(FATAL) << "insert_from not supported in PredicateColumnType";
}

void insert_range_from(const IColumn& /*src*/, size_t /*start*/, size_t /*length*/) override {
    LOG(FATAL) << "insert_range_from not supported in PredicateColumnType";
}

void insert_indices_from(const IColumn& /*src*/, const int* /*indices_begin*/,
                         const int* /*indices_end*/) override {
    LOG(FATAL) << "insert_indices_from not supported in PredicateColumnType";
}

void pop_back(size_t /*n*/) override {
    LOG(FATAL) << "pop_back not supported in PredicateColumnType";
}

void update_hash_with_value(size_t /*n*/, SipHash& /*hash*/) const override {
    LOG(FATAL) << "update_hash_with_value not supported in PredicateColumnType";
}
// Appends a StringValue that refers to (does not copy) the `length` bytes at `data_ptr`.
// Uses push_back_without_reserve, so no capacity check happens here.
void insert_string_value(const char* data_ptr, size_t length) {
    data.push_back_without_reserve(StringValue(const_cast<char*>(data_ptr), length));
}
// Appends one decimal12_t decoded from `data_ptr`: an int64_t integer part followed
// immediately by an int32_t fraction part.
//
// data_ptr - start of the 12 encoded bytes; no particular alignment is required
// length   - unused
void insert_decimal_value(const char* data_ptr, size_t /*length*/) {
    // memcpy into aligned locals rather than dereferencing casted pointers: the source
    // is a plain byte buffer, so a direct int64_t/int32_t load could be misaligned.
    int64_t integer = 0;
    int32_t fraction = 0;
    memcpy(&integer, data_ptr, sizeof(integer));
    memcpy(&fraction, data_ptr + sizeof(integer), sizeof(fraction));

    decimal12_t dc12_value;
    dc12_value.integer = integer;
    dc12_value.fraction = fraction;
    data.push_back_without_reserve(dc12_value);
}
// used for int128
void insert_in_copy_way(const char* data_ptr, size_t length) {
T val {};
memcpy(&val, data_ptr, sizeof(val));
data.push_back_without_reserve(val);
}
void insert_default_type(const char* data_ptr, size_t length) {
T* val = (T*)data_ptr;
data.push_back_without_reserve(*val);
}
// Appends a single value, choosing the decoding routine at compile time from the
// stored element type T.
void insert_data(const char* data_ptr, size_t length) override {
    constexpr bool holds_string = std::is_same_v<T, StringValue>;
    constexpr bool holds_decimal = std::is_same_v<T, decimal12_t>;
    constexpr bool holds_int128 = std::is_same_v<T, doris::vectorized::Int128>;

    if constexpr (holds_string) {
        insert_string_value(data_ptr, length);
    } else if constexpr (holds_decimal) {
        insert_decimal_value(data_ptr, length);
    } else if constexpr (holds_int128) {
        insert_in_copy_way(data_ptr, length);
    } else {
        insert_default_type(data_ptr, length);
    }
}
// Appends `num` dates given as packed uint24_t values, widening each one into a
// zero-filled uint32_t-sized slot at the end of `data`.
// NOTE(review): writes sizeof(uint32_t) bytes per element past the current end pointer
// without a capacity check -- presumably T is 4 bytes wide here and the caller has
// already reserved room; confirm.
//
// data_ptr - start of num * sizeof(uint24_t) input bytes
// num      - number of dates to append
void insert_many_date(const char* data_ptr, size_t num) {
    constexpr size_t input_type_size = sizeof(uint24_t);
    constexpr size_t res_type_size = sizeof(uint32_t);
    const char* src = data_ptr;
    char* dst = reinterpret_cast<char*>(data.get_end_ptr());
    // Clear the whole destination range first so the bytes memcpy does not write are 0.
    memset(dst, 0, res_type_size * num);
    for (size_t i = 0; i < num; ++i) {
        memcpy(dst, src, input_type_size);
        dst += res_type_size;
        src += input_type_size;
    }
    data.set_end_ptr(dst);
}
// Appends `num` fixed-length values from `data_ptr`, choosing the bulk routine at
// compile time: raw byte copy for decimal12_t and Int128, uint24 widening for
// TYPE_DATE, element-wise copy otherwise. Does nothing when T is StringValue.
void insert_many_fix_len_data(const char* data_ptr, size_t num) override {
    if constexpr (std::is_same_v<T, StringValue>) {
        // here is unreachable, just for compilation to be able to pass
    } else if constexpr (std::is_same_v<T, decimal12_t> ||
                         std::is_same_v<T, doris::vectorized::Int128>) {
        insert_many_in_copy_way(data_ptr, num);
    } else if constexpr (Type == TYPE_DATE) {
        insert_many_date(data_ptr, num);
    } else {
        insert_many_default_type(data_ptr, num);
    }
}
void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict,
size_t num, uint32_t /*dict_num*/) override {
if constexpr (std::is_same_v<T, StringValue>) {
for (size_t end_index = start_index + num; start_index < end_index; ++start_index) {
int32_t codeword = data_array[start_index];
insert_string_value(dict[codeword].data, dict[codeword].size);
}
}
}
// Appends `num` strings and copies their bytes into memory taken from `_pool`.
// Entry i occupies len_array[i] bytes starting at data_array + start_offset_array[i].
// One pool allocation of the summed length holds all of the copies back to back.
// Does nothing unless T is StringValue.
void insert_many_binary_data(char* data_array, uint32_t* len_array,
                             uint32_t* start_offset_array, size_t num) override {
    if constexpr (std::is_same_v<T, StringValue>) {
        // The pool is created on first use.
        if (!_pool) {
            _pool.reset(new MemPool());
        }

        size_t bytes_needed = 0;
        for (size_t row = 0; row != num; ++row) {
            bytes_needed += len_array[row];
        }

        char* cursor = reinterpret_cast<char*>(_pool->allocate(bytes_needed));
        for (size_t row = 0; row != num; ++row) {
            uint32_t row_len = len_array[row];
            memcpy(cursor, data_array + start_offset_array[row], row_len);
            data.push_back_without_reserve(StringValue(cursor, row_len));
            cursor += row_len;
        }
    }
}
// Appends one value-initialized element, T(). Uses push_back rather than the
// push_back_without_reserve call used by the other insert paths.
void insert_default() override { data.push_back(T()); }
// Removes all elements and, if a string pool has been created, clears it as well.
void clear() override {
    data.clear();
    if (_pool) {
        _pool->clear();
    }
}
// Size of the stored elements only: element count times sizeof(T).
size_t byte_size() const override { return data.size() * sizeof(T); }
// Reports the same value as byte_size(); memory held by `_pool` is not included.
size_t allocated_bytes() const override { return byte_size(); }
// No-op for this column type.
void protect() override {}
// Not implemented for this column type: only emits a FATAL log message.
void get_permutation(bool /*reverse*/, size_t /*limit*/, int /*nan_direction_hint*/,
                     IColumn::Permutation& /*res*/) const override {
    LOG(FATAL) << "get_permutation not supported in PredicateColumnType";
}
// Forwards the capacity request to the backing array.
// NOTE(review): several insert paths append without any capacity check, so callers
// presumably need to call this first -- confirm.
void reserve(size_t n) override { data.reserve(n); }
// Name reported for this column: TypeName<T>::get() for the stored element type.
const char* get_family_name() const override { return TypeName<T>::get(); }
// Cloning and Field-based access are not implemented for this column type; each call
// only emits a FATAL log message and ignores its arguments.
[[noreturn]] MutableColumnPtr clone_resized(size_t /*size*/) const override {
    LOG(FATAL) << "clone_resized not supported in PredicateColumnType";
}

void insert(const Field& /*x*/) override {
    LOG(FATAL) << "insert not supported in PredicateColumnType";
}

[[noreturn]] Field operator[](size_t /*n*/) const override {
    LOG(FATAL) << "operator[] not supported in PredicateColumnType";
}

void get(size_t /*n*/, Field& /*res*/) const override {
    LOG(FATAL) << "get field not supported in PredicateColumnType";
}
// Typed scalar accessors; none is implemented for this column type. Each call emits a
// FATAL log message that names the accessor invoked, so the log identifies the
// offending call site.
[[noreturn]] UInt64 get64(size_t /*n*/) const override {
    LOG(FATAL) << "get64 not supported in PredicateColumnType";
}

[[noreturn]] Float64 get_float64(size_t /*n*/) const override {
    LOG(FATAL) << "get_float64 not supported in PredicateColumnType";
}

[[noreturn]] UInt64 get_uint(size_t /*n*/) const override {
    LOG(FATAL) << "get_uint not supported in PredicateColumnType";
}

[[noreturn]] bool get_bool(size_t /*n*/) const override {
    LOG(FATAL) << "get_bool not supported in PredicateColumnType";
}

[[noreturn]] Int64 get_int(size_t /*n*/) const override {
    LOG(FATAL) << "get_int not supported in PredicateColumnType";
}
// it's impossable to use ComplexType as key , so we don't have to implemnt them
[[noreturn]] StringRef serialize_value_into_arena(size_t n, Arena& arena,
char const*& begin) const override {
LOG(FATAL) << "serialize_value_into_arena not supported in PredicateColumnType";
}
[[noreturn]] const char* deserialize_and_insert_from_arena(const char* pos) override {
LOG(FATAL) << "deserialize_and_insert_from_arena not supported in PredicateColumnType";
}
[[noreturn]] int compare_at(size_t n, size_t m, const IColumn& rhs,
int nan_direction_hint) const override {
LOG(FATAL) << "compare_at not supported in PredicateColumnType";
}
void get_extremes(Field& min, Field& max) const override {
LOG(FATAL) << "get_extremes not supported in PredicateColumnType";
}
// Reports that this column may be wrapped by a nullable column.
bool can_be_inside_nullable() const override { return true; }
// Reports the elements as fixed-size and contiguous.
bool is_fixed_and_contiguous() const override { return true; }
// Size in bytes of one stored element.
size_t size_of_value_if_fixed() const override { return sizeof(T); }
// Raw-data access, structural comparison, filtering and permutation are not
// implemented for this column type; each call only emits a FATAL log message.
[[noreturn]] StringRef get_raw_data() const override {
    LOG(FATAL) << "get_raw_data not supported in PredicateColumnType";
}

[[noreturn]] bool structure_equals(const IColumn& /*rhs*/) const override {
    LOG(FATAL) << "structure_equals not supported in PredicateColumnType";
}

[[noreturn]] ColumnPtr filter(const IColumn::Filter& /*filt*/,
                              ssize_t /*result_size_hint*/) const override {
    LOG(FATAL) << "filter not supported in PredicateColumnType";
}

[[noreturn]] ColumnPtr permute(const IColumn::Permutation& /*perm*/,
                               size_t /*limit*/) const override {
    LOG(FATAL) << "permute not supported in PredicateColumnType";
}
// Mutable access to the backing element array.
Container& get_data() { return data; }
// Read-only access to the backing element array.
const Container& get_data() const { return data; }
[[noreturn]] ColumnPtr replicate(const IColumn::Offsets& replicate_offsets) const override {
LOG(FATAL) << "replicate not supported in PredicateColumnType";
};
[[noreturn]] MutableColumns scatter(IColumn::ColumnIndex num_columns,
const IColumn::Selector& selector) const override {
LOG(FATAL) << "scatter not supported in PredicateColumnType";
}
void append_data_by_selector(MutableColumnPtr& res,
const IColumn::Selector& selector) const override {
LOG(FATAL) << "append_data_by_selector is not supported in PredicateColumnType!";
}
// Copies the rows selected by `sel` into `col_ptr`, converting from the stored element
// type T to the representation of the destination column. The conversion is chosen at
// compile time from T:
//   StringValue          -> appended to a ColumnString
//   decimal12_t          -> converted to DecimalV2Value, appended to ColumnDecimal<Decimal128>
//   Int8/16/32/64/128,
//   Float32/64           -> copied unchanged into ColumnVector<T>
//   uint64_t             -> copied unchanged if `col_ptr` is a ColumnVector<UInt64>,
//                           otherwise converted as a datetime into ColumnVector<Int64>
//   uint24_t             -> converted as a date into ColumnVector<Int64>
//   uint32_t             -> converted as a date if `col_ptr` is a ColumnVector<Int64>,
//                           otherwise copied unchanged into ColumnVector<UInt32>
//   bool                 -> appended through IColumn::insert_data
//
// sel      - indices into `data` of the rows to emit
// sel_size - number of entries in `sel`
// col_ptr  - destination column. Except where noted above its dynamic type is not
//            checked: it is reinterpret_cast to the expected column type, so the caller
//            must pass a column of the matching type.
//
// Returns Status::OK() on success, Status::NotSupported for any other element type.
Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) override {
    // Element types whose stored form equals the destination ColumnVector's element type.
    constexpr bool copies_verbatim = std::is_same_v<T, doris::vectorized::Int8> ||
                                     std::is_same_v<T, doris::vectorized::Int16> ||
                                     std::is_same_v<T, doris::vectorized::Int32> ||
                                     std::is_same_v<T, doris::vectorized::Int64> ||
                                     std::is_same_v<T, doris::vectorized::Float32> ||
                                     std::is_same_v<T, doris::vectorized::Float64> ||
                                     std::is_same_v<T, doris::vectorized::Int128>;

    if constexpr (std::is_same_v<T, StringValue>) {
        insert_string_to_res_column(sel, sel_size,
                                    reinterpret_cast<vectorized::ColumnString*>(col_ptr));
    } else if constexpr (std::is_same_v<T, decimal12_t>) {
        insert_decimal_to_res_column(
                sel, sel_size,
                reinterpret_cast<vectorized::ColumnDecimal<Decimal128>*>(col_ptr));
    } else if constexpr (copies_verbatim) {
        insert_default_value_res_column(
                sel, sel_size, reinterpret_cast<vectorized::ColumnVector<T>*>(col_ptr));
    } else if constexpr (std::is_same_v<T, uint64_t>) {
        if (const vectorized::ColumnVector<UInt64>* date_col =
                    check_and_get_column<vectorized::ColumnVector<UInt64>>(
                            const_cast<const IColumn*>(col_ptr))) {
            insert_default_value_res_column(
                    sel, sel_size, const_cast<vectorized::ColumnVector<UInt64>*>(date_col));
        } else {
            insert_datetime_to_res_column(
                    sel, sel_size, reinterpret_cast<vectorized::ColumnVector<Int64>*>(col_ptr));
        }
    } else if constexpr (std::is_same_v<T, uint24_t>) {
        insert_date_to_res_column(sel, sel_size,
                                  reinterpret_cast<vectorized::ColumnVector<Int64>*>(col_ptr));
    } else if constexpr (std::is_same_v<T, uint32_t>) {
        if (const vectorized::ColumnVector<Int64>* date_col =
                    check_and_get_column<vectorized::ColumnVector<Int64>>(
                            const_cast<const IColumn*>(col_ptr))) {
            // a trick type judge, need refactor it.
            insert_date32_to_res_column(sel, sel_size,
                                        const_cast<vectorized::ColumnVector<Int64>*>(date_col));
        } else {
            insert_default_value_res_column(
                    sel, sel_size,
                    reinterpret_cast<vectorized::ColumnVector<doris::vectorized::UInt32>*>(
                            col_ptr));
        }
    } else if constexpr (std::is_same_v<T, bool>) {
        // `if constexpr` like the rest of the chain, so this branch and the fallback
        // below are only instantiated for the element types they apply to.
        insert_byte_to_res_column(sel, sel_size, col_ptr);
    } else {
        return Status::NotSupported("not supported output type in predicate_column");
    }
    return Status::OK();
}
// In-place replacement is not implemented for this column type; both calls only emit
// a FATAL log message.
void replace_column_data(const IColumn&, size_t /*row*/, size_t /*self_row*/ = 0) override {
    LOG(FATAL) << "should not call replace_column_data in predicate column";
}

void replace_column_data_default(size_t /*self_row*/ = 0) override {
    LOG(FATAL) << "should not call replace_column_data_default in predicate column";
}
private:
// Backing array holding the column's elements.
Container data;
// Owns the bytes of strings copied in by insert_many_binary_data(). Created on first use
// there, and only when T is StringValue; null otherwise.
std::unique_ptr<MemPool> _pool;
};
} // namespace doris::vectorized