[improvement](storage) Low cardinality string optimization in storage layer (#8318)

Low cardinality string optimization in storage layer
This commit is contained in:
ZenoYang
2022-03-20 23:04:25 +08:00
committed by GitHub
parent ed47e20eea
commit 2ec0b81030
16 changed files with 657 additions and 146 deletions

View File

@ -234,6 +234,8 @@ CONF_Bool(disable_storage_page_cache, "false");
CONF_Bool(enable_storage_vectorization, "false");
CONF_Bool(enable_low_cardinality_optimize, "false");
// be policy
// whether disable automatic compaction task
CONF_mBool(disable_auto_compaction, "false");

View File

@ -57,7 +57,7 @@ struct ExprValue {
float_val(0.0),
double_val(0.0),
string_data(),
string_val(nullptr, 0),
string_val(),
datetime_val(),
decimalv2_val(0),
array_val() {}

View File

@ -73,6 +73,8 @@ public:
virtual bool is_bloom_filter_predicate() { return false; }
virtual bool is_range_comparison_predicate() { return false; }
protected:
uint32_t _column_id;
bool _opposite;

View File

@ -21,6 +21,7 @@
#include "olap/schema.h"
#include "runtime/string_value.hpp"
#include "runtime/vectorized_row_batch.h"
#include "vec/columns/column_dictionary.h"
#include "vec/columns/column_nullable.h"
#include "vec/columns/column_vector.h"
#include "vec/columns/predicate_column.h"
@ -145,28 +146,65 @@ COMPARISON_PRED_COLUMN_BLOCK_EVALUATE(LessEqualPredicate, <=)
COMPARISON_PRED_COLUMN_BLOCK_EVALUATE(GreaterPredicate, >)
COMPARISON_PRED_COLUMN_BLOCK_EVALUATE(GreaterEqualPredicate, >=)
#define COMPARISON_PRED_COLUMN_EVALUATE(CLASS, OP) \
// todo(zeno) define interface in IColumn to simplify code
#define COMPARISON_PRED_COLUMN_EVALUATE(CLASS, OP, IS_RANGE) \
template <class type> \
void CLASS<type>::evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const { \
uint16_t new_size = 0; \
if (column.is_nullable()) { \
auto* nullable_column = \
auto* nullable_col = \
vectorized::check_and_get_column<vectorized::ColumnNullable>(column); \
auto& null_bitmap = reinterpret_cast<const vectorized::ColumnVector<uint8_t>&>( \
*(nullable_column->get_null_map_column_ptr())) \
auto& null_bitmap = reinterpret_cast<const vectorized::ColumnUInt8&>( \
nullable_col->get_null_map_column()) \
.get_data(); \
auto* nest_column_vector = \
vectorized::check_and_get_column<vectorized::PredicateColumnType<type>>( \
nullable_column->get_nested_column()); \
auto& data_array = nest_column_vector->get_data(); \
for (uint16_t i = 0; i < *size; i++) { \
uint16_t idx = sel[i]; \
sel[new_size] = idx; \
const type& cell_value = reinterpret_cast<const type&>(data_array[idx]); \
bool ret = !null_bitmap[idx] && (cell_value OP _value); \
new_size += _opposite ? !ret : ret; \
auto& nested_col = nullable_col->get_nested_column(); \
if (nested_col.is_column_dictionary()) { \
if constexpr (std::is_same_v<type, StringValue>) { \
auto* nested_col_ptr = vectorized::check_and_get_column< \
vectorized::ColumnDictionary<vectorized::Int32>>(nested_col); \
auto code = IS_RANGE ? nested_col_ptr->find_bound_code(_value, 0 OP 1, 1 OP 1) \
: nested_col_ptr->find_code(_value); \
auto& data_array = nested_col_ptr->get_data(); \
for (uint16_t i = 0; i < *size; i++) { \
uint16_t idx = sel[i]; \
sel[new_size] = idx; \
const auto& cell_value = \
reinterpret_cast<const vectorized::Int32&>(data_array[idx]); \
bool ret = !null_bitmap[idx] && (cell_value OP code); \
new_size += _opposite ? !ret : ret; \
} \
} \
} else { \
auto* nested_col_ptr = \
vectorized::check_and_get_column<vectorized::PredicateColumnType<type>>( \
nested_col); \
auto& data_array = nested_col_ptr->get_data(); \
for (uint16_t i = 0; i < *size; i++) { \
uint16_t idx = sel[i]; \
sel[new_size] = idx; \
const type& cell_value = reinterpret_cast<const type&>(data_array[idx]); \
bool ret = !null_bitmap[idx] && (cell_value OP _value); \
new_size += _opposite ? !ret : ret; \
} \
} \
*size = new_size; \
} else if (column.is_column_dictionary()) { \
if constexpr (std::is_same_v<type, StringValue>) { \
auto& dict_col = \
reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>(column);\
auto& data_array = dict_col.get_data(); \
auto code = IS_RANGE ? dict_col.find_bound_code(_value, 0 OP 1, 1 OP 1) \
: dict_col.find_code(_value); \
for (uint16_t i = 0; i < *size; ++i) { \
uint16_t idx = sel[i]; \
sel[new_size] = idx; \
const auto& cell_value = \
reinterpret_cast<const vectorized::Int32&>(data_array[idx]); \
bool ret = cell_value OP code; \
new_size += _opposite ? !ret : ret; \
} \
*size = new_size; \
} \
} else { \
auto& pred_column_ref = \
reinterpret_cast<vectorized::PredicateColumnType<type>&>(column); \
@ -182,12 +220,13 @@ COMPARISON_PRED_COLUMN_BLOCK_EVALUATE(GreaterEqualPredicate, >=)
} \
}
COMPARISON_PRED_COLUMN_EVALUATE(EqualPredicate, ==)
COMPARISON_PRED_COLUMN_EVALUATE(NotEqualPredicate, !=)
COMPARISON_PRED_COLUMN_EVALUATE(LessPredicate, <)
COMPARISON_PRED_COLUMN_EVALUATE(LessEqualPredicate, <=)
COMPARISON_PRED_COLUMN_EVALUATE(GreaterPredicate, >)
COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=)
COMPARISON_PRED_COLUMN_EVALUATE(EqualPredicate, ==, false)
COMPARISON_PRED_COLUMN_EVALUATE(NotEqualPredicate, !=, false)
COMPARISON_PRED_COLUMN_EVALUATE(LessPredicate, <, true)
COMPARISON_PRED_COLUMN_EVALUATE(LessEqualPredicate, <=, true)
COMPARISON_PRED_COLUMN_EVALUATE(GreaterPredicate, >, true)
COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
#define COMPARISON_PRED_COLUMN_EVALUATE_VEC(CLASS, OP) \
template <class type> \

View File

@ -26,35 +26,38 @@ namespace doris {
class VectorizedRowBatch;
#define COMPARISON_PRED_CLASS_DEFINE(CLASS) \
template <class type> \
class CLASS : public ColumnPredicate { \
public: \
CLASS(uint32_t column_id, const type& value, bool opposite = false); \
virtual void evaluate(VectorizedRowBatch* batch) const override; \
void evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const override; \
void evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size, \
bool* flags) const override; \
void evaluate_and(ColumnBlock* block, uint16_t* sel, uint16_t size, \
bool* flags) const override; \
virtual Status evaluate(const Schema& schema, \
const std::vector<BitmapIndexIterator*>& iterators, \
uint32_t num_rows, roaring::Roaring* roaring) const override; \
void evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const override; \
void evaluate_and(vectorized::IColumn& column, uint16_t* sel, uint16_t size, bool* flags) const override; \
void evaluate_or(vectorized::IColumn& column, uint16_t* sel, uint16_t size, bool* flags) const override; \
#define COMPARISON_PRED_CLASS_DEFINE(CLASS, IS_RANGE) \
template <class type> \
class CLASS : public ColumnPredicate { \
public: \
CLASS(uint32_t column_id, const type& value, bool opposite = false); \
virtual void evaluate(VectorizedRowBatch* batch) const override; \
void evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const override; \
void evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size, \
bool* flags) const override; \
void evaluate_and(ColumnBlock* block, uint16_t* sel, uint16_t size, \
bool* flags) const override; \
virtual Status evaluate(const Schema& schema, \
const std::vector<BitmapIndexIterator*>& iterators, \
uint32_t num_rows, roaring::Roaring* roaring) const override; \
void evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const override; \
void evaluate_and(vectorized::IColumn& column, uint16_t* sel, uint16_t size, \
bool* flags) const override; \
void evaluate_or(vectorized::IColumn& column, uint16_t* sel, uint16_t size, \
bool* flags) const override; \
void evaluate_vec(vectorized::IColumn& column, uint16_t size, bool* flags) const override; \
\
private: \
type _value; \
bool is_range_comparison_predicate() override { return IS_RANGE; } \
\
private: \
type _value; \
};
COMPARISON_PRED_CLASS_DEFINE(EqualPredicate)
COMPARISON_PRED_CLASS_DEFINE(NotEqualPredicate)
COMPARISON_PRED_CLASS_DEFINE(LessPredicate)
COMPARISON_PRED_CLASS_DEFINE(LessEqualPredicate)
COMPARISON_PRED_CLASS_DEFINE(GreaterPredicate)
COMPARISON_PRED_CLASS_DEFINE(GreaterEqualPredicate)
COMPARISON_PRED_CLASS_DEFINE(EqualPredicate, false)
COMPARISON_PRED_CLASS_DEFINE(NotEqualPredicate, false)
COMPARISON_PRED_CLASS_DEFINE(LessPredicate, true)
COMPARISON_PRED_CLASS_DEFINE(LessEqualPredicate, true)
COMPARISON_PRED_CLASS_DEFINE(GreaterPredicate, true)
COMPARISON_PRED_CLASS_DEFINE(GreaterEqualPredicate, true)
} //namespace doris

View File

@ -20,6 +20,7 @@
#include "olap/field.h"
#include "runtime/string_value.hpp"
#include "runtime/vectorized_row_batch.h"
#include "vec/columns/column_dictionary.h"
#include "vec/columns/predicate_column.h"
#include "vec/columns/column_nullable.h"
@ -117,26 +118,62 @@ IN_LIST_PRED_EVALUATE(NotInListPredicate, ==)
IN_LIST_PRED_COLUMN_BLOCK_EVALUATE(InListPredicate, !=)
IN_LIST_PRED_COLUMN_BLOCK_EVALUATE(NotInListPredicate, ==)
// todo(zeno) define interface in IColumn to simplify code
#define IN_LIST_PRED_COLUMN_EVALUATE(CLASS, OP) \
template <class type> \
void CLASS<type>::evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const { \
uint16_t new_size = 0; \
if (column.is_nullable()) { \
auto* nullable_column = \
vectorized::check_and_get_column<vectorized::ColumnNullable>(column); \
auto& null_bitmap = reinterpret_cast<const vectorized::ColumnVector<uint8_t>&>(*( \
nullable_column->get_null_map_column_ptr())).get_data(); \
auto* nest_column_vector = vectorized::check_and_get_column \
<vectorized::PredicateColumnType<type>>(nullable_column->get_nested_column()); \
auto& data_array = nest_column_vector->get_data(); \
for (uint16_t i = 0; i < *size; i++) { \
uint16_t idx = sel[i]; \
sel[new_size] = idx; \
const type& cell_value = reinterpret_cast<const type&>(data_array[idx]); \
bool ret = !null_bitmap[idx] && (_values.find(cell_value) OP _values.end()); \
new_size += _opposite ? !ret : ret; \
auto* nullable_col = \
vectorized::check_and_get_column<vectorized::ColumnNullable>(column); \
auto& null_bitmap = reinterpret_cast<const vectorized::ColumnUInt8&>( \
nullable_col->get_null_map_column()).get_data(); \
auto& nested_col = nullable_col->get_nested_column(); \
if (nested_col.is_column_dictionary()) { \
if constexpr (std::is_same_v<type, StringValue>) { \
auto* nested_col_ptr = vectorized::check_and_get_column< \
vectorized::ColumnDictionary<vectorized::Int32>>(nested_col); \
auto code_set = nested_col_ptr->find_codes(_values); \
auto& data_array = nested_col_ptr->get_data(); \
for (uint16_t i = 0; i < *size; i++) { \
uint16_t idx = sel[i]; \
sel[new_size] = idx; \
const auto& cell_value = \
reinterpret_cast<const vectorized::Int32&>(data_array[idx]); \
bool ret = !null_bitmap[idx] \
&& (code_set.find(cell_value) OP code_set.end()); \
new_size += _opposite ? !ret : ret; \
} \
} \
} else { \
auto* nested_col_ptr = vectorized::check_and_get_column< \
vectorized::PredicateColumnType<type>>(nested_col); \
auto& data_array = nested_col_ptr->get_data(); \
for (uint16_t i = 0; i < *size; i++) { \
uint16_t idx = sel[i]; \
sel[new_size] = idx; \
const type& cell_value = reinterpret_cast<const type&>(data_array[idx]); \
bool ret = !null_bitmap[idx] && (_values.find(cell_value) OP _values.end()); \
new_size += _opposite ? !ret : ret; \
} \
} \
*size = new_size; \
} else if (column.is_column_dictionary()) { \
if constexpr (std::is_same_v<type, StringValue>) { \
auto& dict_col = \
reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \
column); \
auto& data_array = dict_col.get_data(); \
auto code_set = dict_col.find_codes(_values); \
for (uint16_t i = 0; i < *size; i++) { \
uint16_t idx = sel[i]; \
sel[new_size] = idx; \
const auto& cell_value = \
reinterpret_cast<const vectorized::Int32&>(data_array[idx]); \
auto result = (code_set.find(cell_value) OP code_set.end()); \
new_size += _opposite ? !result : result; \
} \
} \
} else { \
auto& number_column = reinterpret_cast<vectorized::PredicateColumnType<type>&>(column);\
auto& data_array = number_column.get_data(); \

View File

@ -21,6 +21,8 @@
#include "gutil/strings/substitute.h" // for Substitute
#include "runtime/mem_pool.h"
#include "util/slice.h" // for Slice
#include "vec/columns/column.h"
#include "vec/columns/column_dictionary.h"
#include "vec/columns/column_vector.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_nullable.h"
@ -131,7 +133,6 @@ void BinaryDictPageBuilder::reset() {
} else {
_data_page_builder->reset();
}
_finished = false;
}
size_t BinaryDictPageBuilder::count() const {
@ -239,6 +240,18 @@ void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder, StringRe
Status BinaryDictPageDecoder::next_batch(size_t* n, vectorized::MutableColumnPtr &dst) {
if (_encoding_type == PLAIN_ENCODING) {
// todo(zeno) Handle convert in ColumnDictionary,
// add interface like convert_to_predicate_column_if_necessary
auto* col_ptr = dst.get();
if (dst->is_nullable()) {
auto nullable_col = reinterpret_cast<vectorized::ColumnNullable*>(dst.get());
col_ptr = nullable_col->get_nested_column_ptr().get();
}
if (col_ptr->is_column_dictionary()) {
auto* dict_col_ptr = reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>*>(col_ptr);
col_ptr = (*std::move(dict_col_ptr->convert_to_predicate_column())).assume_mutable();
}
return _data_page_decoder->next_batch(n, dst);
}
// dictionary encoding
@ -253,15 +266,14 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, vectorized::MutableColumnPtr
size_t max_fetch = std::min(*n, static_cast<size_t>(_bit_shuffle_ptr->_num_elements - _bit_shuffle_ptr->_cur_index));
*n = max_fetch;
const int32_t* data_array = reinterpret_cast<const int32_t*>(_bit_shuffle_ptr->_chunk.data);
const auto* data_array = reinterpret_cast<const int32_t*>(_bit_shuffle_ptr->_chunk.data);
size_t start_index = _bit_shuffle_ptr->_cur_index;
dst->insert_many_dict_data(data_array, start_index, _dict_word_info, max_fetch);
dst->insert_many_dict_data(data_array, start_index, _dict_word_info, max_fetch, _dict_decoder->_num_elems);
_bit_shuffle_ptr->_cur_index += max_fetch;
return Status::OK();
}
Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) {
@ -275,7 +287,7 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) {
if (PREDICT_FALSE(*n == 0)) {
return Status::OK();
}
Slice* out = reinterpret_cast<Slice*>(dst->data());
auto* out = reinterpret_cast<Slice*>(dst->data());
_batch->resize(*n);

View File

@ -33,6 +33,7 @@
#include "olap/short_key_index.h"
#include "util/doris_metrics.h"
#include "util/simd/bits.h"
#include "vec/columns/column_dictionary.h"
using strings::Substitute;
@ -858,6 +859,19 @@ void SegmentIterator::_evaluate_short_circuit_predicate(uint16_t* vec_sel_rowid_
for (auto column_predicate : _short_cir_eval_predicate) {
auto column_id = column_predicate->column_id();
auto& short_cir_column = _current_return_columns[column_id];
auto* col_ptr = short_cir_column.get();
// todo(zeno) define convert_dict_codes_if_dictionary interface in IColumn
if (short_cir_column->is_nullable()) {
auto nullable_col =
reinterpret_cast<vectorized::ColumnNullable*>(short_cir_column.get());
col_ptr = nullable_col->get_nested_column_ptr().get();
}
if (col_ptr->is_column_dictionary() && column_predicate->is_range_comparison_predicate()) {
auto& dict_col =
reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>(*col_ptr);
dict_col.convert_dict_codes();
}
column_predicate->evaluate(*short_cir_column, vec_sel_rowid_idx, selected_size_ptr);
}

View File

@ -20,6 +20,7 @@
#include "olap/row_block2.h"
#include "olap/uint24.h"
#include "vec/columns/column_complex.h"
#include "vec/columns/column_dictionary.h"
#include "vec/columns/column_nullable.h"
#include "vec/columns/predicate_column.h"
#include "vec/core/types.h"
@ -203,6 +204,9 @@ vectorized::IColumn::MutablePtr Schema::get_predicate_column_ptr(FieldType type)
case OLAP_FIELD_TYPE_CHAR:
case OLAP_FIELD_TYPE_VARCHAR:
case OLAP_FIELD_TYPE_STRING:
if (config::enable_low_cardinality_optimize) {
return doris::vectorized::ColumnDictionary<doris::vectorized::Int32>::create();
}
return doris::vectorized::PredicateColumnType<StringValue>::create();
case OLAP_FIELD_TYPE_DECIMAL:

View File

@ -22,9 +22,53 @@
#include "udf/udf.h"
#include "util/hash_util.hpp"
#include "util/cpu_info.h"
#include "vec/common/string_ref.h"
#ifdef __SSE4_2__
#include "util/sse_util.hpp"
#endif
namespace doris {
// Compare two length-delimited byte strings, using SSE4.2 intrinsics when they are
// available. This code assumes that the trivial cases are already handled (i.e. one
// string is empty).
// Returns:
// < 0 if s1 < s2
// 0 if s1 == s2
// > 0 if s1 > s2
// Bytes are compared as unsigned char. A NUL byte is an ordinary byte and never ends
// the comparison early: the lengths are passed explicitly, and the SSE path below
// already compares full 16-byte chunks by explicit length.
// The SSE code path is just under 2x faster than the non-sse code path.
// - s1/n1: ptr/len for the first string
// - s2/n2: ptr/len for the second string
// - len: min(n1, n2) - this can be more cheaply passed in by the caller
static inline int string_compare(const char* s1, int64_t n1, const char* s2, int64_t n2,
                                 int64_t len) {
    DCHECK_EQ(len, std::min(n1, n2));
#ifdef __SSE4_2__
    // Compare 16 bytes at a time; stop at the first chunk containing a difference.
    while (len >= sse_util::CHARS_PER_128_BIT_REGISTER) {
        __m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s1));
        __m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s2));
        int chars_match =
                _mm_cmpestri(xmm0, sse_util::CHARS_PER_128_BIT_REGISTER, xmm1,
                             sse_util::CHARS_PER_128_BIT_REGISTER, sse_util::STRCMP_MODE);
        if (chars_match != sse_util::CHARS_PER_128_BIT_REGISTER) {
            return (unsigned char)s1[chars_match] - (unsigned char)s2[chars_match];
        }
        len -= sse_util::CHARS_PER_128_BIT_REGISTER;
        s1 += sse_util::CHARS_PER_128_BIT_REGISTER;
        s2 += sse_util::CHARS_PER_128_BIT_REGISTER;
    }
#endif
    // Scalar tail (or the whole comparison without SSE4.2): first differing byte decides.
    while (len-- > 0) {
        const unsigned char u1 = (unsigned char)*s1++;
        const unsigned char u2 = (unsigned char)*s2++;
        if (u1 != u2) return u1 - u2;
    }
    // The common prefix is identical, so the shorter string sorts first.
    return n1 - n2;
}
// The format of a string-typed slot.
// The returned StringValue of all functions that return StringValue
// shares its buffer with the parent.
@ -43,6 +87,7 @@ struct StringValue {
size_t len;
StringValue(char* ptr, int len) : ptr(ptr), len(len) {}
StringValue(const char* ptr, int len) : ptr(const_cast<char*>(ptr)), len(len) {}
StringValue() : ptr(nullptr), len(0) {}
/// Construct a StringValue from 's'. 's' must be valid for as long as
@ -60,10 +105,36 @@ struct StringValue {
// this < other: -1
// this == other: 0
// this > other: 1
int compare(const StringValue& other) const;
// Three-way comparison with `other`.
// Returns a negative value if this < other, 0 if equal, a positive value if this > other.
// When either side is empty the result is exactly -1, 0 or 1; otherwise it is whatever
// string_compare() returns, so callers should only rely on the sign.
inline int compare(const StringValue& other) const {
    // `len` is size_t; keep the common length unsigned and full-width instead of
    // narrowing it to int before handing it to string_compare().
    const size_t l = std::min(len, other.len);
    if (l == 0) {
        if (len == other.len) {
            // Both strings are empty.
            return 0;
        } else if (len == 0) {
            // Only this string is empty, so it sorts first.
            return -1;
        } else {
            DCHECK_EQ(other.len, 0);
            return 1;
        }
    }
    return string_compare(this->ptr, this->len, other.ptr, other.len, l);
}
// ==
bool eq(const StringValue& other) const;
// Byte-wise equality: strings of different length are never equal; otherwise the
// contents are compared with the SSE2 helper when available, else via string_compare().
inline bool eq(const StringValue& other) const {
    const bool same_length = (this->len == other.len);
    if (!same_length) {
        return false;
    }
#if defined(__SSE2__)
    return memequalSSE2Wide(this->ptr, other.ptr, this->len);
#else
    return string_compare(this->ptr, this->len, other.ptr, other.len, this->len) == 0;
#endif
}
bool operator==(const StringValue& other) const { return eq(other); }
// !=
bool ne(const StringValue& other) const { return !eq(other); }
@ -111,6 +182,12 @@ struct StringValue {
static StringValue min_string_val();
static StringValue max_string_val();
// Less-than functor over StringValue, defined by the sign of compare().
struct Comparator {
    bool operator()(const StringValue& a, const StringValue& b) const {
        const int order = a.compare(b);
        return order < 0;
    }
};
};
// This function must be called 'hash_value' to be picked up by boost.

View File

@ -21,81 +21,9 @@
#include <cstring>
#include "runtime/string_value.h"
#include "util/cpu_info.h"
#include "vec/common/string_ref.h"
#ifdef __SSE4_2__
#include "util/sse_util.hpp"
#endif
namespace doris {
// Compare two strings using sse4.2 intrinsics if they are available. This code assumes
// that the trivial cases are already handled (i.e. one string is empty).
// Returns:
// < 0 if s1 < s2
// 0 if s1 == s2
// > 0 if s1 > s2
// The SSE code path is just under 2x faster than the non-sse code path.
// - s1/n1: ptr/len for the first string
// - s2/n2: ptr/len for the second string
// - len: min(n1, n2) - this can be more cheaply passed in by the caller
static inline int string_compare(const char* s1, int64_t n1, const char* s2, int64_t n2,
int64_t len) {
DCHECK_EQ(len, std::min(n1, n2));
#ifdef __SSE4_2__
while (len >= sse_util::CHARS_PER_128_BIT_REGISTER) {
__m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s1));
__m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s2));
int chars_match =
_mm_cmpestri(xmm0, sse_util::CHARS_PER_128_BIT_REGISTER, xmm1,
sse_util::CHARS_PER_128_BIT_REGISTER, sse_util::STRCMP_MODE);
if (chars_match != sse_util::CHARS_PER_128_BIT_REGISTER) {
return (unsigned char)s1[chars_match] - (unsigned char)s2[chars_match];
}
len -= sse_util::CHARS_PER_128_BIT_REGISTER;
s1 += sse_util::CHARS_PER_128_BIT_REGISTER;
s2 += sse_util::CHARS_PER_128_BIT_REGISTER;
}
#endif
unsigned char u1, u2;
while (len-- > 0) {
u1 = (unsigned char)*s1++;
u2 = (unsigned char)*s2++;
if (u1 != u2) return u1 - u2;
if (u1 == '\0') return n1 - n2;
}
return n1 - n2;
}
inline int StringValue::compare(const StringValue& other) const {
int l = std::min(len, other.len);
if (l == 0) {
if (len == other.len) {
return 0;
} else if (len == 0) {
return -1;
} else {
DCHECK_EQ(other.len, 0);
return 1;
}
}
return string_compare(this->ptr, this->len, other.ptr, other.len, l);
}
inline bool StringValue::eq(const StringValue& other) const {
if (this->len != other.len) {
return false;
}
#if defined(__SSE2__)
return memequalSSE2Wide(this->ptr, other.ptr, this->len);
#endif
return string_compare(this->ptr, this->len, other.ptr, other.len, this->len) == 0;
}
inline StringValue StringValue::substring(int start_pos) const {
return StringValue(ptr + start_pos, len - start_pos);
}

View File

@ -178,8 +178,10 @@ public:
virtual void insert_many_fix_len_data(const char* pos, size_t num) {
LOG(FATAL) << "Method insert_many_fix_len_data is not supported for " << get_name();
}
virtual void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num) {
// todo(zeno) Use dict_args temp object to cover all arguments
virtual void insert_many_dict_data(const int32_t* data_array, size_t start_index,
const StringRef* dict, size_t data_num, uint32_t dict_num = 0) {
LOG(FATAL) << "Method insert_many_dict_data is not supported for " << get_name();
}
@ -426,6 +428,8 @@ public:
virtual bool is_predicate_column() const { return false; }
virtual bool is_column_dictionary() const { return false; }
/// If the only value column can contain is NULL.
/// Does not imply type of object, because it can be ColumnNullable(ColumnNothing) or ColumnConst(ColumnNullable(ColumnNothing))
virtual bool only_null() const { return false; }

View File

@ -0,0 +1,386 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <parallel_hashmap/phmap.h>
#include "gutil/hash/string_hash.h"
#include "olap/decimal12.h"
#include "olap/uint24.h"
#include "runtime/string_value.h"
#include "util/slice.h"
#include "vec/columns/column.h"
#include "vec/columns/column_decimal.h"
#include "vec/columns/column_impl.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/columns/predicate_column.h"
#include "vec/core/types.h"
namespace doris::vectorized {
/**
* For low cardinality string columns, using ColumnDictionary can reduce memory
* usage and improve query efficiency.
* For equal predicate comparisons, convert the predicate constant to encodings
* according to the dictionary, so that encoding comparisons are used instead
* of string comparisons to improve performance.
* For range comparison predicates, it is necessary to sort the dictionary
* contents, convert the encoding column, and then compare the encoding directly.
* If the read data page contains plain-encoded data pages, the dictionary
* columns are converted into PredicateColumn for processing.
* Currently ColumnDictionary is only used for storage layer.
*/
template <typename T>
class ColumnDictionary final : public COWHelper<IColumn, ColumnDictionary<T>> {
private:
friend class COWHelper<IColumn, ColumnDictionary>;
ColumnDictionary() {}
ColumnDictionary(const size_t n) : codes(n) {}
ColumnDictionary(const ColumnDictionary& src) : codes(src.codes.begin(), src.codes.end()) {}
public:
using Self = ColumnDictionary;
using value_type = T;
using Container = PaddedPODArray<value_type>;
using DictContainer = PaddedPODArray<StringValue>;
// --- Type predicates: this column reports itself as a dictionary column only. ---
bool is_numeric() const override { return false; }
bool is_predicate_column() const override { return false; }
bool is_column_dictionary() const override { return true; }
// Number of rows, i.e. number of stored dictionary codes.
size_t size() const override { return codes.size(); }
// --- Unsupported IColumn operations: each one only logs a FATAL message. ---
[[noreturn]] StringRef get_data_at(size_t n) const override {
LOG(FATAL) << "get_data_at not supported in ColumnDictionary";
}
void insert_from(const IColumn& src, size_t n) override {
LOG(FATAL) << "insert_from not supported in ColumnDictionary";
}
void insert_range_from(const IColumn& src, size_t start, size_t length) override {
LOG(FATAL) << "insert_range_from not supported in ColumnDictionary";
}
void insert_indices_from(const IColumn& src, const int* indices_begin,
const int* indices_end) override {
LOG(FATAL) << "insert_indices_from not supported in ColumnDictionary";
}
void pop_back(size_t n) override { LOG(FATAL) << "pop_back not supported in ColumnDictionary"; }
void update_hash_with_value(size_t n, SipHash& hash) const override {
LOG(FATAL) << "update_hash_with_value not supported in ColumnDictionary";
}
// Appends one code read from `pos` as a (possibly unaligned) T; the length argument is ignored.
void insert_data(const char* pos, size_t /*length*/) override {
codes.push_back(unaligned_load<T>(pos));
}
// Appends one code given directly as a T.
void insert_data(const T value) { codes.push_back(value); }
// Appends a value-initialized code, T().
void insert_default() override { codes.push_back(T()); }
void clear() override { codes.clear(); }
// TODO: Make dict memory usage more precise
// Size of the code array only; the dictionary's own storage is not counted here.
size_t byte_size() const override { return codes.size() * sizeof(codes[0]); }
// Reported as equal to byte_size().
size_t allocated_bytes() const override { return byte_size(); }
// Intentionally a no-op.
void protect() override {}
// Unsupported: only logs a FATAL message.
void get_permutation(bool reverse, size_t limit, int nan_direction_hint,
IColumn::Permutation& res) const override {
LOG(FATAL) << "get_permutation not supported in ColumnDictionary";
}
// Reserves capacity for n codes.
void reserve(size_t n) override { codes.reserve(n); }
// Unsupported: only logs a FATAL message.
[[noreturn]] const char* get_family_name() const override {
LOG(FATAL) << "get_family_name not supported in ColumnDictionary";
}
// Unsupported: only logs a FATAL message.
[[noreturn]] MutableColumnPtr clone_resized(size_t size) const override {
LOG(FATAL) << "clone_resized not supported in ColumnDictionary";
}
// Unsupported: only logs a FATAL message.
void insert(const Field& x) override {
LOG(FATAL) << "insert not supported in ColumnDictionary";
}
// Returns the raw code at row n wrapped in a Field (the code, not the string it stands for).
Field operator[](size_t n) const override { return codes[n]; }
// Same as operator[], written into `res`.
void get(size_t n, Field& res) const override { res = (*this)[n]; }
// --- Typed scalar getters are unsupported: each one only logs a FATAL message. ---
[[noreturn]] UInt64 get64(size_t n) const override {
LOG(FATAL) << "get field not supported in ColumnDictionary";
}
[[noreturn]] Float64 get_float64(size_t n) const override {
LOG(FATAL) << "get field not supported in ColumnDictionary";
}
[[noreturn]] UInt64 get_uint(size_t n) const override {
LOG(FATAL) << "get field not supported in ColumnDictionary";
}
[[noreturn]] bool get_bool(size_t n) const override {
LOG(FATAL) << "get field not supported in ColumnDictionary";
}
[[noreturn]] Int64 get_int(size_t n) const override {
LOG(FATAL) << "get field not supported in ColumnDictionary";
}
// Direct access to the array of dictionary codes, one entry per row.
Container& get_data() { return codes; }
const Container& get_data() const { return codes; }
// Forwards to Dictionary::find_code: the code of `value`, or -1 if it is not in the dictionary.
T find_code(const StringValue& value) const { return dict.find_code(value); }
// Forwards to Dictionary::find_bound_code: the code to compare against for a range predicate.
T find_bound_code(const StringValue& value, bool lower, bool eq) const {
return dict.find_bound_code(value, lower, eq);
}
// Forwards to Dictionary::find_codes: the codes of those `values` that are in the dictionary.
phmap::flat_hash_set<T> find_codes(const phmap::flat_hash_set<StringValue>& values) const {
return dict.find_codes(values);
}
// It's impossible to use ComplexType as a key, so we don't have to implement them.
// --- Unsupported operations: each one only logs a FATAL message. ---
[[noreturn]] StringRef serialize_value_into_arena(size_t n, Arena& arena,
char const*& begin) const {
LOG(FATAL) << "serialize_value_into_arena not supported in ColumnDictionary";
}
[[noreturn]] const char* deserialize_and_insert_from_arena(const char* pos) {
LOG(FATAL) << "deserialize_and_insert_from_arena not supported in ColumnDictionary";
}
[[noreturn]] int compare_at(size_t n, size_t m, const IColumn& rhs,
int nan_direction_hint) const {
LOG(FATAL) << "compare_at not supported in ColumnDictionary";
}
void get_extremes(Field& min, Field& max) const {
LOG(FATAL) << "get_extremes not supported in ColumnDictionary";
}
// --- Layout traits: may be nested in a nullable column; each value is one fixed-size T code. ---
bool can_be_inside_nullable() const override { return true; }
bool is_fixed_and_contiguous() const override { return true; }
size_t size_of_value_if_fixed() const override { return sizeof(T); }
// --- Unsupported operations: each one only logs a FATAL message. ---
[[noreturn]] StringRef get_raw_data() const override {
LOG(FATAL) << "get_raw_data not supported in ColumnDictionary";
}
[[noreturn]] bool structure_equals(const IColumn& rhs) const override {
LOG(FATAL) << "structure_equals not supported in ColumnDictionary";
}
[[noreturn]] ColumnPtr filter(const IColumn::Filter& filt,
ssize_t result_size_hint) const override {
LOG(FATAL) << "filter not supported in ColumnDictionary";
};
[[noreturn]] ColumnPtr permute(const IColumn::Permutation& perm, size_t limit) const override {
LOG(FATAL) << "permute not supported in ColumnDictionary";
};
[[noreturn]] ColumnPtr replicate(const IColumn::Offsets& replicate_offsets) const override {
LOG(FATAL) << "replicate not supported in ColumnDictionary";
};
[[noreturn]] MutableColumns scatter(IColumn::ColumnIndex num_columns,
const IColumn::Selector& selector) const override {
LOG(FATAL) << "scatter not supported in ColumnDictionary";
}
// Materializes the selected rows as strings: for every row index in sel[0..sel_size),
// looks up the row's code in the dictionary and appends that word to `col_ptr`, which
// is treated as a ColumnString. Always returns Status::OK().
Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) override {
    auto* output = reinterpret_cast<vectorized::ColumnString*>(col_ptr);
    const uint16_t* const sel_end = sel + sel_size;
    for (const uint16_t* cur = sel; cur != sel_end; ++cur) {
        const StringValue& word = dict.get_value(codes[*cur]);
        output->insert_data(word.ptr, word.len);
    }
    return Status::OK();
}
// Unsupported: only logs a FATAL message.
void replace_column_data(const IColumn&, size_t row, size_t self_row = 0) override {
LOG(FATAL) << "should not call replace_column_data in ColumnDictionary";
}
// Unsupported: only logs a FATAL message.
void replace_column_data_default(size_t self_row = 0) override {
LOG(FATAL) << "should not call replace_column_data_default in ColumnDictionary";
}
// Appends `data_num` dictionary codes, taken from data_array[start_index...], to this column.
// On the first call (dictionary not yet initialized) the dictionary is first populated with
// the `dict_num` words of `dict_array`; later calls ignore `dict_array`/`dict_num`.
// - data_array/start_index: source codes and the offset of the first one to copy
// - dict_array/dict_num: dictionary words, used only while the dictionary is uninitialized
// - data_num: number of codes to append
void insert_many_dict_data(const int32_t* data_array, size_t start_index,
const StringRef* dict_array, size_t data_num,
uint32_t dict_num) override {
if (!is_dict_inited()) {
dict.reserve(dict_num);
for (uint32_t i = 0; i < dict_num; ++i) {
// The StringValue stores the pointer and size of the dictionary word as given.
auto value = StringValue(dict_array[i].data, dict_array[i].size);
dict.insert_value(value);
}
_dict_inited = true;
}
// Raw append: copy the codes to the current end of `codes` and advance its end pointer.
// NOTE(review): nothing here grows `codes`; presumably the caller has reserved enough
// capacity beforehand -- confirm.
// NOTE(review): the source is int32_t but the byte count uses sizeof(T); this looks
// correct only when T is a 4-byte type -- confirm.
char* end_ptr = (char*)codes.get_end_ptr();
memcpy(end_ptr, data_array + start_index, data_num * sizeof(T));
end_ptr += data_num * sizeof(T);
codes.set_end_ptr(end_ptr);
}
// True once insert_many_dict_data() has populated the dictionary.
bool is_dict_inited() const { return _dict_inited; }
// True once sort_dict() has sorted the dictionary.
bool is_dict_sorted() const { return _dict_sorted; }
// True once convert_dict_codes() has rewritten the stored codes.
bool is_dict_code_converted() const { return _dict_code_converted; }
// Builds a PredicateColumnType<StringValue> holding, for every row, the dictionary word
// its code stands for, and returns it. Afterwards this column is reset to an empty,
// uninitialized state.
ColumnPtr convert_to_predicate_column() {
    auto res = vectorized::PredicateColumnType<StringValue>::create();
    const size_t num_rows = codes.size();
    res->reserve(num_rows);
    for (size_t i = 0; i < num_rows; ++i) {
        const StringValue& word = dict.get_value(codes[i]);
        res->insert_data(word.ptr, word.len);
    }
    // The dictionary is dropped below, so everything that refers to it must go with it:
    // the stored codes would index an empty dictionary, and stale flags would stop
    // insert_many_dict_data() from rebuilding the dictionary and convert_dict_codes()
    // from sorting/converting again.
    codes.clear();
    dict.clear();
    _dict_inited = false;
    _dict_sorted = false;
    _dict_code_converted = false;
    return res;
}
// Rewrites every stored code to its position in the sorted dictionary.
// Sorts the dictionary first if that has not happened yet; the rewrite itself
// runs at most once, guarded by _dict_code_converted.
void convert_dict_codes() {
    if (!is_dict_sorted()) {
        sort_dict();
    }
    if (is_dict_code_converted()) {
        return;
    }
    const size_t row_count = size();
    for (size_t row = 0; row != row_count; ++row) {
        codes[row] = dict.convert_code(codes[row]);
    }
    _dict_code_converted = true;
}
// Sorts the dictionary values and records that fact in _dict_sorted.
// This does not touch `codes`; convert_dict_codes() is what remaps them.
void sort_dict() {
    _dict_sorted = true;
    dict.sort();
}
class Dictionary {
public:
Dictionary() = default;
void reserve(size_t n) {
dict_data.reserve(n);
inverted_index.reserve(n);
}
inline void insert_value(StringValue& value) {
dict_data.push_back_without_reserve(value);
inverted_index[value] = inverted_index.size();
}
inline T find_code(const StringValue& value) const {
auto it = inverted_index.find(value);
if (it != inverted_index.end()) {
return it->second;
}
return -1;
}
inline T find_bound_code(const StringValue& value, bool lower, bool eq) const {
auto code = find_code(value);
if (code >= 0) {
return code;
}
if (lower) {
return std::lower_bound(dict_data.begin(), dict_data.end(), value) - dict_data.begin() - eq;
} else {
return std::upper_bound(dict_data.begin(), dict_data.end(), value) - dict_data.begin() + eq;
}
}
inline phmap::flat_hash_set<T> find_codes(const phmap::flat_hash_set<StringValue>& values) const {
phmap::flat_hash_set<T> code_set;
for (const auto& value : values) {
auto it = inverted_index.find(value);
if (it != inverted_index.end()) {
code_set.insert(it->second);
}
}
return code_set;
}
inline StringValue& get_value(T code) { return dict_data[code]; }
void clear() {
dict_data.clear();
inverted_index.clear();
code_convert_map.clear();
}
void sort() {
size_t dict_size = dict_data.size();
std::sort(dict_data.begin(), dict_data.end(), comparator);
for (size_t i = 0; i < dict_size; ++i) {
code_convert_map[inverted_index.find(dict_data[i])->second] = (T)i;
inverted_index[dict_data[i]] = (T)i;
}
}
inline T convert_code(const T& code) const { return code_convert_map.find(code)->second; }
size_t byte_size() { return dict_data.size() * sizeof(dict_data[0]); }
private:
struct HashOfStringValue {
size_t operator()(const StringValue& value) const {
return HashStringThoroughly(value.ptr, value.len);
}
};
StringValue::Comparator comparator;
// dict code -> dict value
DictContainer dict_data;
// dict value -> dict code
phmap::flat_hash_map<StringValue, T, HashOfStringValue> inverted_index;
// data page code -> sorted dict code, only used for range comparison predicate
phmap::flat_hash_map<T, T> code_convert_map;
};
private:
// Set by insert_many_dict_data() once the dictionary values have been loaded.
bool _dict_inited = false;
// Set by sort_dict() after the dictionary values have been sorted.
bool _dict_sorted = false;
// Set by convert_dict_codes() after `codes` has been remapped to sorted codes.
bool _dict_code_converted = false;
// Code <-> string value mapping for this column.
Dictionary dict;
// One dictionary code per row.
Container codes;
};
} // namespace doris::vectorized

View File

@ -99,9 +99,10 @@ public:
get_nested_column().insert_many_fix_len_data(pos, num);
}
void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num) override {
get_null_map_column().fill(0, num);
get_nested_column().insert_many_dict_data(data_array, start_index, dict, num);
// Forwards a batch of dictionary-encoded rows to the nested column, passing all
// arguments through unchanged, after calling fill(0, data_num) on the null map.
// NOTE(review): presumably fill() appends data_num entries and 0 marks a row as
// not null - confirm against the null-map column implementation.
void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict,
size_t data_num, uint32_t dict_num) override {
get_null_map_column().fill(0, data_num);
get_nested_column().insert_many_dict_data(data_array, start_index, dict, data_num, dict_num);
}
void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override {

View File

@ -165,7 +165,8 @@ public:
}
};
void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num) override {
void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict,
size_t num, uint32_t /*dict_num*/) override {
for (size_t end_index = start_index+num; start_index < end_index; ++start_index) {
int32_t codeword = data_array[start_index];
insert_data(dict[codeword].data, dict[codeword].size);

View File

@ -217,7 +217,8 @@ public:
}
}
void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num) override {
void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict,
size_t num, uint32_t /*dict_num*/) override {
if constexpr (std::is_same_v<T, StringValue>) {
for (size_t end_index = start_index+num; start_index < end_index; ++start_index) {
int32_t codeword = data_array[start_index];