[Bug][Vectorized] Fix core bug of segment vectorized (#8800)

* [Bug][Vectorized] Fix core bug of segment vectorized
1. Read table with delete condition
2. Read table with default value HLL/Bitmap Column

* refactor some code

Co-authored-by: lihaopeng <lihaopeng@baidu.com>
This commit is contained in:
HappenLee
2022-04-03 19:50:25 +08:00
committed by GitHub
parent 33736e45fa
commit fcefed7c1c
5 changed files with 91 additions and 50 deletions

View File

@ -775,39 +775,58 @@ Status DefaultValueColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, b
// Appends `n` copies of this iterator's default value to `dst`.
//
// NOTE(review): this span comes from a rendered diff and appears to interleave the removed
// if/else-if implementation with the added switch-based one (e.g. `data_ptr` is declared on
// two consecutive lines and the braces do not balance), so it is not compilable as shown.
// Confirm every statement against the committed file before relying on it.
void DefaultValueColumnIterator::insert_default_data(vectorized::MutableColumnPtr &dst, size_t n) {
// Scratch value that holds the converted default for DATE / DATETIME / DECIMAL.
vectorized::Int128 int128;
// By default the bytes to insert are the scratch value above; the fallback branch
// re-points data_ptr/data_len at _mem_value/_type_size instead.
char* data_ptr = (char*)&int128;
char* data_ptr = (char *) &int128;
size_t data_len = sizeof(int128);
auto type = _type_info->type();
if (type == OLAP_FIELD_TYPE_DATE) {
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::CppType)); //uint24_t
std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::to_string(_mem_value);
// Inserts the data_len bytes at data_ptr into dst, n times. Everything is captured by
// reference, so the lambda sees later changes to int128, data_ptr and data_len.
auto insert_column_data = [&]() {
for (size_t i = 0; i < n; ++i) {
dst->insert_data(data_ptr, data_len);
}
};
vectorized::VecDateTimeValue value;
value.from_date_str(str.c_str(), str.length());
value.cast_to_date();
//TODO: here is int128 = int64
int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
} else if (type == OLAP_FIELD_TYPE_DATETIME) {
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::CppType)); //int64_t
std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::to_string(_mem_value);
switch (_type_info->type()) {
// OBJECT and HLL columns do not copy _mem_value; they receive n entries via
// insert_many_defaults() instead.
case OLAP_FIELD_TYPE_OBJECT:
case OLAP_FIELD_TYPE_HLL:{
dst->insert_many_defaults(n);
break;
}
vectorized::VecDateTimeValue value;
value.from_date_str(str.c_str(), str.length());
value.to_datetime();
// DATE: render the stored default as a string, parse it into a VecDateTimeValue,
// cast it to a date and insert its Int64 bit pattern.
case OLAP_FIELD_TYPE_DATE: {
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::CppType)); //uint24_t
std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::to_string(_mem_value);
int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
} else if (type == OLAP_FIELD_TYPE_DECIMAL) {
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL>::CppType)); //decimal12_t
decimal12_t* d = (decimal12_t*)_mem_value;
int128 = DecimalV2Value(d->integer, d->fraction).value();
} else {
data_ptr = (char*)_mem_value;
data_len = _type_size;
}
vectorized::VecDateTimeValue value;
value.from_date_str(str.c_str(), str.length());
value.cast_to_date();
//TODO: here is int128 = int64, here rely on the logic of little endian
int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
insert_column_data();
break;
}
// DATETIME: same string round trip as DATE, but normalized with to_datetime().
case OLAP_FIELD_TYPE_DATETIME: {
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::CppType)); //int64_t
std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::to_string(_mem_value);
for (size_t i = 0; i < n; ++i) {
dst->insert_data(data_ptr, data_len);
vectorized::VecDateTimeValue value;
value.from_date_str(str.c_str(), str.length());
value.to_datetime();
int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
insert_column_data();
break;
}
// DECIMAL: _mem_value is read as a decimal12_t and converted through DecimalV2Value
// built from its integer and fraction parts.
case OLAP_FIELD_TYPE_DECIMAL: {
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL>::CppType)); //decimal12_t
decimal12_t *d = (decimal12_t *) _mem_value;
int128 = DecimalV2Value(d->integer, d->fraction).value();
insert_column_data();
break;
}
// Every other type inserts _type_size raw bytes taken directly from _mem_value.
default: {
data_ptr = (char *) _mem_value;
data_len = _type_size;
insert_column_data();
}
}
}

View File

@ -134,6 +134,7 @@ Status SegmentIterator::_init(bool is_vec) {
RETURN_IF_ERROR(_get_row_ranges_by_column_conditions());
if (is_vec) {
_vec_init_lazy_materialization();
_vec_init_char_column_id();
} else {
_init_lazy_materialization();
}
@ -707,6 +708,17 @@ void SegmentIterator::_vec_init_lazy_materialization() {
}
}
// Collects into _char_type_idx the position (index into the schema's column-id list,
// not the column id itself) of every column whose type is OLAP_FIELD_TYPE_CHAR.
void SegmentIterator::_vec_init_char_column_id() {
    for (size_t pos = 0; pos < _schema.num_column_ids(); ++pos) {
        auto field = _schema.column(_schema.column_id(pos));
        if (field->type() != OLAP_FIELD_TYPE_CHAR) {
            continue; // only CHAR columns are recorded
        }
        _char_type_idx.emplace_back(pos);
    }
}
Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids,
vectorized::MutableColumns& column_block, size_t nrows) {
for (auto cid : column_ids) {
@ -720,8 +732,6 @@ Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids,
void SegmentIterator::_init_current_block(
vectorized::Block* block, std::vector<vectorized::MutableColumnPtr>& current_columns) {
_char_type_idx.clear();
bool is_block_mem_reuse = block->mem_reuse();
if (is_block_mem_reuse) {
block->clear_column_data(_schema.num_column_ids());
@ -738,11 +748,7 @@ void SegmentIterator::_init_current_block(
auto cid = _schema.column_id(i);
auto column_desc = _schema.column(cid);
if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) {
_char_type_idx.emplace_back(i);
}
if (_is_pred_column[cid]) { //todo(wb) maybe we can relase it after output block
if (_is_pred_column[cid]) { //todo(wb) maybe we can release it after output block
current_columns[cid]->clear();
} else { // non-predicate column
if (is_block_mem_reuse) {

View File

@ -79,6 +79,10 @@ private:
void _init_lazy_materialization();
void _vec_init_lazy_materialization();
// TODO: Fix Me
// In the storage layer, CHAR values are padded with 0 bytes up to the column length, but the query engine needs to ignore that padding.
// So the segment iterator has to shrink CHAR columns before outputting them. Only used by the vectorized query engine.
void _vec_init_char_column_id();
// Returns the id reported by the underlying segment (_segment->id()).
uint32_t segment_id() const { return _segment->id(); }
// Returns the row count reported by the underlying segment (_segment->num_rows()).
uint32_t num_rows() const { return _segment->num_rows(); }

View File

@ -924,22 +924,24 @@ std::unique_ptr<Block> Block::create_same_struct_block(size_t size) const {
return temp_block;
}
// For every block position listed in `char_type_idx`, replaces the column at that position
// with the result of get_shinked_column(). A nullable column has its nested column shrunk
// and is then rebuilt with ColumnNullable::create() using its existing null-map column.
//
// NOTE(review): this span comes from a rendered diff and appears to interleave the removed
// version (by-value parameter, no bounds check) with the added one (const-reference
// parameter, positions >= data.size() skipped); it is not compilable as shown. Confirm
// against the committed file.
// NOTE(review): the reinterpret_casts perform no type check; the columns are presumably
// ColumnString (or ColumnNullable wrapping ColumnString) — verify against callers.
void Block::shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx) {
void Block::shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx) {
for (auto idx : char_type_idx) {
if (this->get_by_position(idx).column->is_nullable()) {
this->get_by_position(idx).column = ColumnNullable::create(
reinterpret_cast<const ColumnString*>(
reinterpret_cast<const ColumnNullable*>(
this->get_by_position(idx).column.get())
->get_nested_column_ptr()
.get())
->get_shinked_column(),
reinterpret_cast<const ColumnNullable*>(this->get_by_position(idx).column.get())
->get_null_map_column_ptr());
} else {
this->get_by_position(idx).column =
reinterpret_cast<const ColumnString*>(this->get_by_position(idx).column.get())
->get_shinked_column();
// Positions that are not present in this block are skipped.
if (idx < data.size()) {
if (this->get_by_position(idx).column->is_nullable()) {
// Nullable: shrink the nested column and keep the existing null map.
this->get_by_position(idx).column = ColumnNullable::create(
reinterpret_cast<const ColumnString *>(
reinterpret_cast<const ColumnNullable *>(
this->get_by_position(idx).column.get())
->get_nested_column_ptr()
.get())
->get_shinked_column(),
reinterpret_cast<const ColumnNullable *>(this->get_by_position(idx).column.get())
->get_null_map_column_ptr());
} else {
// Non-nullable: the column itself is replaced by its shrunk copy.
this->get_by_position(idx).column =
reinterpret_cast<const ColumnString *>(this->get_by_position(idx).column.get())
->get_shinked_column();
}
}
}
}

View File

@ -104,6 +104,16 @@ public:
doris::vectorized::IColumn* input_col_ptr,
uint16_t* sel_rowid_idx, uint16_t select_size, int block_cid,
size_t batch_size) {
// Columns that are materialized only for an additional delete filter condition are placed at the end of the block.
// We should not materialize columns that the query engine does not need, so here we just return OK.
// Eg:
// `delete from table where a = 10;`
// `select b from table;`
// Column a is only used inside the segment iterator; the block from the query engine only contains column b,
// so `block_cid >= data.size()` is true for column a.
if (block_cid >= data.size())
return Status::OK();
if (is_block_mem_reuse) {
auto* raw_res_ptr = this->get_by_position(block_cid).column.get();
const_cast<doris::vectorized::IColumn*>(raw_res_ptr)->reserve(batch_size);
@ -296,7 +306,7 @@ public:
doris::Tuple* deep_copy_tuple(const TupleDescriptor&, MemPool*, int, int,
bool padding_char = false);
void shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx);
void shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx);
private:
void erase_impl(size_t position);