[Bug][Vectorized] Fix core bug of segment vectorized (#8800)
* [Bug][Vectorized] Fix core bug of segment vectorized: (1) reading a table with a delete condition; (2) reading a table with a default-value HLL/Bitmap column. * Refactor some code. Co-authored-by: lihaopeng <lihaopeng@baidu.com>
This commit is contained in:
@ -775,39 +775,58 @@ Status DefaultValueColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, b
|
||||
|
||||
void DefaultValueColumnIterator::insert_default_data(vectorized::MutableColumnPtr &dst, size_t n) {
|
||||
vectorized::Int128 int128;
|
||||
char* data_ptr = (char*)&int128;
|
||||
char* data_ptr = (char *) &int128;
|
||||
size_t data_len = sizeof(int128);
|
||||
|
||||
auto type = _type_info->type();
|
||||
if (type == OLAP_FIELD_TYPE_DATE) {
|
||||
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::CppType)); //uint24_t
|
||||
std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::to_string(_mem_value);
|
||||
auto insert_column_data = [&]() {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
dst->insert_data(data_ptr, data_len);
|
||||
}
|
||||
};
|
||||
|
||||
vectorized::VecDateTimeValue value;
|
||||
value.from_date_str(str.c_str(), str.length());
|
||||
value.cast_to_date();
|
||||
//TODO: here is int128 = int64
|
||||
int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
|
||||
} else if (type == OLAP_FIELD_TYPE_DATETIME) {
|
||||
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::CppType)); //int64_t
|
||||
std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::to_string(_mem_value);
|
||||
switch (_type_info->type()) {
|
||||
case OLAP_FIELD_TYPE_OBJECT:
|
||||
case OLAP_FIELD_TYPE_HLL:{
|
||||
dst->insert_many_defaults(n);
|
||||
break;
|
||||
}
|
||||
|
||||
vectorized::VecDateTimeValue value;
|
||||
value.from_date_str(str.c_str(), str.length());
|
||||
value.to_datetime();
|
||||
case OLAP_FIELD_TYPE_DATE: {
|
||||
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::CppType)); //uint24_t
|
||||
std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::to_string(_mem_value);
|
||||
|
||||
int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
|
||||
} else if (type == OLAP_FIELD_TYPE_DECIMAL) {
|
||||
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL>::CppType)); //decimal12_t
|
||||
decimal12_t* d = (decimal12_t*)_mem_value;
|
||||
int128 = DecimalV2Value(d->integer, d->fraction).value();
|
||||
} else {
|
||||
data_ptr = (char*)_mem_value;
|
||||
data_len = _type_size;
|
||||
}
|
||||
vectorized::VecDateTimeValue value;
|
||||
value.from_date_str(str.c_str(), str.length());
|
||||
value.cast_to_date();
|
||||
//TODO: here is int128 = int64, here rely on the logic of little endian
|
||||
int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
|
||||
insert_column_data();
|
||||
break;
|
||||
}
|
||||
case OLAP_FIELD_TYPE_DATETIME: {
|
||||
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::CppType)); //int64_t
|
||||
std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::to_string(_mem_value);
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
dst->insert_data(data_ptr, data_len);
|
||||
vectorized::VecDateTimeValue value;
|
||||
value.from_date_str(str.c_str(), str.length());
|
||||
value.to_datetime();
|
||||
|
||||
int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
|
||||
insert_column_data();
|
||||
break;
|
||||
}
|
||||
case OLAP_FIELD_TYPE_DECIMAL: {
|
||||
assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL>::CppType)); //decimal12_t
|
||||
decimal12_t *d = (decimal12_t *) _mem_value;
|
||||
int128 = DecimalV2Value(d->integer, d->fraction).value();
|
||||
insert_column_data();
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
data_ptr = (char *) _mem_value;
|
||||
data_len = _type_size;
|
||||
insert_column_data();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -134,6 +134,7 @@ Status SegmentIterator::_init(bool is_vec) {
|
||||
RETURN_IF_ERROR(_get_row_ranges_by_column_conditions());
|
||||
if (is_vec) {
|
||||
_vec_init_lazy_materialization();
|
||||
_vec_init_char_column_id();
|
||||
} else {
|
||||
_init_lazy_materialization();
|
||||
}
|
||||
@ -707,6 +708,17 @@ void SegmentIterator::_vec_init_lazy_materialization() {
|
||||
}
|
||||
}
|
||||
|
||||
void SegmentIterator::_vec_init_char_column_id() {
|
||||
for (size_t i = 0; i < _schema.num_column_ids(); i++) {
|
||||
auto cid = _schema.column_id(i);
|
||||
auto column_desc = _schema.column(cid);
|
||||
|
||||
if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) {
|
||||
_char_type_idx.emplace_back(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids,
|
||||
vectorized::MutableColumns& column_block, size_t nrows) {
|
||||
for (auto cid : column_ids) {
|
||||
@ -720,8 +732,6 @@ Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids,
|
||||
|
||||
void SegmentIterator::_init_current_block(
|
||||
vectorized::Block* block, std::vector<vectorized::MutableColumnPtr>& current_columns) {
|
||||
_char_type_idx.clear();
|
||||
|
||||
bool is_block_mem_reuse = block->mem_reuse();
|
||||
if (is_block_mem_reuse) {
|
||||
block->clear_column_data(_schema.num_column_ids());
|
||||
@ -738,11 +748,7 @@ void SegmentIterator::_init_current_block(
|
||||
auto cid = _schema.column_id(i);
|
||||
auto column_desc = _schema.column(cid);
|
||||
|
||||
if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) {
|
||||
_char_type_idx.emplace_back(i);
|
||||
}
|
||||
|
||||
if (_is_pred_column[cid]) { //todo(wb) maybe we can relase it after output block
|
||||
if (_is_pred_column[cid]) { //todo(wb) maybe we can release it after output block
|
||||
current_columns[cid]->clear();
|
||||
} else { // non-predicate column
|
||||
if (is_block_mem_reuse) {
|
||||
|
||||
@ -79,6 +79,10 @@ private:
|
||||
|
||||
void _init_lazy_materialization();
|
||||
void _vec_init_lazy_materialization();
|
||||
// TODO: Fix Me
|
||||
// In the storage layer, CHAR values are padded with 0 up to the column length, but the query engine needs to ignore the padding 0,
|
||||
// so the segment iterator needs to shrink CHAR columns before outputting them. Only used in the vectorized query engine.
|
||||
void _vec_init_char_column_id();
|
||||
|
||||
uint32_t segment_id() const { return _segment->id(); }
|
||||
uint32_t num_rows() const { return _segment->num_rows(); }
|
||||
|
||||
@ -924,22 +924,24 @@ std::unique_ptr<Block> Block::create_same_struct_block(size_t size) const {
|
||||
return temp_block;
|
||||
}
|
||||
|
||||
void Block::shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx) {
|
||||
void Block::shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx) {
|
||||
for (auto idx : char_type_idx) {
|
||||
if (this->get_by_position(idx).column->is_nullable()) {
|
||||
this->get_by_position(idx).column = ColumnNullable::create(
|
||||
reinterpret_cast<const ColumnString*>(
|
||||
reinterpret_cast<const ColumnNullable*>(
|
||||
this->get_by_position(idx).column.get())
|
||||
->get_nested_column_ptr()
|
||||
.get())
|
||||
->get_shinked_column(),
|
||||
reinterpret_cast<const ColumnNullable*>(this->get_by_position(idx).column.get())
|
||||
->get_null_map_column_ptr());
|
||||
} else {
|
||||
this->get_by_position(idx).column =
|
||||
reinterpret_cast<const ColumnString*>(this->get_by_position(idx).column.get())
|
||||
->get_shinked_column();
|
||||
if (idx < data.size()) {
|
||||
if (this->get_by_position(idx).column->is_nullable()) {
|
||||
this->get_by_position(idx).column = ColumnNullable::create(
|
||||
reinterpret_cast<const ColumnString *>(
|
||||
reinterpret_cast<const ColumnNullable *>(
|
||||
this->get_by_position(idx).column.get())
|
||||
->get_nested_column_ptr()
|
||||
.get())
|
||||
->get_shinked_column(),
|
||||
reinterpret_cast<const ColumnNullable *>(this->get_by_position(idx).column.get())
|
||||
->get_null_map_column_ptr());
|
||||
} else {
|
||||
this->get_by_position(idx).column =
|
||||
reinterpret_cast<const ColumnString *>(this->get_by_position(idx).column.get())
|
||||
->get_shinked_column();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -104,6 +104,16 @@ public:
|
||||
doris::vectorized::IColumn* input_col_ptr,
|
||||
uint16_t* sel_rowid_idx, uint16_t select_size, int block_cid,
|
||||
size_t batch_size) {
|
||||
// Columns that only the additional delete filter condition needs to materialize are placed at the end of the block.
|
||||
// We should not materialize columns that the query engine does not need, so just return OK here.
|
||||
// Eg:
|
||||
// `delete from table where a = 10;`
|
||||
// `select b from table;`
|
||||
// Column a is only used inside the segment iterator; the block from the query engine contains only column b,
|
||||
// so `block_cid >= data.size()` is true for column a.
|
||||
if (block_cid >= data.size())
|
||||
return Status::OK();
|
||||
|
||||
if (is_block_mem_reuse) {
|
||||
auto* raw_res_ptr = this->get_by_position(block_cid).column.get();
|
||||
const_cast<doris::vectorized::IColumn*>(raw_res_ptr)->reserve(batch_size);
|
||||
@ -296,7 +306,7 @@ public:
|
||||
doris::Tuple* deep_copy_tuple(const TupleDescriptor&, MemPool*, int, int,
|
||||
bool padding_char = false);
|
||||
|
||||
void shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx);
|
||||
void shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx);
|
||||
|
||||
private:
|
||||
void erase_impl(size_t position);
|
||||
|
||||
Reference in New Issue
Block a user