Files
doris/be/src/vec/olap/vgeneric_iterators.cpp
Mingyu Chen c0e59e59aa [fix][refactor] fix bugs and refactor some code by lint (#7871)
1. Fix some `passedByValue` issues.
2. Fix some `dereferenceBeforeCheck` issues.
3. Fix some `uninitMemberVar` issues.
4. Fix some iterator `eraseDereference` issues.
5. Fix compile issue introduced from #7923 #7905 #7848
2022-02-01 14:31:14 +08:00

444 lines
14 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <queue>
#include <utility>
#include "olap/iterators.h"
#include "olap/row.h"
#include "olap/row_block2.h"
#include "olap/row_cursor_cell.h"
namespace doris {
namespace vectorized {
// This iterator generates ordered data. For example, for schema
// (int, int) this iterator will generate data like
// (0, 1), (1, 2), (2, 3), (3, 4)...
//
// Usage:
// Schema schema;
// VAutoIncrementIterator iter(schema, 1000);
// StorageReadOptions opts;
// RETURN_IF_ERROR(iter.init(opts));
// vectorized::Block block;
// do {
// st = iter.next_batch(&block);
// } while (st.ok());
class VAutoIncrementIterator : public RowwiseIterator {
public:
// Will generate num_rows rows in total
VAutoIncrementIterator(const Schema& schema, size_t num_rows)
: _schema(schema), _num_rows(num_rows), _rows_returned() {}
~VAutoIncrementIterator() override {}
// NOTE: Currently, this function will ignore StorageReadOptions
Status init(const StorageReadOptions& opts) override;
Status next_batch(vectorized::Block* block) override {
int row_idx = 0;
while (_rows_returned < _num_rows) {
for (int j = 0; j < _schema.num_columns(); ++j) {
vectorized::ColumnWithTypeAndName& vc = block->get_by_position(j);
vectorized::IColumn& vi = (vectorized::IColumn&)(*vc.column);
char data[16] = {};
size_t data_len = 0;
const auto* col_schema = _schema.column(j);
switch (col_schema->type()) {
case OLAP_FIELD_TYPE_SMALLINT:
*(int16_t*)data = _rows_returned + j;
data_len = sizeof(int16_t);
break;
case OLAP_FIELD_TYPE_INT:
*(int32_t*)data = _rows_returned + j;
data_len = sizeof(int32_t);
break;
case OLAP_FIELD_TYPE_BIGINT:
*(int64_t*)data = _rows_returned + j;
data_len = sizeof(int64_t);
break;
case OLAP_FIELD_TYPE_FLOAT:
*(float*)data = _rows_returned + j;
data_len = sizeof(float);
break;
case OLAP_FIELD_TYPE_DOUBLE:
*(double*)data = _rows_returned + j;
data_len = sizeof(double);
break;
default:
break;
}
vi.insert_data(data, data_len);
}
++row_idx;
++_rows_returned;
}
if (row_idx > 0)
return Status::OK();
return Status::EndOfFile("End of VAutoIncrementIterator");
}
const Schema& schema() const override { return _schema; }
private:
Schema _schema;
size_t _num_rows;
size_t _rows_returned;
};
// There is nothing to prepare: the read options are accepted only to satisfy
// the RowwiseIterator interface and are not looked at.
Status VAutoIncrementIterator::init(const StorageReadOptions& /*opts*/) {
    return Status::OK();
}
// Used to store merge state for a VMergeIterator input.
// This class iterates over all data of the internal iterator
// as the client calls advance().
// Usage:
// VMergeIteratorContext ctx(iter);
// RETURN_IF_ERROR(ctx.init(opts));
// while (ctx.valid()) {
// ctx.copy_row_to(&block);
// RETURN_IF_ERROR(ctx.advance());
// }
class VMergeIteratorContext {
public:
VMergeIteratorContext(RowwiseIterator* iter) : _iter(iter) {}
VMergeIteratorContext(const VMergeIteratorContext&) = delete;
VMergeIteratorContext(VMergeIteratorContext&&) = delete;
VMergeIteratorContext& operator=(const VMergeIteratorContext&) = delete;
VMergeIteratorContext& operator=(VMergeIteratorContext&&) = delete;
~VMergeIteratorContext() {
delete _iter;
_iter = nullptr;
}
Status block_reset()
{
if (!_block) {
const Schema& schema = _iter->schema();
for (auto &column_desc : schema.columns()) {
auto data_type = Schema::get_data_type_ptr(column_desc->type());
if (data_type == nullptr) {
return Status::RuntimeError("invalid data type");
}
_block.insert(ColumnWithTypeAndName(data_type->create_column(), data_type, column_desc->name()));
}
} else {
_block.clear_column_data();
}
return Status::OK();
}
// Initialize this context and will prepare data for current_row()
Status init(const StorageReadOptions& opts);
int compare_row(const VMergeIteratorContext& rhs) const {
const Schema& schema = _iter->schema();
int num = schema.num_key_columns();
for (uint32_t cid = 0; cid < num; ++cid) {
#if 0
auto name = schema.column(cid)->name();
auto l_col = this->_block.get_by_name(name);
auto r_col = rhs._block.get_by_name(name);
#else
//because the columns of block will be inserted by cid asc order
//so no need to get column by get_by_name()
auto l_col = this->_block.get_by_position(cid);
auto r_col = rhs._block.get_by_position(cid);
#endif
auto l_cp = l_col.column;
auto r_cp = r_col.column;
auto res = l_cp->compare_at(_index_in_block, rhs._index_in_block, *r_cp, -1);
if (res) {
return res;
}
}
return 0;
}
bool compare(const VMergeIteratorContext& rhs) const {
int cmp_res = this->compare_row(rhs);
if (cmp_res != 0) {
return cmp_res > 0;
}
return this->data_id() < rhs.data_id();
}
void copy_row_to(vectorized::Block* block) {
vectorized::Block& src = _block;
vectorized::Block& dst = *block;
auto columns = _iter->schema().columns();
for (size_t i = 0; i < columns.size(); ++i) {
vectorized::ColumnWithTypeAndName s_col = src.get_by_position(i);
vectorized::ColumnWithTypeAndName d_col = dst.get_by_position(i);
vectorized::ColumnPtr s_cp = s_col.column;
vectorized::ColumnPtr d_cp = d_col.column;
//copy a row to dst block column by column
((vectorized::IColumn&)(*d_cp)).insert_range_from(*s_cp, _index_in_block, 1);
}
}
// Advance internal row index to next valid row
// Return error if error happens
// Don't call this when valid() is false, action is undefined
Status advance();
// Return if has remaining data in this context.
// Only when this function return true, current_row()
// will return a valid row
bool valid() const { return _valid; }
uint64_t data_id() const { return _iter->data_id(); }
private:
// Load next block into _block
Status _load_next_block();
private:
RowwiseIterator* _iter;
// used to store data load from iteerator->next_batch(Vectorized::Block*)
vectorized::Block _block;
bool _valid = false;
size_t _index_in_block = -1;
};
// Initializes the wrapped iterator, builds the (empty) block, loads the first
// non-empty batch and, if there is one, steps onto its first row.
Status VMergeIteratorContext::init(const StorageReadOptions& opts) {
    RETURN_IF_ERROR(_iter->init(opts));
    RETURN_IF_ERROR(block_reset());
    RETURN_IF_ERROR(_load_next_block());
    if (!valid()) {
        // The input produced no data at all; nothing to position on.
        return Status::OK();
    }
    // _load_next_block() leaves the index just before row 0.
    return advance();
}
// Moves to the next row, loading further batches when the current one is
// used up. Once the input is exhausted the context becomes invalid and OK is
// still returned; only real read errors are propagated.
Status VMergeIteratorContext::advance() {
    for (;;) {
        // Step first, then check: the stepped-to index is the candidate row.
        ++_index_in_block;
        if (_index_in_block < _block.rows()) {
            return Status::OK();
        }
        // The current batch is used up, fetch the next one.
        RETURN_IF_ERROR(_load_next_block());
        if (!_valid) {
            return Status::OK();
        }
    }
}
// Reads batches from the wrapped iterator into _block until a non-empty one
// arrives.
// On success the row index is left just before row 0 and the context is valid.
// When the iterator reports end-of-file the context is marked invalid and OK
// is returned; any other failure (including a failed block reset) marks the
// context invalid and is returned to the caller.
Status VMergeIteratorContext::_load_next_block() {
    do {
        // The result of block_reset() must not be dropped: reading into a
        // block that could not be prepared would hide the original error.
        Status st = block_reset();
        if (st.ok()) {
            st = _iter->next_batch(&_block);
        }
        if (!st.ok()) {
            _valid = false;
            if (st.is_end_of_file()) {
                return Status::OK();
            }
            return st;
        }
    } while (_block.rows() == 0);
    // Wraps to the maximum size_t so that the next increment yields row 0.
    _index_in_block = -1;
    _valid = true;
    return Status::OK();
}
class VMergeIterator : public RowwiseIterator {
public:
// VMergeIterator takes the ownership of input iterators
VMergeIterator(std::vector<RowwiseIterator*>& iters, std::shared_ptr<MemTracker> parent) : _origin_iters(iters) {
// use for count the mem use of Block use in Merge
_mem_tracker = MemTracker::CreateTracker(-1, "VMergeIterator", parent, false);
}
~VMergeIterator() override {
while (!_merge_heap.empty()) {
auto ctx = _merge_heap.top();
_merge_heap.pop();
delete ctx;
}
}
Status init(const StorageReadOptions& opts) override;
Status next_batch(vectorized::Block* block) override;
const Schema& schema() const override { return *_schema; }
private:
// It will be released after '_merge_heap' has been built.
std::vector<RowwiseIterator*> _origin_iters;
std::unique_ptr<Schema> _schema;
struct VMergeContextComparator {
bool operator()(const VMergeIteratorContext* lhs, const VMergeIteratorContext* rhs) const {
return lhs->compare(*rhs);
}
};
using VMergeHeap = std::priority_queue<VMergeIteratorContext*,
std::vector<VMergeIteratorContext*>,
VMergeContextComparator>;
VMergeHeap _merge_heap;
int block_row_max = 0;
};
// Wraps every input iterator in a VMergeIteratorContext, initializes it and
// queues the ones that have data in the merge heap.
// The schema is copied from the first input. Inputs without any row are
// destroyed right away. With no inputs at all this is a no-op returning OK.
// If initializing an input fails, that error is returned and all inputs that
// were not yet handed to a context are destroyed as well, because this class
// owns them and the destructor only frees what is in the heap.
Status VMergeIterator::init(const StorageReadOptions& opts) {
    if (_origin_iters.empty()) {
        return Status::OK();
    }
    _schema.reset(new Schema(_origin_iters.front()->schema()));

    const size_t num_inputs = _origin_iters.size();
    for (size_t i = 0; i < num_inputs; ++i) {
        // The context owns the input from here on.
        std::unique_ptr<VMergeIteratorContext> ctx(new VMergeIteratorContext(_origin_iters[i]));
        Status st = ctx->init(opts);
        if (!st.ok()) {
            // Input i is freed together with `ctx`; inputs before i are either
            // in the heap or already freed. Release the untouched remainder so
            // nothing leaks and no stale pointer stays behind.
            for (size_t j = i + 1; j < num_inputs; ++j) {
                delete _origin_iters[j];
            }
            _origin_iters.clear();
            return st;
        }
        if (ctx->valid()) {
            _merge_heap.push(ctx.release());
        }
        // Otherwise `ctx` (and its empty input) is destroyed at end of scope.
    }

    _origin_iters.clear();
    block_row_max = opts.block_row_max;
    return Status::OK();
}
// Appends merged rows to `block` until it holds block_row_max rows or all
// inputs are exhausted.
// Returns OK while inputs still hold rows, so the caller knows to ask again.
// Returns EndOfFile once every input is exhausted; the final rows may have
// been appended to `block` by that same call, so the caller should consume the
// block before acting on EndOfFile.
// Any error from advancing an input is returned as is.
Status VMergeIterator::next_batch(vectorized::Block* block) {
    while (block->rows() < block_row_max && !_merge_heap.empty()) {
        VMergeIteratorContext* ctx = _merge_heap.top();
        _merge_heap.pop();

        // copy current row to block
        ctx->copy_row_to(block);

        Status st = ctx->advance();
        if (!st.ok()) {
            // `ctx` is no longer in the heap, so the destructor would not see
            // it; free it here instead of leaking it.
            delete ctx;
            return st;
        }
        if (ctx->valid()) {
            _merge_heap.push(ctx);
        } else {
            // Release ctx earlier to reduce resource consumed
            delete ctx;
        }
    }

    if (!_merge_heap.empty()) {
        // The block filled up while inputs still hold rows.
        return Status::OK();
    }
    return Status::EndOfFile("no more data in segment");
}
// VUnionIterator will read data from input iterator one by one.
class VUnionIterator : public RowwiseIterator {
public:
// Iterators' ownership it transfered to this class.
// This class will delete all iterators when destructs
// Client should not use iterators any more.
VUnionIterator(std::vector<RowwiseIterator*>& v, std::shared_ptr<MemTracker> parent)
: _origin_iters(v.begin(), v.end()) {
_mem_tracker = MemTracker::CreateTracker(-1, "VUnionIterator", parent, false);
}
~VUnionIterator() override {
std::for_each(_origin_iters.begin(), _origin_iters.end(), std::default_delete<RowwiseIterator>());
}
Status init(const StorageReadOptions& opts) override;
Status next_batch(vectorized::Block* block) override;
const Schema& schema() const override { return *_schema; }
private:
std::unique_ptr<Schema> _schema;
RowwiseIterator* _cur_iter = nullptr;
std::deque<RowwiseIterator*> _origin_iters;
};
// Initializes all inputs with `opts`, then takes the schema from the first
// input and makes it the current one. Without inputs this is a no-op.
// Stops at, and returns, the first initialization error.
Status VUnionIterator::init(const StorageReadOptions& opts) {
    if (!_origin_iters.empty()) {
        for (RowwiseIterator* child : _origin_iters) {
            RETURN_IF_ERROR(child->init(opts));
        }
        RowwiseIterator* first = _origin_iters.front();
        _schema.reset(new Schema(first->schema()));
        _cur_iter = first;
    }
    return Status::OK();
}
// Reads the next batch from the current input. An input that reports
// end-of-file is deleted and reading moves on to the following input; any
// other status (OK or an error) is handed straight back to the caller.
// Returns EndOfFile once no input is left.
Status VUnionIterator::next_batch(vectorized::Block* block) {
    while (_cur_iter != nullptr) {
        auto st = _cur_iter->next_batch(block);
        if (!st.is_end_of_file()) {
            return st;
        }
        // The current input is drained: drop it and switch to the next one.
        delete _cur_iter;
        _origin_iters.pop_front();
        _cur_iter = _origin_iters.empty() ? nullptr : _origin_iters.front();
    }
    return Status::EndOfFile("End of VUnionIterator");
}
// Builds an iterator that merges `inputs`.
// A single input needs no merging and is returned unchanged; in every other
// case a new VMergeIterator over `inputs` is allocated.
RowwiseIterator* new_merge_iterator(std::vector<RowwiseIterator*>& inputs,
                                    std::shared_ptr<MemTracker> parent) {
    if (inputs.size() != 1) {
        return new VMergeIterator(inputs, parent);
    }
    return inputs.front();
}
// Builds an iterator that reads `inputs` one after another.
// A single input is returned unchanged; in every other case a new
// VUnionIterator over `inputs` is allocated.
RowwiseIterator* new_union_iterator(std::vector<RowwiseIterator*>& inputs,
                                    std::shared_ptr<MemTracker> parent) {
    if (inputs.size() != 1) {
        return new VUnionIterator(inputs, parent);
    }
    return inputs.front();
}
// Allocates a VAutoIncrementIterator that generates `num_rows` rows for
// `schema`. The returned pointer is a plain heap allocation.
RowwiseIterator* new_auto_increment_iterator(const Schema& schema, size_t num_rows) {
    RowwiseIterator* generator = new VAutoIncrementIterator(schema, num_rows);
    return generator;
}
}
} // namespace doris