// doris/be/src/exec/analytic_eval_node.cpp
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "exec/analytic_eval_node.h"
#include "exprs/agg_fn_evaluator.h"
#include "exprs/anyval_util.h"
#include "runtime/buffered_tuple_stream.h"
#include "runtime/descriptors.h"
#include "runtime/row_batch.h"
#include "runtime/runtime_state.h"
#include "udf/udf_internal.h"
static const int MAX_TUPLE_POOL_SIZE = 8 * 1024 * 1024; // 8MB
namespace doris {
using doris_udf::BigIntVal;
AnalyticEvalNode::AnalyticEvalNode(ObjectPool* pool, const TPlanNode& tnode,
const DescriptorTbl& descs) :
ExecNode(pool, tnode, descs),
_window(tnode.analytic_node.window),
_intermediate_tuple_desc(
descs.get_tuple_descriptor(tnode.analytic_node.intermediate_tuple_id)),
_result_tuple_desc(
descs.get_tuple_descriptor(tnode.analytic_node.output_tuple_id)),
_buffered_tuple_desc(NULL),
_partition_by_eq_expr_ctx(NULL),
_order_by_eq_expr_ctx(NULL),
_rows_start_offset(0),
_rows_end_offset(0),
_has_first_val_null_offset(false),
_first_val_null_offset(0),
_last_result_idx(-1),
_prev_pool_last_result_idx(-1),
_prev_pool_last_window_idx(-1),
_curr_tuple(NULL),
_dummy_result_tuple(NULL),
_curr_partition_idx(-1),
_prev_input_row(NULL),
_input_eos(false),
_evaluation_timer(NULL) {
if (tnode.analytic_node.__isset.buffered_tuple_id) {
_buffered_tuple_desc = descs.get_tuple_descriptor(
tnode.analytic_node.buffered_tuple_id);
}
if (!tnode.analytic_node.__isset.window) {
_fn_scope = AnalyticEvalNode::PARTITION;
} else if (tnode.analytic_node.window.type == TAnalyticWindowType::RANGE) {
_fn_scope = AnalyticEvalNode::RANGE;
DCHECK(!_window.__isset.window_start)
<< "RANGE windows must have UNBOUNDED PRECEDING";
DCHECK(!_window.__isset.window_end ||
_window.window_end.type == TAnalyticWindowBoundaryType::CURRENT_ROW)
<< "RANGE window end bound must be CURRENT ROW or UNBOUNDED FOLLOWING";
} else {
DCHECK_EQ(tnode.analytic_node.window.type, TAnalyticWindowType::ROWS);
_fn_scope = AnalyticEvalNode::ROWS;
if (_window.__isset.window_start) {
TAnalyticWindowBoundary b = _window.window_start;
if (b.__isset.rows_offset_value) {
_rows_start_offset = b.rows_offset_value;
if (b.type == TAnalyticWindowBoundaryType::PRECEDING) {
_rows_start_offset *= -1;
}
} else {
DCHECK_EQ(b.type, TAnalyticWindowBoundaryType::CURRENT_ROW);
_rows_start_offset = 0;
}
}
if (_window.__isset.window_end) {
TAnalyticWindowBoundary b = _window.window_end;
if (b.__isset.rows_offset_value) {
_rows_end_offset = b.rows_offset_value;
if (b.type == TAnalyticWindowBoundaryType::PRECEDING) {
_rows_end_offset *= -1;
}
} else {
DCHECK_EQ(b.type, TAnalyticWindowBoundaryType::CURRENT_ROW);
_rows_end_offset = 0;
}
}
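// Worked example: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING yields
// _rows_start_offset = -2 and _rows_end_offset = 1, i.e. the window of the
// row at stream index i spans indices [i - 2, i + 1]. A CURRENT ROW bound
// maps to offset 0, and an unset bound means UNBOUNDED.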
}
VLOG_ROW << "tnode=" << apache::thrift::ThriftDebugString(tnode);
}
Status AnalyticEvalNode::init(const TPlanNode& tnode, RuntimeState* state) {
RETURN_IF_ERROR(ExecNode::init(tnode, state));
const TAnalyticNode& analytic_node = tnode.analytic_node;
bool has_lead_fn = false;
for (int i = 0; i < analytic_node.analytic_functions.size(); ++i) {
AggFnEvaluator* evaluator = NULL;
RETURN_IF_ERROR(AggFnEvaluator::create(
_pool, analytic_node.analytic_functions[i], true, &evaluator));
_evaluators.push_back(evaluator);
const TFunction& fn = analytic_node.analytic_functions[i].nodes[0].fn;
_is_lead_fn.push_back("lead" == fn.name.function_name);
has_lead_fn = has_lead_fn || _is_lead_fn.back();
}
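// lead() is rewritten to a ROWS window with an end bound FOLLOWING and an
// UNBOUNDED PRECEDING start (see try_add_remaining_results()), so a start
// bound must not be set. PARTITION scope implies no order by exprs, and a
// start bound may only be set together with an end bound.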
DCHECK(!has_lead_fn || !_window.__isset.window_start);
DCHECK(_fn_scope != PARTITION || analytic_node.order_by_exprs.empty());
DCHECK(_window.__isset.window_end || !_window.__isset.window_start)
<< "UNBOUNDED FOLLOWING is only supported with UNBOUNDED PRECEDING.";
if (analytic_node.__isset.partition_by_eq) {
DCHECK(analytic_node.__isset.buffered_tuple_id);
RETURN_IF_ERROR(Expr::create_expr_tree(
_pool, analytic_node.partition_by_eq, &_partition_by_eq_expr_ctx));
}
if (analytic_node.__isset.order_by_eq) {
DCHECK(analytic_node.__isset.buffered_tuple_id);
RETURN_IF_ERROR(Expr::create_expr_tree(
_pool, analytic_node.order_by_eq, &_order_by_eq_expr_ctx));
}
return Status::OK;
}
Status AnalyticEvalNode::prepare(RuntimeState* state) {
SCOPED_TIMER(_runtime_profile->total_time_counter());
RETURN_IF_ERROR(ExecNode::prepare(state));
DCHECK(child(0)->row_desc().is_prefix_of(row_desc()));
_child_tuple_desc = child(0)->row_desc().tuple_descriptors()[0];
_curr_tuple_pool.reset(new MemPool(mem_tracker()));
_prev_tuple_pool.reset(new MemPool(mem_tracker()));
_mem_pool.reset(new MemPool(mem_tracker()));
_evaluation_timer = ADD_TIMER(runtime_profile(), "EvaluationTime");
DCHECK_EQ(_result_tuple_desc->slots().size(), _evaluators.size());
for (int i = 0; i < _evaluators.size(); ++i) {
doris_udf::FunctionContext* ctx;
RETURN_IF_ERROR(_evaluators[i]->prepare(state, child(0)->row_desc(), _mem_pool.get(),
_intermediate_tuple_desc->slots()[i], _result_tuple_desc->slots()[i],
mem_tracker(), &ctx));
_fn_ctxs.push_back(ctx);
state->obj_pool()->add(ctx);
}
if (_partition_by_eq_expr_ctx != NULL || _order_by_eq_expr_ctx != NULL) {
DCHECK(_buffered_tuple_desc != NULL);
vector<TTupleId> tuple_ids;
tuple_ids.push_back(child(0)->row_desc().tuple_descriptors()[0]->id());
tuple_ids.push_back(_buffered_tuple_desc->id());
RowDescriptor cmp_row_desc(state->desc_tbl(), tuple_ids, vector<bool>(2, false));
if (_partition_by_eq_expr_ctx != NULL) {
RETURN_IF_ERROR(
_partition_by_eq_expr_ctx->prepare(state, cmp_row_desc, expr_mem_tracker()));
//AddExprCtxToFree(_partition_by_eq_expr_ctx);
}
if (_order_by_eq_expr_ctx != NULL) {
RETURN_IF_ERROR(
_order_by_eq_expr_ctx->prepare(state, cmp_row_desc, expr_mem_tracker()));
//AddExprCtxToFree(_order_by_eq_expr_ctx);
}
}
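// _child_tuple_cmp_row pairs two tuple pointers: slot 0 holds the previous
// input row's tuple and slot 1 the current row's tuple (set in
// process_child_batch()), matching cmp_row_desc above so that the equality
// expr ctxs can compare consecutive rows.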
_child_tuple_cmp_row = reinterpret_cast<TupleRow*>(
_mem_pool->allocate(sizeof(Tuple*) * 2));
return Status::OK;
}
Status AnalyticEvalNode::open(RuntimeState* state) {
SCOPED_TIMER(_runtime_profile->total_time_counter());
RETURN_IF_ERROR(ExecNode::open(state));
RETURN_IF_CANCELLED(state);
//RETURN_IF_ERROR(QueryMaintenance(state));
RETURN_IF_ERROR(child(0)->open(state));
// RETURN_IF_ERROR(state->block_mgr()->RegisterClient(2, mem_tracker(), state, &client_));
_input_stream.reset(new BufferedTupleStream(state, child(0)->row_desc(), state->block_mgr()));
RETURN_IF_ERROR(_input_stream->init(runtime_profile()));
DCHECK_EQ(_evaluators.size(), _fn_ctxs.size());
for (int i = 0; i < _evaluators.size(); ++i) {
RETURN_IF_ERROR(_evaluators[i]->open(state, _fn_ctxs[i]));
if ("first_value_rewrite" == _evaluators[i]->fn_name() &&
_fn_ctxs[i]->get_num_args() == 2) {
DCHECK(!_has_first_val_null_offset);
_first_val_null_offset =
reinterpret_cast<BigIntVal*>(_fn_ctxs[i]->get_constant_arg(1))->val;
VLOG_FILE << id() << " FIRST_VAL rewrite null offset: " << _first_val_null_offset;
_has_first_val_null_offset = true;
}
}
if (_partition_by_eq_expr_ctx != NULL) {
RETURN_IF_ERROR(_partition_by_eq_expr_ctx->open(state));
}
if (_order_by_eq_expr_ctx != NULL) {
RETURN_IF_ERROR(_order_by_eq_expr_ctx->open(state));
}
// An intermediate tuple is only allocated once and is reused.
_curr_tuple = Tuple::create(_intermediate_tuple_desc->byte_size(), _mem_pool.get());
AggFnEvaluator::init(_evaluators, _fn_ctxs, _curr_tuple);
_dummy_result_tuple = Tuple::create(_result_tuple_desc->byte_size(), _mem_pool.get());
// Initialize state for the first partition.
init_next_partition(0);
// Fetch the first input batch so that some _prev_input_row can be set here to avoid
// special casing in GetNext().
_prev_child_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker()));
_curr_child_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker()));
while (!_input_eos && _prev_input_row == NULL) {
RETURN_IF_ERROR(child(0)->get_next(state, _curr_child_batch.get(), &_input_eos));
if (_curr_child_batch->num_rows() > 0) {
_prev_input_row = _curr_child_batch->get_row(0);
process_child_batches(state);
} else {
// Empty batch, still need to reset.
_curr_child_batch->reset();
}
}
if (_prev_input_row == NULL) {
DCHECK(_input_eos);
// Delete _curr_child_batch to indicate there is no batch to process in GetNext()
_curr_child_batch.reset();
}
return Status::OK;
}
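// Returns a debug string for the given window bound, e.g. "CURRENT_ROW" or
// "3 PRECEDING".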
string debug_window_bound_string(const TAnalyticWindowBoundary& b) {
if (b.type == TAnalyticWindowBoundaryType::CURRENT_ROW) {
return "CURRENT_ROW";
}
stringstream ss;
if (b.__isset.rows_offset_value) {
ss << b.rows_offset_value;
} else {
// TODO: Return debug string when range offsets are supported
DCHECK(false) << "Range offsets not yet implemented";
}
if (b.type == TAnalyticWindowBoundaryType::PRECEDING) {
ss << " PRECEDING";
} else {
DCHECK_EQ(b.type, TAnalyticWindowBoundaryType::FOLLOWING);
ss << " FOLLOWING";
}
return ss.str();
}
std::string AnalyticEvalNode::debug_window_string() const {
std::stringstream ss;
if (_fn_scope == PARTITION) {
ss << "NO WINDOW";
return ss.str();
}
ss << "{type=";
if (_fn_scope == RANGE) {
ss << "RANGE";
} else {
ss << "ROWS";
}
ss << ", start=";
if (_window.__isset.window_start) {
ss << debug_window_bound_string(_window.window_start);
} else {
ss << "UNBOUNDED_PRECEDING";
}
ss << ", end=";
if (_window.__isset.window_end) {
ss << debug_window_bound_string(_window.window_end);
} else {
ss << "UNBOUNDED_FOLLOWING";
}
ss << "}";
return ss.str();
}
std::string AnalyticEvalNode::debug_state_string(bool detailed) const {
stringstream ss;
ss << "num_returned=" << _input_stream->rows_returned()
<< " num_rows=" << _input_stream->num_rows()
<< " _curr_partition_idx=" << _curr_partition_idx
<< " last_result_idx=" << _last_result_idx;
if (detailed) {
ss << " result_tuples idx: [";
for (std::list<std::pair<int64_t, Tuple*> >::const_iterator it = _result_tuples.begin();
it != _result_tuples.end(); ++it) {
ss << it->first;
if (*it != _result_tuples.back()) {
ss << ", ";
}
}
ss << "]";
if (_fn_scope == ROWS && _window.__isset.window_start) {
ss << " window_tuples idx: [";
for (std::list<std::pair<int64_t, Tuple*> >::const_iterator it = _window_tuples.begin();
it != _window_tuples.end(); ++it) {
ss << it->first;
if (*it != _window_tuples.back()) {
ss << ", ";
}
}
ss << "]";
}
} else {
if (_fn_scope == ROWS && _window.__isset.window_start) {
if (_window_tuples.empty()) {
ss << " window_tuples empty";
} else {
ss << " window_tuples idx range: (" << _window_tuples.front().first << ","
<< _window_tuples.back().first << ")";
}
}
if (_result_tuples.empty()) {
ss << " result_tuples empty";
} else {
ss << " result_tuples idx range: (" << _result_tuples.front().first << ","
<< _result_tuples.back().first << ")";
}
}
return ss.str();
}
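// Materializes the current values of the evaluators over _curr_tuple into a
// new result tuple for the row at 'stream_idx' and appends it to
// _result_tuples.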
void AnalyticEvalNode::add_result_tuple(int64_t stream_idx) {
VLOG_ROW << id() << " add_result_tuple idx=" << stream_idx;
DCHECK(_curr_tuple != NULL);
Tuple* result_tuple = Tuple::create(_result_tuple_desc->byte_size(),
_curr_tuple_pool.get());
AggFnEvaluator::get_value(_evaluators, _fn_ctxs, _curr_tuple, result_tuple);
DCHECK_GT(stream_idx, _last_result_idx);
_result_tuples.push_back(std::pair<int64_t, Tuple*>(stream_idx, result_tuple));
_last_result_idx = stream_idx;
VLOG_ROW << id() << " Added result tuple, final state: " << debug_state_string(true);
}
inline void AnalyticEvalNode::try_add_result_tuple_for_prev_row(bool next_partition,
int64_t stream_idx, TupleRow* row) {
// The analytic fns are finalized after the previous row if we found a new partition
// or the window is a RANGE and the order by exprs changed. For ROWS windows we do not
// need to compare the current row to the previous row.
VLOG_ROW << id() << " try_add_result_tuple_for_prev_row partition=" << next_partition
<< " idx=" << stream_idx;
if (_fn_scope == ROWS) {
return;
}
if (next_partition || (_fn_scope == RANGE && _window.__isset.window_end &&
!prev_row_compare(_order_by_eq_expr_ctx))) {
add_result_tuple(stream_idx - 1);
}
}
inline void AnalyticEvalNode::try_add_result_tuple_for_curr_row(int64_t stream_idx,
TupleRow* row) {
VLOG_ROW << id() << " try_add_result_tuple_for_curr_row idx=" << stream_idx;
// We only add results at this point for ROWS windows (unless unbounded following)
if (_fn_scope != ROWS || !_window.__isset.window_end) {
return;
}
// Nothing to add if the end offset is before the start of the partition.
if (stream_idx - _rows_end_offset < _curr_partition_idx) {
return;
}
add_result_tuple(stream_idx - _rows_end_offset);
}
inline void AnalyticEvalNode::try_remove_rows_before_window(int64_t stream_idx) {
if (_fn_scope != ROWS || !_window.__isset.window_start) {
return;
}
// The start of the window may have been before the current partition, in which case
// there is no tuple to remove in _window_tuples. Check the index of the row at which
// tuples from _window_tuples should begin to be removed.
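// For example, with ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
// (_rows_start_offset = -2, _rows_end_offset = 0) and stream_idx = 5,
// remove_idx = 5 - 0 + (-2) - 1 = 2: row 2 was in the window of result row 4
// ([2, 4]) but not in the window of result row 5 ([3, 5]), so it must be
// removed from the evaluators before row 5 is added.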
int64_t remove_idx = stream_idx - _rows_end_offset + std::min(_rows_start_offset, 0L) - 1;
if (remove_idx < _curr_partition_idx) {
return;
}
VLOG_ROW << id() << " Remove idx=" << remove_idx << " stream_idx=" << stream_idx;
DCHECK(!_window_tuples.empty()) << debug_state_string(true);
DCHECK_EQ(remove_idx + std::max(_rows_start_offset, 0L), _window_tuples.front().first)
<< debug_state_string(true);
TupleRow* remove_row = reinterpret_cast<TupleRow*>(&_window_tuples.front().second);
AggFnEvaluator::remove(_evaluators, _fn_ctxs, remove_row, _curr_tuple);
_window_tuples.pop_front();
}
inline void AnalyticEvalNode::try_add_remaining_results(int64_t partition_idx,
int64_t prev_partition_idx) {
DCHECK_LT(prev_partition_idx, partition_idx);
// For PARTITION, RANGE, or ROWS with UNBOUNDED PRECEDING: add a result tuple for the
// remaining rows in the partition that do not have an associated result tuple yet.
if (_fn_scope != ROWS || !_window.__isset.window_end) {
if (_last_result_idx < partition_idx - 1) {
add_result_tuple(partition_idx - 1);
}
return;
}
// lead() is re-written to a ROWS window with an end bound FOLLOWING. Any remaining
// results need the default value (set by Init()). If this is the case, the start bound
// is UNBOUNDED PRECEDING (DCHECK in Init()).
for (int i = 0; i < _evaluators.size(); ++i) {
if (_is_lead_fn[i]) {
_evaluators[i]->init(_fn_ctxs[i], _curr_tuple);
}
}
// If the start bound is not UNBOUNDED PRECEDING and there are still rows in the
// partition for which we need to produce result tuples, we need to continue removing
// input tuples at the start of the window from each row that we're adding results for.
VLOG_ROW << id() << " try_add_remaining_results prev_partition_idx=" << prev_partition_idx
<< " " << debug_state_string(true);
for (int64_t next_result_idx = _last_result_idx + 1; next_result_idx < partition_idx;
++next_result_idx) {
if (_window_tuples.empty()) {
break;
}
if (next_result_idx + _rows_start_offset > _window_tuples.front().first) {
DCHECK_EQ(next_result_idx + _rows_start_offset - 1, _window_tuples.front().first);
// For every tuple that is removed from the window: Remove() from the evaluators
// and add the result tuple at the next index.
VLOG_ROW << id() << " Remove window_row_idx=" << _window_tuples.front().first
<< " for result row at idx=" << next_result_idx;
TupleRow* remove_row = reinterpret_cast<TupleRow*>(&_window_tuples.front().second);
AggFnEvaluator::remove(_evaluators, _fn_ctxs, remove_row, _curr_tuple);
_window_tuples.pop_front();
}
add_result_tuple(_last_result_idx + 1);
}
// If there are still rows between the row with the last result (add_result_tuple() may
// have updated _last_result_idx) and the partition boundary, add the current results
// for the remaining rows with the same result tuple (_curr_tuple is not modified).
if (_last_result_idx < partition_idx - 1) {
add_result_tuple(partition_idx - 1);
}
}
inline void AnalyticEvalNode::init_next_partition(int64_t stream_idx) {
VLOG_FILE << id() << " init_next_partition idx=" << stream_idx;
DCHECK_LT(_curr_partition_idx, stream_idx);
int64_t prev_partition_stream_idx = _curr_partition_idx;
_curr_partition_idx = stream_idx;
// If the window has an end bound preceding the current row, we will have output
// tuples for rows beyond the partition so they should be removed. If there was only
// one result tuple left in the partition it will remain in _result_tuples because it
// is the empty result tuple (i.e. called Init() and never Update()) that was added
// when initializing the previous partition so that the first rows have the default
// values (where there are no preceding rows in the window).
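// For example, with an end bound 2 PRECEDING (_rows_end_offset = -2),
// processing the row at stream index i adds the result tuple for row i + 2,
// so when the old partition ends at row i the results for rows i + 1 and
// i + 2 overshoot into the new partition.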
bool removed_results_past_partition = false;
while (!_result_tuples.empty() && _last_result_idx >= _curr_partition_idx) {
removed_results_past_partition = true;
DCHECK(_window.__isset.window_end &&
_window.window_end.type == TAnalyticWindowBoundaryType::PRECEDING);
VLOG_ROW << id() << " Removing result past partition idx: "
<< _result_tuples.back().first;
Tuple* prev_result_tuple = _result_tuples.back().second;
_result_tuples.pop_back();
if (_result_tuples.empty() ||
_result_tuples.back().first < prev_partition_stream_idx) {
// prev_result_tuple was the last result tuple in the partition, add it back with
// the index of the last row in the partition so that all output rows in this
// partition get the default result tuple.
_result_tuples.push_back(
std::pair<int64_t, Tuple*>(_curr_partition_idx - 1, prev_result_tuple));
}
_last_result_idx = _result_tuples.back().first;
}
if (removed_results_past_partition) {
VLOG_ROW << id() << " After removing results past partition: "
<< debug_state_string(true);
DCHECK_EQ(_last_result_idx, _curr_partition_idx - 1);
DCHECK_LE(_input_stream->rows_returned(), _last_result_idx);
}
if (_fn_scope == ROWS && stream_idx > 0 && (!_window.__isset.window_end ||
_window.window_end.type == TAnalyticWindowBoundaryType::FOLLOWING)) {
try_add_remaining_results(stream_idx, prev_partition_stream_idx);
}
_window_tuples.clear();
// Re-initialize _curr_tuple.
VLOG_ROW << id() << " Reset curr_tuple";
// Call finalize to release resources; result is not needed but the dst tuple must be
// a tuple described by _result_tuple_desc.
AggFnEvaluator::finalize(_evaluators, _fn_ctxs, _curr_tuple, _dummy_result_tuple);
_curr_tuple->init(_intermediate_tuple_desc->byte_size());
AggFnEvaluator::init(_evaluators, _fn_ctxs, _curr_tuple);
// Add a result tuple containing values set by Init() (e.g. NULL for sum(), 0 for
// count()) for output rows that have no input rows in the window. We need to add this
// result tuple before any input rows are consumed and the evaluators are updated.
if (_fn_scope == ROWS && _window.__isset.window_end &&
_window.window_end.type == TAnalyticWindowBoundaryType::PRECEDING) {
if (_has_first_val_null_offset) {
// Special handling for FIRST_VALUE which has the window rewritten in the FE
// in order to evaluate the fn efficiently with a trivial agg fn implementation.
// This occurs when the original analytic window has a start bound X PRECEDING. In
// that case, the window is rewritten to have an end bound X PRECEDING which would
// normally mean we add the newly Init()'d result tuple X rows down (so that those
// first rows have the initial value because they have no rows in their windows).
// However, the original query did not actually have X PRECEDING so we need to do
// one of the following:
// 1) Do not insert the initial result tuple at all, indicated by
// _first_val_null_offset == -1. This happens when the original end bound was
// actually CURRENT ROW or Y FOLLOWING.
// 2) Insert the initial result tuple at _first_val_null_offset. This happens when
// the end bound was actually Y PRECEDING.
if (_first_val_null_offset != -1) {
add_result_tuple(_curr_partition_idx + _first_val_null_offset - 1);
}
} else {
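// With an end bound Y PRECEDING, _rows_end_offset is -Y, so the init'd
// result tuple is placed at index _curr_partition_idx + Y - 1 and the first
// Y rows of the partition are returned with the default values.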
add_result_tuple(_curr_partition_idx - _rows_end_offset - 1);
}
}
}
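// Evaluates 'pred_ctx' over _child_tuple_cmp_row (tuple 0 = previous input
// row, tuple 1 = current row) and returns true when the two rows are equal
// over the predicate's exprs.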
inline bool AnalyticEvalNode::prev_row_compare(ExprContext* pred_ctx) {
DCHECK(pred_ctx != NULL);
doris_udf::BooleanVal result = pred_ctx->get_boolean_val(_child_tuple_cmp_row);
DCHECK(!result.is_null);
return result.val;
}
Status AnalyticEvalNode::process_child_batches(RuntimeState* state) {
// Consume child batches until eos or there are enough rows to return more than an
// output batch. Ensuring there is at least one more row left after returning results
// allows us to simplify the logic dealing with _last_result_idx and _result_tuples.
while (_curr_child_batch.get() != NULL &&
num_output_rows_ready() < state->batch_size() + 1) {
RETURN_IF_CANCELLED(state);
//RETURN_IF_ERROR(QueryMaintenance(state));
RETURN_IF_ERROR(process_child_batch(state));
// TODO: DCHECK that the size of _result_tuples is bounded. It shouldn't be larger
// than 2x the batch size unless the end bound has an offset preceding, in which
// case it may be slightly larger (proportional to the offset but still bounded).
if (_input_eos) {
// Already processed the last child batch. Clean up and break.
_curr_child_batch.reset();
_prev_child_batch.reset();
break;
}
_prev_child_batch->reset();
_prev_child_batch.swap(_curr_child_batch);
RETURN_IF_ERROR(child(0)->get_next(state, _curr_child_batch.get(), &_input_eos));
}
return Status::OK;
}
Status AnalyticEvalNode::process_child_batch(RuntimeState* state) {
// TODO: DCHECK input is sorted (even just first row vs _prev_input_row)
VLOG_FILE << id() << " process_child_batch: " << debug_state_string(false)
<< " input batch size:" << _curr_child_batch->num_rows()
<< " tuple pool size:" << _curr_tuple_pool->total_allocated_bytes();
SCOPED_TIMER(_evaluation_timer);
// BufferedTupleStream::num_rows() returns the total number of rows that have been
// inserted into the stream (it does not decrease when we read rows), so the index of
// the next input row that will be inserted will be the current size of the stream.
int64_t stream_idx = _input_stream->num_rows();
// Stores the stream_idx of the row that was last inserted into _window_tuples.
int64_t last_window_tuple_idx = -1;
for (int i = 0; i < _curr_child_batch->num_rows(); ++i, ++stream_idx) {
TupleRow* row = _curr_child_batch->get_row(i);
_child_tuple_cmp_row->set_tuple(0, _prev_input_row->get_tuple(0));
_child_tuple_cmp_row->set_tuple(1, row->get_tuple(0));
try_remove_rows_before_window(stream_idx);
// Every row is compared against the previous row to determine if (a) the row
// starts a new partition or (b) the row does not share the same values for the
// ordering exprs. When either of these occurs, the _evaluators are finalized and
// the result tuple is added to _result_tuples so that it may be added to output
// rows in get_next_output_batch(). When a new partition is found (a), a new, empty
// result tuple is created and initialized over the _evaluators. If the row has
// different values for the ordering exprs (b), then a new tuple is created but
// copied from _curr_tuple because the original is used for one or more previous
// row(s) but the incremental state still applies to the current row.
bool next_partition = false;
if (_partition_by_eq_expr_ctx != NULL) {
// _partition_by_eq_expr_ctx checks equality over the predicate exprs
next_partition = !prev_row_compare(_partition_by_eq_expr_ctx);
}
try_add_result_tuple_for_prev_row(next_partition, stream_idx, row);
if (next_partition) {
init_next_partition(stream_idx);
}
// The _evaluators are updated with the current row.
if (_fn_scope != ROWS || !_window.__isset.window_start ||
stream_idx - _rows_start_offset >= _curr_partition_idx) {
VLOG_ROW << id() << " Update idx=" << stream_idx;
AggFnEvaluator::add(_evaluators, _fn_ctxs, row, _curr_tuple);
if (_window.__isset.window_start) {
VLOG_ROW << id() << " Adding tuple to window at idx=" << stream_idx;
Tuple* tuple = row->get_tuple(0)->deep_copy(*_child_tuple_desc,
_curr_tuple_pool.get());
_window_tuples.push_back(std::pair<int64_t, Tuple*>(stream_idx, tuple));
last_window_tuple_idx = stream_idx;
}
}
try_add_result_tuple_for_curr_row(stream_idx, row);
// Buffer the entire input row to be returned later with the analytic eval results.
if (UNLIKELY(!_input_stream->add_row(row))) {
// AddRow returns false if an error occurs (available via status()) or there is
// not enough memory (status() is OK). If there isn't enough memory, we unpin
// the stream and continue writing/reading in unpinned mode.
// TODO: Consider re-pinning later if the output stream is fully consumed.
RETURN_IF_ERROR(_input_stream->status());
// RETURN_IF_ERROR(_input_stream->UnpinStream());
VLOG_FILE << id() << " Unpin input stream while adding row idx=" << stream_idx;
if (!_input_stream->add_row(row)) {
// Rows should be added in unpinned mode unless an error occurs.
RETURN_IF_ERROR(_input_stream->status());
DCHECK(false);
}
}
_prev_input_row = row;
}
// We need to add the results for the last row(s).
if (_input_eos) {
try_add_remaining_results(stream_idx, _curr_partition_idx);
}
// Transfer resources to _prev_tuple_pool when enough resources have accumulated
// and the _prev_tuple_pool has already been transferred to an output batch.
if (_curr_tuple_pool->total_allocated_bytes() > MAX_TUPLE_POOL_SIZE &&
(_prev_pool_last_result_idx == -1 || _prev_pool_last_window_idx == -1)) {
_prev_tuple_pool->acquire_data(_curr_tuple_pool.get(), false);
_prev_pool_last_result_idx = _last_result_idx;
_prev_pool_last_window_idx = last_window_tuple_idx;
VLOG_FILE << id() << " Transfer resources from curr to prev pool at idx: "
<< stream_idx << ", stores tuples with last result idx: "
<< _prev_pool_last_result_idx << " last window idx: "
<< _prev_pool_last_window_idx;
}
return Status::OK;
}
Status AnalyticEvalNode::get_next_output_batch(RuntimeState* state, RowBatch* output_batch,
bool* eos) {
SCOPED_TIMER(_evaluation_timer);
VLOG_FILE << id() << " get_next_output_batch: " << debug_state_string(false)
<< " tuple pool size:" << _curr_tuple_pool->total_allocated_bytes();
if (_input_stream->rows_returned() == _input_stream->num_rows()) {
*eos = true;
return Status::OK;
}
const int num_child_tuples = child(0)->row_desc().tuple_descriptors().size();
ExprContext** ctxs = &_conjunct_ctxs[0];
int num_ctxs = _conjunct_ctxs.size();
RowBatch input_batch(child(0)->row_desc(), output_batch->capacity(), mem_tracker());
int64_t stream_idx = _input_stream->rows_returned();
RETURN_IF_ERROR(_input_stream->get_next(&input_batch, eos));
for (int i = 0; i < input_batch.num_rows(); ++i) {
if (reached_limit()) {
break;
}
DCHECK(!output_batch->is_full());
DCHECK(!_result_tuples.empty());
VLOG_ROW << id() << " Output row idx=" << stream_idx << " " << debug_state_string(true);
// CopyRow works as expected: input_batch tuples form a prefix of output_batch
// tuples.
TupleRow* dest = output_batch->get_row(output_batch->add_row());
input_batch.copy_row(input_batch.get_row(i), dest);
dest->set_tuple(num_child_tuples, _result_tuples.front().second);
if (ExecNode::eval_conjuncts(ctxs, num_ctxs, dest)) {
output_batch->commit_last_row();
++_num_rows_returned;
}
// Remove the head of _result_tuples if all rows using that evaluated tuple
// have been returned.
DCHECK_LE(stream_idx, _result_tuples.front().first);
if (stream_idx >= _result_tuples.front().first) {
_result_tuples.pop_front();
}
++stream_idx;
}
input_batch.transfer_resource_ownership(output_batch);
if (reached_limit()) {
*eos = true;
}
return Status::OK;
}
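// Returns the number of rows ready to be returned: rows of _input_stream not
// yet returned that already have a result tuple in _result_tuples (capped so
// that result tuples added ahead of the consumed input do not overcount).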
inline int64_t AnalyticEvalNode::num_output_rows_ready() const {
if (_result_tuples.empty()) {
return 0;
}
int64_t rows_to_return = _last_result_idx - _input_stream->rows_returned();
if (_last_result_idx > _input_stream->num_rows()) {
// This happens when we were able to add a result tuple before consuming child rows,
// e.g. initializing a new partition with an end bound that is X preceding. The first
// X rows get the default value and we add that tuple to _result_tuples before
// consuming child rows. It's possible the result is negative, and that's fine
// because this result is only used to determine if the number of rows to return
// is at least as big as the batch size.
rows_to_return -= _last_result_idx - _input_stream->num_rows();
} else {
DCHECK_GE(rows_to_return, 0);
}
return rows_to_return;
}
Status AnalyticEvalNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) {
SCOPED_TIMER(_runtime_profile->total_time_counter());
RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT));
RETURN_IF_CANCELLED(state);
//RETURN_IF_ERROR(QueryMaintenance(state));
VLOG_FILE << id() << " GetNext: " << debug_state_string(false);
if (reached_limit()) {
*eos = true;
return Status::OK;
} else {
*eos = false;
}
RETURN_IF_ERROR(process_child_batches(state));
bool output_eos = false;
RETURN_IF_ERROR(get_next_output_batch(state, row_batch, &output_eos));
if (_curr_child_batch.get() == NULL && output_eos) {
*eos = true;
}
// Transfer resources to the output row batch if enough have accumulated and they're
// no longer needed by output rows to be returned later.
if (_prev_pool_last_result_idx != -1 &&
_prev_pool_last_result_idx < _input_stream->rows_returned() &&
_prev_pool_last_window_idx < _window_tuples.front().first) {
VLOG_FILE << id() << " Transfer prev pool to output batch, "
<< " pool size: " << _prev_tuple_pool->total_allocated_bytes()
<< " last result idx: " << _prev_pool_last_result_idx
<< " last window idx: " << _prev_pool_last_window_idx;
row_batch->tuple_data_pool()->acquire_data(_prev_tuple_pool.get(), !*eos);
_prev_pool_last_result_idx = -1;
_prev_pool_last_window_idx = -1;
}
COUNTER_SET(_rows_returned_counter, _num_rows_returned);
return Status::OK;
}
Status AnalyticEvalNode::close(RuntimeState* state) {
if (is_closed()) {
return Status::OK;
}
if (_input_stream.get() != NULL) {
_input_stream->close();
}
// Close all evaluators and fn ctxs. If an error occurred in Init or Prepare there may
// be fewer ctxs than evaluators. We also need to Finalize if _curr_tuple was created
// in Open.
DCHECK_LE(_fn_ctxs.size(), _evaluators.size());
DCHECK(_curr_tuple == NULL || _fn_ctxs.size() == _evaluators.size());
for (int i = 0; i < _evaluators.size(); ++i) {
// Need to make sure finalize is called in case there is any state to clean up.
if (_curr_tuple != NULL) {
_evaluators[i]->finalize(_fn_ctxs[i], _curr_tuple, _dummy_result_tuple);
}
_evaluators[i]->close(state);
}
for (int i = 0; i < _fn_ctxs.size(); ++i) {
_fn_ctxs[i]->impl()->close();
}
if (_partition_by_eq_expr_ctx != NULL) {
_partition_by_eq_expr_ctx->close(state);
}
if (_order_by_eq_expr_ctx != NULL) {
_order_by_eq_expr_ctx->close(state);
}
if (_prev_child_batch.get() != NULL) {
_prev_child_batch.reset();
}
if (_curr_child_batch.get() != NULL) {
_curr_child_batch.reset();
}
if (_curr_tuple_pool.get() != NULL) {
_curr_tuple_pool->free_all();
}
if (_prev_tuple_pool.get() != NULL) {
_prev_tuple_pool->free_all();
}
if (_mem_pool.get() != NULL) {
_mem_pool->free_all();
}
ExecNode::close(state);
return Status::OK;
}
void AnalyticEvalNode::debug_string(int indentation_level, stringstream* out) const {
*out << string(indentation_level * 2, ' ');
*out << "AnalyticEvalNode("
<< " window=" << debug_window_string();
if (_partition_by_eq_expr_ctx != NULL) {
// *out << " partition_exprs=" << _partition_by_eq_expr_ctx->debug_string();
}
if (_order_by_eq_expr_ctx != NULL) {
// *out << " order_by_exprs=" << _order_by_eq_expr_ctx->debug_string();
}
*out << AggFnEvaluator::debug_string(_evaluators);
ExecNode::debug_string(indentation_level, out);
*out << ")";
}
//Status AnalyticEvalNode::QueryMaintenance(RuntimeState* state) {
// for (int i = 0; i < evaluators_.size(); ++i) {
// Expr::FreeLocalAllocations(evaluators_[i]->input_expr_ctxs());
// }
// return ExecNode::QueryMaintenance(state);
//}
} // namespace doris