// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "vec/exec/vaggregation_node.h"

#include <algorithm>
#include <functional>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <variant>
#include <vector>

#include "common/status.h"
#include "exec/exec_node.h"
#include "runtime/block_spill_manager.h"
#include "runtime/define_primitive_type.h"
#include "runtime/descriptors.h"
#include "runtime/memory/mem_tracker.h"
#include "runtime/primitive_type.h"
#include "runtime/runtime_state.h"
#include "runtime/thread_context.h"
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/common/hash_table/hash.h"
#include "vec/common/hash_table/hash_map_context_creator.h"
#include "vec/common/hash_table/partitioned_hash_map.h"
#include "vec/common/hash_table/string_hash_table.h"
#include "vec/common/string_buffer.hpp"
#include "vec/core/block.h"
#include "vec/core/columns_with_type_and_name.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_string.h"
#include "vec/exprs/vexpr.h"
#include "vec/exprs/vexpr_context.h"
#include "vec/utils/util.hpp"

namespace doris {
class ObjectPool;
} // namespace doris

namespace doris::vectorized {

/// The minimum reduction factor (input rows divided by output rows) to grow hash tables
/// in a streaming preaggregation, given that the hash tables are currently the given
/// size or above. The sizes roughly correspond to hash table sizes where the bucket
/// arrays will fit in a cache level. Intuitively, we don't want the working set of the
/// aggregation to expand to the next level of cache unless we're reducing the input
/// enough to outweigh the increased memory latency we'll incur for each hash table
/// lookup.
///
/// Note that the current reduction achieved is not always a good estimate of the
/// final reduction. It may be biased either way depending on the ordering of the
/// input. If the input order is random, we will underestimate the final reduction
/// factor because the probability of a row having the same key as a previous row
/// increases as more input is processed. If the input order is correlated with the
/// key, skew may bias the estimate. If high-cardinality keys appear first, we may
/// overestimate, and if low-cardinality keys appear first, we underestimate.
/// To estimate the eventual reduction achieved, we estimate the final reduction
/// using the planner's estimated input cardinality and the assumption that the input
/// is in a random order. This means that we assume that the reduction factor will
/// increase over time.
struct StreamingHtMinReductionEntry {
    // Use 'streaming_ht_min_reduction' if the total size of the hash table bucket
    // directories in bytes is greater than this threshold.
    int min_ht_mem;
    // The minimum reduction factor to expand the hash tables.
    double streaming_ht_min_reduction;
};

// TODO: experimentally tune these values and also programmatically get the cache size
// of the machine that we're running on.
static constexpr StreamingHtMinReductionEntry STREAMING_HT_MIN_REDUCTION[] = {
        // Expand up to L2 cache always.
        {0, 0.0},
        // Expand into L3 cache if we look like we're getting some reduction.
        // At present, the L2 cache is generally 1MB or more.
        {1024 * 1024, 1.1},
        // Expand into main memory if we're getting a significant reduction.
        // The L3 cache is generally 16MB or more.
        {16 * 1024 * 1024, 2.0},
};

static constexpr int STREAMING_HT_MIN_REDUCTION_SIZE =
        sizeof(STREAMING_HT_MIN_REDUCTION) / sizeof(STREAMING_HT_MIN_REDUCTION[0]);
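// Example of how this table is consulted (see _should_expand_preagg_hash_tables()
// below): with hash table bucket directories occupying 2MB, the scan stops at the
// {1024 * 1024, 1.1} entry, so the preagg keeps expanding its hash table only while
// the observed reduction factor (aggregated input rows / hash table rows) stays
// above 1.1; once the buckets reach 16MB, a reduction above 2.0 is required before
// expanding further into main memory.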
AggregationNode::AggregationNode(ObjectPool* pool, const TPlanNode& tnode,
                                 const DescriptorTbl& descs)
        : ExecNode(pool, tnode, descs),
          _intermediate_tuple_id(tnode.agg_node.intermediate_tuple_id),
          _output_tuple_id(tnode.agg_node.output_tuple_id),
          _needs_finalize(tnode.agg_node.need_finalize),
          _is_merge(false) {
    if (tnode.agg_node.__isset.use_streaming_preaggregation) {
        _is_streaming_preagg = tnode.agg_node.use_streaming_preaggregation;
        if (_is_streaming_preagg) {
            DCHECK(!tnode.agg_node.grouping_exprs.empty()) << "Streaming preaggs do grouping";
            DCHECK(_limit == -1) << "Preaggs have no limits";
        }
    } else {
        _is_streaming_preagg = false;
    }
    _is_first_phase = tnode.agg_node.__isset.is_first_phase && tnode.agg_node.is_first_phase;
    _agg_data = std::make_unique<AggregatedDataVariants>();
    _agg_arena_pool = std::make_unique<Arena>();
}

AggregationNode::~AggregationNode() = default;

Status AggregationNode::init(const TPlanNode& tnode, RuntimeState* state) {
    RETURN_IF_ERROR(ExecNode::init(tnode, state));
    // ExecNode::init() currently ignores some return statuses; this needs to be revisited.
    RETURN_IF_ERROR(VExpr::create_expr_trees(tnode.agg_node.grouping_exprs, _probe_expr_ctxs));

    // init aggregate functions
    _aggregate_evaluators.reserve(tnode.agg_node.aggregate_functions.size());

    // For queries like:
    //   select * from (select GoodEvent from hits union select CounterID from hits) as h limit 10;
    // i.e. a union with only a limit, the pipeline exec engine can short-circuit the query.
    _can_short_circuit =
            tnode.agg_node.aggregate_functions.empty() && state->enable_pipeline_exec();

    TSortInfo dummy;
    for (int i = 0; i < tnode.agg_node.aggregate_functions.size(); ++i) {
        AggFnEvaluator* evaluator = nullptr;
        RETURN_IF_ERROR(AggFnEvaluator::create(
                _pool, tnode.agg_node.aggregate_functions[i],
                tnode.agg_node.__isset.agg_sort_infos ? tnode.agg_node.agg_sort_infos[i] : dummy,
                &evaluator));
        _aggregate_evaluators.push_back(evaluator);
    }

    const auto& agg_functions = tnode.agg_node.aggregate_functions;
    _external_agg_bytes_threshold = state->external_agg_bytes_threshold();

    if (_external_agg_bytes_threshold > 0) {
        size_t spill_partition_count_bits = 4;
        if (state->query_options().__isset.external_agg_partition_bits) {
            spill_partition_count_bits = state->query_options().external_agg_partition_bits;
        }
        _spill_partition_helper =
                std::make_unique<SpillPartitionHelper>(spill_partition_count_bits);
    }

    _is_merge = std::any_of(agg_functions.cbegin(), agg_functions.cend(),
                            [](const auto& e) { return e.nodes[0].agg_expr.is_merge_agg; });
    return Status::OK();
}
void AggregationNode::_init_hash_method(const VExprContextSPtrs& probe_exprs) {
    DCHECK(probe_exprs.size() >= 1);

    using Type = AggregatedDataVariants::Type;
    Type t(Type::serialized);

    if (probe_exprs.size() == 1) {
        auto is_nullable = probe_exprs[0]->root()->is_nullable();
        PrimitiveType type = probe_exprs[0]->root()->result_type();
        switch (type) {
        case TYPE_TINYINT:
        case TYPE_BOOLEAN:
        case TYPE_SMALLINT:
        case TYPE_INT:
        case TYPE_FLOAT:
        case TYPE_DATEV2:
        case TYPE_BIGINT:
        case TYPE_DOUBLE:
        case TYPE_DATE:
        case TYPE_DATETIME:
        case TYPE_DATETIMEV2:
        case TYPE_LARGEINT:
        case TYPE_DECIMALV2:
        case TYPE_DECIMAL32:
        case TYPE_DECIMAL64:
        case TYPE_DECIMAL128I: {
            size_t size = get_primitive_type_size(type);
            if (size == 1) {
                t = Type::int8_key;
            } else if (size == 2) {
                t = Type::int16_key;
            } else if (size == 4) {
                t = Type::int32_key;
            } else if (size == 8) {
                t = Type::int64_key;
            } else if (size == 16) {
                t = Type::int128_key;
            } else {
                throw Exception(ErrorCode::INTERNAL_ERROR,
                                "meet invalid type size, size={}, type={}", size,
                                type_to_string(type));
            }
            break;
        }
        case TYPE_CHAR:
        case TYPE_VARCHAR:
        case TYPE_STRING: {
            t = Type::string_key;
            break;
        }
        default:
            t = Type::serialized;
        }
        _agg_data->init(get_hash_key_type_with_phase(t, !_is_first_phase), is_nullable);
    } else {
        if (!try_get_hash_map_context_fixed<PHNormalHashMap, HashCRC32, AggregateDataPtr>(
                    _agg_data->method_variant, probe_exprs)) {
            _agg_data->init(Type::serialized);
        }
    }
}
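// Example of the key-type dispatch above: a single group-by key of TYPE_INT
// (4 bytes) selects Type::int32_key, so the hash table hashes the raw fixed-width
// value directly; a single TYPE_VARCHAR key selects Type::string_key; and multiple
// keys are packed into a fixed-width combined key when possible, falling back to
// Type::serialized otherwise.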
Status AggregationNode::prepare_profile(RuntimeState* state) {
    _memory_usage_counter = ADD_LABEL_COUNTER(runtime_profile(), "MemoryUsage");
    _hash_table_memory_usage =
            ADD_CHILD_COUNTER(runtime_profile(), "HashTable", TUnit::BYTES, "MemoryUsage");
    _serialize_key_arena_memory_usage = runtime_profile()->AddHighWaterMarkCounter(
            "SerializeKeyArena", TUnit::BYTES, "MemoryUsage");

    _build_table_convert_timer = ADD_TIMER(runtime_profile(), "BuildConvertToPartitionedTime");
    _serialize_key_timer = ADD_TIMER(runtime_profile(), "SerializeKeyTime");
    _merge_timer = ADD_TIMER(runtime_profile(), "MergeTime");
    _expr_timer = ADD_TIMER(runtime_profile(), "ExprTime");
    _get_results_timer = ADD_TIMER(runtime_profile(), "GetResultsTime");
    _serialize_data_timer = ADD_TIMER(runtime_profile(), "SerializeDataTime");
    _serialize_result_timer = ADD_TIMER(runtime_profile(), "SerializeResultTime");
    _deserialize_data_timer = ADD_TIMER(runtime_profile(), "DeserializeAndMergeTime");
    _hash_table_compute_timer = ADD_TIMER(runtime_profile(), "HashTableComputeTime");
    _hash_table_emplace_timer = ADD_TIMER(runtime_profile(), "HashTableEmplaceTime");
    _hash_table_iterate_timer = ADD_TIMER(runtime_profile(), "HashTableIterateTime");
    _insert_keys_to_column_timer = ADD_TIMER(runtime_profile(), "InsertKeysToColumnTime");
    _streaming_agg_timer = ADD_TIMER(runtime_profile(), "StreamingAggTime");
    _hash_table_size_counter = ADD_COUNTER(runtime_profile(), "HashTableSize", TUnit::UNIT);
    _hash_table_input_counter = ADD_COUNTER(runtime_profile(), "HashTableInputCount", TUnit::UNIT);
    _max_row_size_counter = ADD_COUNTER(runtime_profile(), "MaxRowSizeInBytes", TUnit::UNIT);
    COUNTER_SET(_max_row_size_counter, (int64_t)0);

    _intermediate_tuple_desc = state->desc_tbl().get_tuple_descriptor(_intermediate_tuple_id);
    _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id);
    DCHECK_EQ(_intermediate_tuple_desc->slots().size(), _output_tuple_desc->slots().size());
    RETURN_IF_ERROR(VExpr::prepare(_probe_expr_ctxs, state, child(0)->row_desc()));

    _agg_profile_arena = std::make_unique<Arena>();

    int j = _probe_expr_ctxs.size();
    for (int i = 0; i < j; ++i) {
        auto nullable_output = _output_tuple_desc->slots()[i]->is_nullable();
        auto nullable_input = _probe_expr_ctxs[i]->root()->is_nullable();
        if (nullable_output != nullable_input) {
            DCHECK(nullable_output);
            _make_nullable_keys.emplace_back(i);
        }
    }
    for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++j) {
        SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[j];
        SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[j];
        RETURN_IF_ERROR(_aggregate_evaluators[i]->prepare(
                state, child(0)->row_desc(), intermediate_slot_desc, output_slot_desc));
    }

    // set profile timer to evaluators
    for (auto& evaluator : _aggregate_evaluators) {
        evaluator->set_timer(_merge_timer, _expr_timer);
    }

    _offsets_of_aggregate_states.resize(_aggregate_evaluators.size());
    for (size_t i = 0; i < _aggregate_evaluators.size(); ++i) {
        _offsets_of_aggregate_states[i] = _total_size_of_aggregate_states;

        const auto& agg_function = _aggregate_evaluators[i]->function();
        // aggregate states are aligned based on the maximum requirement
        _align_aggregate_states = std::max(_align_aggregate_states, agg_function->align_of_data());
        _total_size_of_aggregate_states += agg_function->size_of_data();

        // If this is not the last aggregate state, pad it so that the next state is aligned.
        if (i + 1 < _aggregate_evaluators.size()) {
            size_t alignment_of_next_state =
                    _aggregate_evaluators[i + 1]->function()->align_of_data();
            if ((alignment_of_next_state & (alignment_of_next_state - 1)) != 0) {
                return Status::RuntimeError("Logical error: align_of_data is not 2^N");
            }

            /// Extend total_size to the next alignment requirement: round
            /// '_total_size_of_aggregate_states' up to a multiple of 'alignment_of_next_state'.
            _total_size_of_aggregate_states =
                    (_total_size_of_aggregate_states + alignment_of_next_state - 1) /
                    alignment_of_next_state * alignment_of_next_state;
        }
    }
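    // Worked example of the layout above (an assumed two-evaluator case): with a
    // first state of size 12 and a second state aligned to 16, the first state sits
    // at offset 0, and (12 + 16 - 1) / 16 * 16 = 16 pads the second state to offset
    // 16, so every per-group state block can be laid out contiguously and addressed
    // as base pointer + _offsets_of_aggregate_states[i].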
    if (_probe_expr_ctxs.empty()) {
        _agg_data->init(AggregatedDataVariants::Type::without_key);

        _agg_data->without_key = reinterpret_cast<AggregateDataPtr>(
                _agg_profile_arena->alloc(_total_size_of_aggregate_states));

        if (_is_merge) {
            _executor.execute =
                    std::bind(&AggregationNode::_merge_without_key, this, std::placeholders::_1);
        } else {
            _executor.execute =
                    std::bind(&AggregationNode::_execute_without_key, this, std::placeholders::_1);
        }

        if (_needs_finalize) {
            _executor.get_result = std::bind(&AggregationNode::_get_without_key_result, this,
                                             std::placeholders::_1, std::placeholders::_2,
                                             std::placeholders::_3);
        } else {
            _executor.get_result = std::bind(&AggregationNode::_serialize_without_key, this,
                                             std::placeholders::_1, std::placeholders::_2,
                                             std::placeholders::_3);
        }

        _executor.update_memusage = std::bind(&AggregationNode::_update_memusage_without_key, this);
        _executor.close = std::bind(&AggregationNode::_close_without_key, this);
    } else {
        _init_hash_method(_probe_expr_ctxs);

        std::visit(
                [&](auto&& agg_method) {
                    using HashTableType = std::decay_t<decltype(agg_method)>;
                    using KeyType = typename HashTableType::Key;

                    /// some aggregate functions (like AVG for decimal) have align issues.
                    _aggregate_data_container.reset(new AggregateDataContainer(
                            sizeof(KeyType),
                            ((_total_size_of_aggregate_states + _align_aggregate_states - 1) /
                             _align_aggregate_states) *
                                    _align_aggregate_states));
                },
                _agg_data->method_variant);

        if (_is_merge) {
            _executor.execute = std::bind(&AggregationNode::_merge_with_serialized_key, this,
                                          std::placeholders::_1);
        } else {
            _executor.execute = std::bind(&AggregationNode::_execute_with_serialized_key, this,
                                          std::placeholders::_1);
        }

        if (_is_streaming_preagg) {
            _executor.pre_agg = std::bind(&AggregationNode::_pre_agg_with_serialized_key, this,
                                          std::placeholders::_1, std::placeholders::_2);
        }

        if (_needs_finalize) {
            _executor.get_result = std::bind(
                    &AggregationNode::_get_with_serialized_key_result, this,
                    std::placeholders::_1, std::placeholders::_2, std::placeholders::_3);
        } else {
            _executor.get_result = std::bind(
                    &AggregationNode::_serialize_with_serialized_key_result, this,
                    std::placeholders::_1, std::placeholders::_2, std::placeholders::_3);
        }

        _executor.update_memusage =
                std::bind(&AggregationNode::_update_memusage_with_serialized_key, this);
        _executor.close = std::bind(&AggregationNode::_close_with_serialized_key, this);

        _should_limit_output = _limit != -1 &&       // has limit
                               _conjuncts.empty() && // no having conjunct
                               _needs_finalize;      // agg's finalize step
    }

    fmt::memory_buffer msg;
    fmt::format_to(msg,
                   "(_is_merge: {}, _needs_finalize: {}, Streaming Preaggregation: {}, agg size: "
                   "{}, limit: {})",
                   _is_merge ? "true" : "false", _needs_finalize ? "true" : "false",
                   _is_streaming_preagg ? "true" : "false",
                   std::to_string(_aggregate_evaluators.size()), std::to_string(_limit));
    runtime_profile()->add_info_string("AggInfos", fmt::to_string(msg));
    return Status::OK();
}
"true" : "false", std::to_string(_aggregate_evaluators.size()), std::to_string(_limit)); runtime_profile()->add_info_string("AggInfos", fmt::to_string(msg)); return Status::OK(); } Status AggregationNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); SCOPED_TIMER(_exec_timer); RETURN_IF_ERROR(prepare_profile(state)); return Status::OK(); } Status AggregationNode::alloc_resource(doris::RuntimeState* state) { SCOPED_TIMER(_exec_timer); RETURN_IF_ERROR(ExecNode::alloc_resource(state)); RETURN_IF_ERROR(VExpr::open(_probe_expr_ctxs, state)); for (int i = 0; i < _aggregate_evaluators.size(); ++i) { RETURN_IF_ERROR(_aggregate_evaluators[i]->open(state)); _aggregate_evaluators[i]->set_version(state->be_exec_version()); } // move _create_agg_status to open not in during prepare, // because during prepare and open thread is not the same one, // this could cause unable to get JVM if (_probe_expr_ctxs.empty()) { // _create_agg_status may acquire a lot of memory, may allocate failed when memory is very few RETURN_IF_CATCH_EXCEPTION(static_cast(_create_agg_status(_agg_data->without_key))); _agg_data_created_without_key = true; } return Status::OK(); } Status AggregationNode::open(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::open(state)); RETURN_IF_ERROR(_children[0]->open(state)); // Streaming preaggregations do all processing in GetNext(). if (_is_streaming_preagg) { return Status::OK(); } bool eos = false; Block block; while (!eos) { RETURN_IF_CANCELLED(state); release_block_memory(block); RETURN_IF_ERROR(_children[0]->get_next_after_projects( state, &block, &eos, std::bind((Status(ExecNode::*)(RuntimeState*, vectorized::Block*, bool*)) & ExecNode::get_next, _children[0], std::placeholders::_1, std::placeholders::_2, std::placeholders::_3))); RETURN_IF_ERROR(sink(state, &block, eos)); } static_cast(_children[0]->close(state)); return Status::OK(); } Status AggregationNode::do_pre_agg(vectorized::Block* input_block, vectorized::Block* output_block) { SCOPED_TIMER(_exec_timer); RETURN_IF_ERROR(_executor.pre_agg(input_block, output_block)); // pre stream agg need use _num_row_return to decide whether to do pre stream agg _num_rows_returned += output_block->rows(); _make_nullable_output_key(output_block); COUNTER_SET(_rows_returned_counter, _num_rows_returned); _executor.update_memusage(); return Status::OK(); } Status AggregationNode::get_next(RuntimeState* state, Block* block, bool* eos) { SCOPED_TIMER(_runtime_profile->total_time_counter()); if (_is_streaming_preagg) { RETURN_IF_CANCELLED(state); release_block_memory(_preagg_block); while (_preagg_block.rows() == 0 && !_child_eos) { RETURN_IF_ERROR(_children[0]->get_next_after_projects( state, &_preagg_block, &_child_eos, std::bind((Status(ExecNode::*)(RuntimeState*, vectorized::Block*, bool*)) & ExecNode::get_next, _children[0], std::placeholders::_1, std::placeholders::_2, std::placeholders::_3))); }; { if (_preagg_block.rows() != 0) { RETURN_IF_ERROR(do_pre_agg(&_preagg_block, block)); } else { RETURN_IF_ERROR(pull(state, block, eos)); } } } else { RETURN_IF_ERROR(pull(state, block, eos)); } return Status::OK(); } Status AggregationNode::pull(doris::RuntimeState* state, vectorized::Block* block, bool* eos) { SCOPED_TIMER(_exec_timer); RETURN_IF_ERROR(_executor.get_result(state, block, eos)); _make_nullable_output_key(block); // dispose the having clause, should not be execute in prestreaming agg 
Status AggregationNode::pull(doris::RuntimeState* state, vectorized::Block* block, bool* eos) {
    SCOPED_TIMER(_exec_timer);
    RETURN_IF_ERROR(_executor.get_result(state, block, eos));
    _make_nullable_output_key(block);
    // Apply the HAVING conjuncts here; they must not be executed in the streaming pre-agg.
    RETURN_IF_ERROR(VExprContext::filter_block(_conjuncts, block, block->columns()));
    reached_limit(block, eos);

    return Status::OK();
}

Status AggregationNode::sink(doris::RuntimeState* state, vectorized::Block* in_block, bool eos) {
    SCOPED_TIMER(_exec_timer);
    if (in_block->rows() > 0) {
        RETURN_IF_ERROR(_executor.execute(in_block));
        RETURN_IF_ERROR(_try_spill_disk());
        _executor.update_memusage();
    }
    if (eos) {
        if (_spill_context.has_data) {
            static_cast<void>(_try_spill_disk(true));
            RETURN_IF_ERROR(_spill_context.prepare_for_reading());
        }
        _can_read = true;
    }
    return Status::OK();
}

void AggregationNode::release_resource(RuntimeState* state) {
    if (_executor.close) {
        _executor.close();
    }

    /// _hash_table_size_counter may be null if prepare failed.
    if (_hash_table_size_counter) {
        std::visit(
                [&](auto&& agg_method) {
                    COUNTER_SET(_hash_table_size_counter, int64_t(agg_method.hash_table->size()));
                },
                _agg_data->method_variant);
    }
    _release_mem();
    ExecNode::release_resource(state);
}

Status AggregationNode::close(RuntimeState* state) {
    if (is_closed()) {
        return Status::OK();
    }
    return ExecNode::close(state);
}

Status AggregationNode::_create_agg_status(AggregateDataPtr data) {
    for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
        try {
            _aggregate_evaluators[i]->create(data + _offsets_of_aggregate_states[i]);
        } catch (...) {
            for (int j = 0; j < i; ++j) {
                _aggregate_evaluators[j]->destroy(data + _offsets_of_aggregate_states[j]);
            }
            throw;
        }
    }
    return Status::OK();
}

Status AggregationNode::_destroy_agg_status(AggregateDataPtr data) {
    for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
        _aggregate_evaluators[i]->function()->destroy(data + _offsets_of_aggregate_states[i]);
    }
    return Status::OK();
}

Status AggregationNode::_get_without_key_result(RuntimeState* state, Block* block, bool* eos) {
    DCHECK(_agg_data->without_key != nullptr);
    block->clear();

    *block = VectorizedUtils::create_empty_columnswithtypename(_row_descriptor);
    int agg_size = _aggregate_evaluators.size();

    MutableColumns columns(agg_size);
    std::vector<DataTypePtr> data_types(agg_size);
    for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
        data_types[i] = _aggregate_evaluators[i]->function()->get_return_type();
        columns[i] = data_types[i]->create_column();
    }

    for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
        auto* column = columns[i].get();
        _aggregate_evaluators[i]->insert_result_info(
                _agg_data->without_key + _offsets_of_aggregate_states[i], column);
    }

    const auto& block_schema = block->get_columns_with_type_and_name();
    DCHECK_EQ(block_schema.size(), columns.size());
    for (int i = 0; i < block_schema.size(); ++i) {
        const auto column_type = block_schema[i].type;
        if (!column_type->equals(*data_types[i])) {
            if (!is_array(remove_nullable(column_type))) {
                if (!column_type->is_nullable() || data_types[i]->is_nullable() ||
                    !remove_nullable(column_type)->equals(*data_types[i])) {
                    return Status::InternalError(
                            "column_type not match data_types, column_type={}, data_types={}",
                            column_type->get_name(), data_types[i]->get_name());
                }
            }

            if (column_type->is_nullable() && !data_types[i]->is_nullable()) {
                ColumnPtr ptr = std::move(columns[i]);
                // Except for `count`, aggregate functions over an empty set should
                // return NULL, so check whether the child returned any rows.
                ptr = make_nullable(ptr, _children[0]->rows_returned() == 0);
                columns[i] = ptr->assume_mutable();
            }
        }
    }

    block->set_columns(std::move(columns));
    *eos = true;
    return Status::OK();
}
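// Example of the NULL-wrapping above: for `select max(c1) from empty_table` the
// child returns zero rows, so the single result row is made NULL; for
// `select count(c1) from empty_table` the result type is not nullable, that branch
// is never taken, and the non-null 0 is returned as-is.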
Status AggregationNode::_serialize_without_key(RuntimeState* state, Block* block, bool* eos) {
    // 1. `_children[0]->rows_returned() == 0` means there is no data from the child:
    //    a second-level aggregation node should return a NULL result, while a
    //    first-level aggregation node just sets `eos = true` and returns directly.
    SCOPED_TIMER(_serialize_result_timer);
    if (UNLIKELY(_children[0]->rows_returned() == 0)) {
        *eos = true;
        return Status::OK();
    }
    block->clear();

    DCHECK(_agg_data->without_key != nullptr);
    int agg_size = _aggregate_evaluators.size();

    MutableColumns value_columns(agg_size);
    std::vector<DataTypePtr> data_types(agg_size);
    // serialize the aggregate states into string columns
    for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
        data_types[i] = _aggregate_evaluators[i]->function()->get_serialized_type();
        value_columns[i] = _aggregate_evaluators[i]->function()->create_serialize_column();
    }

    for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
        _aggregate_evaluators[i]->function()->serialize_without_key_to_column(
                _agg_data->without_key + _offsets_of_aggregate_states[i], *value_columns[i]);
    }

    {
        ColumnsWithTypeAndName data_with_schema;
        for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
            ColumnWithTypeAndName column_with_schema = {nullptr, data_types[i], ""};
            data_with_schema.push_back(std::move(column_with_schema));
        }
        *block = Block(data_with_schema);
    }

    block->set_columns(std::move(value_columns));
    *eos = true;
    return Status::OK();
}

Status AggregationNode::_execute_without_key(Block* block) {
    DCHECK(_agg_data->without_key != nullptr);
    for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
        RETURN_IF_ERROR(_aggregate_evaluators[i]->execute_single_add(
                block, _agg_data->without_key + _offsets_of_aggregate_states[i],
                _agg_arena_pool.get()));
    }
    return Status::OK();
}

Status AggregationNode::_merge_without_key(Block* block) {
    SCOPED_TIMER(_merge_timer);
    DCHECK(_agg_data->without_key != nullptr);
    for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
        if (_aggregate_evaluators[i]->is_merge()) {
            int col_id = _get_slot_column_id(_aggregate_evaluators[i]);
            auto column = block->get_by_position(col_id).column;
            if (column->is_nullable()) {
                column = ((ColumnNullable*)column.get())->get_nested_column_ptr();
            }

            SCOPED_TIMER(_deserialize_data_timer);
            _aggregate_evaluators[i]->function()->deserialize_and_merge_from_column(
                    _agg_data->without_key + _offsets_of_aggregate_states[i], *column,
                    _agg_arena_pool.get());
        } else {
            RETURN_IF_ERROR(_aggregate_evaluators[i]->execute_single_add(
                    block, _agg_data->without_key + _offsets_of_aggregate_states[i],
                    _agg_arena_pool.get()));
        }
    }
    return Status::OK();
}

void AggregationNode::_update_memusage_without_key() {
    auto arena_memory_usage = _agg_arena_pool->size() - _mem_usage_record.used_in_arena;
    mem_tracker()->consume(arena_memory_usage);
    _serialize_key_arena_memory_usage->add(arena_memory_usage);
    _mem_usage_record.used_in_arena = _agg_arena_pool->size();
}
void AggregationNode::_close_without_key() {
    // prepare() may have failed before the agg data was created, but close() is
    // still called to destroy it; destroying an uninitialized state (e.g. one
    // holding a BitmapValue) would crash, so only destroy what was actually created.
    if (_agg_data_created_without_key) {
        static_cast<void>(_destroy_agg_status(_agg_data->without_key));
        _agg_data_created_without_key = false;
    }
    release_tracker();
}

void AggregationNode::_make_nullable_output_key(Block* block) {
    if (block->rows() != 0) {
        for (auto cid : _make_nullable_keys) {
            block->get_by_position(cid).column = make_nullable(block->get_by_position(cid).column);
            block->get_by_position(cid).type = make_nullable(block->get_by_position(cid).type);
        }
    }
}

bool AggregationNode::_should_expand_preagg_hash_tables() {
    if (!_should_expand_hash_table) {
        return false;
    }

    return std::visit(
            [&](auto&& agg_method) -> bool {
                auto& hash_tbl = *agg_method.hash_table;
                auto [ht_mem, ht_rows] =
                        std::pair {hash_tbl.get_buffer_size_in_bytes(), hash_tbl.size()};

                // Need some rows in tables to have valid statistics.
                if (ht_rows == 0) {
                    return true;
                }

                // Find the appropriate reduction factor in our table for the current hash table sizes.
                int cache_level = 0;
                while (cache_level + 1 < STREAMING_HT_MIN_REDUCTION_SIZE &&
                       ht_mem >= STREAMING_HT_MIN_REDUCTION[cache_level + 1].min_ht_mem) {
                    ++cache_level;
                }

                // Compare the number of rows in the hash table with the number of input rows that
                // were aggregated into it. Exclude passed-through rows from this calculation since
                // they were not in hash tables.
                const int64_t input_rows = _children[0]->rows_returned();
                const int64_t aggregated_input_rows = input_rows - _num_rows_returned;
                // TODO chenhao
                // const int64_t expected_input_rows = estimated_input_cardinality_ - num_rows_returned_;
                double current_reduction = static_cast<double>(aggregated_input_rows) / ht_rows;

                // TODO: workaround for IMPALA-2490: subplan node rows_returned counter may be
                // inaccurate, which could lead to a divide by zero below.
                if (aggregated_input_rows <= 0) {
                    return true;
                }

                // Extrapolate the current reduction factor (r) using the formula
                // R = 1 + (N / n) * (r - 1), where R is the reduction factor over the full input
                // data set, N is the number of input rows, excluding passed-through rows, and n
                // is the number of rows inserted or merged into the hash tables. This is a very
                // rough approximation but is good enough to be useful.
                // TODO: consider collecting more statistics to better estimate reduction.
                // double estimated_reduction = aggregated_input_rows >= expected_input_rows
                //         ? current_reduction
                //         : 1 + (expected_input_rows / aggregated_input_rows) * (current_reduction - 1);
                double min_reduction =
                        STREAMING_HT_MIN_REDUCTION[cache_level].streaming_ht_min_reduction;

                // COUNTER_SET(preagg_estimated_reduction_, estimated_reduction);
                // COUNTER_SET(preagg_streaming_ht_min_reduction_, min_reduction);
                // return estimated_reduction > min_reduction;
                _should_expand_hash_table = current_reduction > min_reduction;
                return _should_expand_hash_table;
            },
            _agg_data->method_variant);
}
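// Worked example of the extrapolation formula R = 1 + (N / n) * (r - 1) quoted
// above: with N = 1,000,000 expected input rows, n = 10,000 rows merged so far,
// and a current reduction of r = 1.05, the extrapolated full-input reduction is
// 1 + 100 * 0.05 = 6.0. The code currently compares only the raw
// current_reduction against the threshold; the extrapolated estimate remains the
// commented-out TODO path.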
size_t AggregationNode::_memory_usage() const {
    size_t usage = 0;
    if (_agg_arena_pool) {
        usage += _agg_arena_pool->size();
    }

    if (_aggregate_data_container) {
        usage += _aggregate_data_container->memory_usage();
    }

    return usage;
}

Status AggregationNode::_reset_hash_table() {
    return std::visit(
            [&](auto&& agg_method) {
                auto& hash_table = *agg_method.hash_table;
                using HashTableType = std::decay_t<decltype(hash_table)>;

                agg_method.reset();

                hash_table.for_each_mapped([&](auto& mapped) {
                    if (mapped) {
                        static_cast<void>(_destroy_agg_status(mapped));
                        mapped = nullptr;
                    }
                });

                _aggregate_data_container = std::make_unique<AggregateDataContainer>(
                        sizeof(typename HashTableType::key_type),
                        ((_total_size_of_aggregate_states + _align_aggregate_states - 1) /
                         _align_aggregate_states) *
                                _align_aggregate_states);
                agg_method.hash_table.reset(new HashTableType());
                _agg_arena_pool = std::make_unique<Arena>();
                return Status::OK();
            },
            _agg_data->method_variant);
}

size_t AggregationNode::_get_hash_table_size() {
    return std::visit([&](auto&& agg_method) { return agg_method.hash_table->size(); },
                      _agg_data->method_variant);
}

void AggregationNode::_emplace_into_hash_table(AggregateDataPtr* places,
                                               ColumnRawPtrs& key_columns, const size_t num_rows) {
    std::visit(
            [&](auto&& agg_method) -> void {
                SCOPED_TIMER(_hash_table_compute_timer);
                using HashMethodType = std::decay_t<decltype(agg_method)>;
                using AggState = typename HashMethodType::State;
                AggState state(key_columns);
                agg_method.init_serialized_keys(key_columns, num_rows);

                auto creator = [this](const auto& ctor, auto& key, auto& origin) {
                    try {
                        HashMethodType::try_presis_key_and_origin(key, origin, *_agg_arena_pool);
                        auto mapped = _aggregate_data_container->append_data(origin);
                        auto st = _create_agg_status(mapped);
                        if (!st) {
                            throw Exception(st.code(), st.to_string());
                        }
                        ctor(key, mapped);
                    } catch (...) {
                        // Exception safety: if we cannot allocate memory or create the
                        // status, the destructors will not be called.
                        ctor(key, nullptr);
                        throw;
                    }
                };

                auto creator_for_null_key = [this](auto& mapped) {
                    mapped = _agg_arena_pool->aligned_alloc(_total_size_of_aggregate_states,
                                                            _align_aggregate_states);
                    auto st = _create_agg_status(mapped);
                    if (!st) {
                        throw Exception(st.code(), st.to_string());
                    }
                };

                SCOPED_TIMER(_hash_table_emplace_timer);
                for (size_t i = 0; i < num_rows; ++i) {
                    places[i] = agg_method.lazy_emplace(state, i, creator, creator_for_null_key);
                }

                COUNTER_UPDATE(_hash_table_input_counter, num_rows);
            },
            _agg_data->method_variant);
}
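// Note on the emplace path above: lazy_emplace invokes 'creator' only when a key
// is new, so each distinct group allocates and initializes its aggregate states
// exactly once; repeated keys simply get the previously created state pointer
// back in places[i]. On a creation failure the creator records a null mapped
// value via ctor(key, nullptr) before rethrowing, so no half-initialized state
// is ever visited later.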
void AggregationNode::_find_in_hash_table(AggregateDataPtr* places, ColumnRawPtrs& key_columns,
                                          size_t num_rows) {
    std::visit(
            [&](auto&& agg_method) -> void {
                using HashMethodType = std::decay_t<decltype(agg_method)>;
                using AggState = typename HashMethodType::State;
                AggState state(key_columns);
                agg_method.init_serialized_keys(key_columns, num_rows);

                /// For all rows.
                for (size_t i = 0; i < num_rows; ++i) {
                    auto find_result = agg_method.find(state, i);

                    if (find_result.is_found()) {
                        places[i] = find_result.get_mapped();
                    } else {
                        places[i] = nullptr;
                    }
                }
            },
            _agg_data->method_variant);
}

Status AggregationNode::_pre_agg_with_serialized_key(doris::vectorized::Block* in_block,
                                                     doris::vectorized::Block* out_block) {
    DCHECK(!_probe_expr_ctxs.empty());

    size_t key_size = _probe_expr_ctxs.size();
    ColumnRawPtrs key_columns(key_size);
    {
        SCOPED_TIMER(_expr_timer);
        for (size_t i = 0; i < key_size; ++i) {
            int result_column_id = -1;
            RETURN_IF_ERROR(_probe_expr_ctxs[i]->execute(in_block, &result_column_id));
            in_block->get_by_position(result_column_id).column =
                    in_block->get_by_position(result_column_id)
                            .column->convert_to_full_column_if_const();
            key_columns[i] = in_block->get_by_position(result_column_id).column.get();
        }
    }

    int rows = in_block->rows();
    if (_places.size() < rows) {
        _places.resize(rows);
    }

    // Stop expanding hash tables if we're not reducing the input sufficiently. As our
    // hash tables expand out of each level of cache hierarchy, every hash table lookup
    // will take longer. We also may not be able to expand hash tables because of memory
    // pressure. In either case we should always use the remaining space in the hash table
    // to avoid wasting memory. A fixed hash map, however, never needs to expand.
    bool ret_flag = false;
    RETURN_IF_ERROR(std::visit(
            [&](auto&& agg_method) -> Status {
                if (auto& hash_tbl = *agg_method.hash_table;
                    hash_tbl.add_elem_size_overflow(rows)) {
                    /// If too much memory is used during the pre-aggregation stage, it is
                    /// better to output the data directly without further aggregation.
                    const bool used_too_much_memory =
                            (_external_agg_bytes_threshold > 0 &&
                             _memory_usage() > _external_agg_bytes_threshold);
                    // Do not aggregate: serialize the input directly into out_block.
                    if (!_should_expand_preagg_hash_tables() || used_too_much_memory) {
                        SCOPED_TIMER(_streaming_agg_timer);
                        ret_flag = true;

                        // The value data is serialized into string columns; non-nullable
                        // key columns (ids in `_make_nullable_keys`) will be converted
                        // to nullable.
                        bool mem_reuse = _make_nullable_keys.empty() && out_block->mem_reuse();

                        std::vector<DataTypePtr> data_types;
                        MutableColumns value_columns;
                        for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
                            auto data_type =
                                    _aggregate_evaluators[i]->function()->get_serialized_type();
                            if (mem_reuse) {
                                value_columns.emplace_back(
                                        std::move(*out_block->get_by_position(i + key_size).column)
                                                .mutate());
                            } else {
                                // The value slot type should always be string here.
                                value_columns.emplace_back(_aggregate_evaluators[i]
                                                                   ->function()
                                                                   ->create_serialize_column());
                            }
                            data_types.emplace_back(data_type);
                        }

                        for (int i = 0; i != _aggregate_evaluators.size(); ++i) {
                            SCOPED_TIMER(_serialize_data_timer);
                            RETURN_IF_ERROR(
                                    _aggregate_evaluators[i]->streaming_agg_serialize_to_column(
                                            in_block, value_columns[i], rows,
                                            _agg_arena_pool.get()));
                        }

                        if (!mem_reuse) {
                            ColumnsWithTypeAndName columns_with_schema;
                            for (int i = 0; i < key_size; ++i) {
                                columns_with_schema.emplace_back(
                                        key_columns[i]->clone_resized(rows),
                                        _probe_expr_ctxs[i]->root()->data_type(),
                                        _probe_expr_ctxs[i]->root()->expr_name());
                            }
                            for (int i = 0; i < value_columns.size(); ++i) {
                                columns_with_schema.emplace_back(std::move(value_columns[i]),
                                                                 data_types[i], "");
                            }
                            out_block->swap(Block(columns_with_schema));
                        } else {
                            for (int i = 0; i < key_size; ++i) {
                                std::move(*out_block->get_by_position(i).column)
                                        .mutate()
                                        ->insert_range_from(*key_columns[i], 0, rows);
                            }
                        }
                    }
                }
                return Status::OK();
            },
            _agg_data->method_variant));

    if (!ret_flag) {
        RETURN_IF_CATCH_EXCEPTION(_emplace_into_hash_table(_places.data(), key_columns, rows));
        for (int i = 0; i < _aggregate_evaluators.size(); ++i) {
            RETURN_IF_ERROR(_aggregate_evaluators[i]->execute_batch_add(
                    in_block, _offsets_of_aggregate_states[i], _places.data(),
                    _agg_arena_pool.get(), _should_expand_hash_table));
        }
    }

    return Status::OK();
}

template <typename HashTableCtxType, typename HashTableType, typename KeyType>
Status AggregationNode::_serialize_hash_table_to_block(HashTableCtxType& context,
                                                       HashTableType& hash_table, Block& block,
                                                       std::vector<KeyType>& keys_) {
    int key_size = _probe_expr_ctxs.size();
    int agg_size = _aggregate_evaluators.size();

    MutableColumns value_columns(agg_size);
    DataTypes value_data_types(agg_size);
    MutableColumns key_columns;

    for (int i = 0; i < key_size; ++i) {
        key_columns.emplace_back(_probe_expr_ctxs[i]->root()->data_type()->create_column());
    }

    for (size_t i = 0; i < _aggregate_evaluators.size(); ++i) {
        value_data_types[i] = _aggregate_evaluators[i]->function()->get_serialized_type();
        value_columns[i] = _aggregate_evaluators[i]->function()->create_serialize_column();
    }

    context.init_iterator();
    const auto size = hash_table.size();
    std::vector<KeyType> keys(size);
    if (_values.size() < size) {
        _values.resize(size);
    }

    size_t num_rows = 0;
    _aggregate_data_container->init_once();
    auto& iter = _aggregate_data_container->iterator;

    while (iter != _aggregate_data_container->end()) {
        keys[num_rows] = iter.get_key();
        _values[num_rows] = iter.get_aggregate_data();
        ++iter;
        ++num_rows;
    }

    context.insert_keys_into_columns(keys, key_columns, num_rows);

    if (hash_table.has_null_key_data()) {
        // Only a single group-by key supports wrapping a null key; the null
        // key/value needs additional processing here.
        CHECK(key_columns.size() == 1);
        CHECK(key_columns[0]->is_nullable());
        key_columns[0]->insert_data(nullptr, 0);

        // There is no need to set `keys[num_rows]`; keep it as the default value.
        _values[num_rows] = hash_table.template get_null_key_data<AggregateDataPtr>();
        ++num_rows;
    }

    for (size_t i = 0; i < _aggregate_evaluators.size(); ++i) {
        _aggregate_evaluators[i]->function()->serialize_to_column(
                _values, _offsets_of_aggregate_states[i], value_columns[i], num_rows);
    }

    ColumnsWithTypeAndName columns_with_schema;
    for (int i = 0; i < key_size; ++i) {
        columns_with_schema.emplace_back(std::move(key_columns[i]),
                                         _probe_expr_ctxs[i]->root()->data_type(),
                                         _probe_expr_ctxs[i]->root()->expr_name());
    }
    for (int i = 0; i < agg_size; ++i) {
        columns_with_schema.emplace_back(std::move(value_columns[i]), value_data_types[i],
                                         _aggregate_evaluators[i]->function()->get_name());
    }

    block = columns_with_schema;
    keys_.swap(keys);
    return Status::OK();
}

template <typename HashTableCtxType, typename HashTableType>
Status AggregationNode::_spill_hash_table(HashTableCtxType& agg_method,
                                          HashTableType& hash_table) {
    Block block;
    std::vector<typename HashTableType::key_type> keys;
    RETURN_IF_ERROR(_serialize_hash_table_to_block(agg_method, hash_table, block, keys));
    CHECK_EQ(block.rows(), hash_table.size());
    CHECK_EQ(keys.size(), block.rows());

    if (!_spill_context.has_data) {
        _spill_context.has_data = true;
        _spill_context.runtime_profile = _runtime_profile->create_child("Spill", true, true);
    }

    BlockSpillWriterUPtr writer;
    RETURN_IF_ERROR(ExecEnv::GetInstance()->block_spill_mgr()->get_writer(
            std::numeric_limits<int32_t>::max(), writer, _spill_context.runtime_profile));
    Defer defer {[&]() {
        // A redundant close() call is ok.
        static_cast<void>(writer->close());
    }};

    _spill_context.stream_ids.emplace_back(writer->get_id());

    std::vector<size_t> partitioned_indices(block.rows());
    std::vector<size_t> blocks_rows(_spill_partition_helper->partition_count);

    // The last row may contain a null key.
    const size_t rows = hash_table.has_null_key_data() ? block.rows() - 1 : block.rows();
    for (size_t i = 0; i < rows; ++i) {
        const auto index = _spill_partition_helper->get_index(hash_table.hash(keys[i]));
        partitioned_indices[i] = index;
        blocks_rows[index]++;
    }

    if (hash_table.has_null_key_data()) {
        // Put the row with the null key in the last partition.
        const auto index = _spill_partition_helper->partition_count - 1;
        partitioned_indices[rows] = index;
        blocks_rows[index]++;
    }
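    // A sketch of the partitioning math, assuming SpillPartitionHelper derives
    // partition_count as (1 << external_agg_partition_bits): with the default of
    // 4 bits there are 16 partitions, and get_index() maps each key's hash onto
    // one of them, so rows with equal keys always land in the same partition and
    // can later be merged partition by partition.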
    for (size_t i = 0; i < _spill_partition_helper->partition_count; ++i) {
        Block block_to_write = block.clone_empty();
        if (blocks_rows[i] == 0) {
            /// Write one empty block so that there are enough blocks in the file:
            /// the number of blocks must equal partition_count.
            static_cast<void>(writer->write(block_to_write));
            continue;
        }

        MutableBlock mutable_block(std::move(block_to_write));

        for (auto& column : mutable_block.mutable_columns()) {
            column->reserve(blocks_rows[i]);
        }

        size_t begin = 0;
        size_t length = 0;
        for (size_t j = 0; j < partitioned_indices.size(); ++j) {
            if (partitioned_indices[j] != i) {
                if (length > 0) {
                    mutable_block.add_rows(&block, begin, length);
                }
                length = 0;
                continue;
            }

            if (length == 0) {
                begin = j;
            }
            length++;
        }

        if (length > 0) {
            mutable_block.add_rows(&block, begin, length);
        }

        CHECK_EQ(mutable_block.rows(), blocks_rows[i]);

        RETURN_IF_ERROR(writer->write(mutable_block.to_block()));
    }
    RETURN_IF_ERROR(writer->close());

    return Status::OK();
}

Status AggregationNode::_try_spill_disk(bool eos) {
    if (_external_agg_bytes_threshold == 0) {
        return Status::OK();
    }
    return std::visit(
            [&](auto&& agg_method) -> Status {
                auto& hash_table = *agg_method.hash_table;
                if (!eos && _memory_usage() < _external_agg_bytes_threshold) {
                    return Status::OK();
                }

                if (_get_hash_table_size() == 0) {
                    return Status::OK();
                }

                RETURN_IF_ERROR(_spill_hash_table(agg_method, hash_table));
                return _reset_hash_table();
            },
            _agg_data->method_variant);
}

Status AggregationNode::_execute_with_serialized_key(Block* block) {
    if (_reach_limit) {
        return _execute_with_serialized_key_helper<true>(block);
    } else {
        return _execute_with_serialized_key_helper<false>(block);
    }
}

Status AggregationNode::_merge_spilt_data() {
    CHECK(!_spill_context.stream_ids.empty());

    for (auto& reader : _spill_context.readers) {
        CHECK_LT(_spill_context.read_cursor, reader->block_count());
        reader->seek(_spill_context.read_cursor);
        Block block;
        bool eos = false;
        RETURN_IF_ERROR(reader->read(&block, &eos));

        if (!block.empty()) {
            auto st = _merge_with_serialized_key_helper<false, true>(&block);
            RETURN_IF_ERROR(st);
        }
    }
    _spill_context.read_cursor++;
    return Status::OK();
}

Status AggregationNode::_get_result_with_spilt_data(RuntimeState* state, Block* block, bool* eos) {
    CHECK(!_spill_context.stream_ids.empty());
    CHECK(_spill_partition_helper != nullptr) << "_spill_partition_helper should not be null";
    _aggregate_data_container->init_once();
    while (_aggregate_data_container->iterator == _aggregate_data_container->end()) {
        if (_spill_context.read_cursor == _spill_partition_helper->partition_count) {
            break;
        }
        RETURN_IF_ERROR(_reset_hash_table());
        RETURN_IF_ERROR(_merge_spilt_data());
        _aggregate_data_container->init_once();
    }

    RETURN_IF_ERROR(_get_result_with_serialized_key_non_spill(state, block, eos));
    if (*eos) {
        *eos = _spill_context.read_cursor == _spill_partition_helper->partition_count;
    }
    CHECK(!block->empty() || *eos);
    return Status::OK();
}

Status AggregationNode::_get_with_serialized_key_result(RuntimeState* state, Block* block,
                                                        bool* eos) {
    if (_spill_context.has_data) {
        return _get_result_with_spilt_data(state, block, eos);
    } else {
        return _get_result_with_serialized_key_non_spill(state, block, eos);
    }
}

Status AggregationNode::_get_result_with_serialized_key_non_spill(RuntimeState* state,
                                                                  Block* block, bool* eos) {
    // Non-nullable key columns (ids in `_make_nullable_keys`) will be converted to nullable.
    bool mem_reuse = _make_nullable_keys.empty() && block->mem_reuse();

    auto columns_with_schema = VectorizedUtils::create_columns_with_type_and_name(_row_descriptor);
    int key_size = _probe_expr_ctxs.size();

    MutableColumns key_columns;
    for (int i = 0; i < key_size; ++i) {
        if (!mem_reuse) {
            key_columns.emplace_back(columns_with_schema[i].type->create_column());
        } else {
            key_columns.emplace_back(std::move(*block->get_by_position(i).column).mutate());
        }
    }
    MutableColumns value_columns;
    for (int i = key_size; i < columns_with_schema.size(); ++i) {
        if (!mem_reuse) {
            value_columns.emplace_back(columns_with_schema[i].type->create_column());
        } else {
            value_columns.emplace_back(std::move(*block->get_by_position(i).column).mutate());
        }
    }

    SCOPED_TIMER(_get_results_timer);
    std::visit(
            [&](auto&& agg_method) -> void {
                auto& data = *agg_method.hash_table;
                agg_method.init_iterator();
                const auto size = std::min(data.size(), size_t(state->batch_size()));
                using KeyType = std::decay_t<decltype(agg_method.iterator->get_first())>;
                std::vector<KeyType> keys(size);
                if (_values.size() < size) {
                    _values.resize(size);
                }

                size_t num_rows = 0;
                _aggregate_data_container->init_once();
                auto& iter = _aggregate_data_container->iterator;

                {
                    SCOPED_TIMER(_hash_table_iterate_timer);
                    while (iter != _aggregate_data_container->end() &&
                           num_rows < state->batch_size()) {
                        keys[num_rows] = iter.get_key();
                        _values[num_rows] = iter.get_aggregate_data();
                        ++iter;
                        ++num_rows;
                    }
                }

                {
                    SCOPED_TIMER(_insert_keys_to_column_timer);
                    agg_method.insert_keys_into_columns(keys, key_columns, num_rows);
                }

                for (size_t i = 0; i < _aggregate_evaluators.size(); ++i) {
                    _aggregate_evaluators[i]->insert_result_info_vec(
                            _values, _offsets_of_aggregate_states[i], value_columns[i].get(),
                            num_rows);
                }

                if (iter == _aggregate_data_container->end()) {
                    if (agg_method.hash_table->has_null_key_data()) {
                        // Only a single group-by key supports wrapping a null key; the
                        // null key/value needs additional processing here.
                        DCHECK(key_columns.size() == 1);
                        DCHECK(key_columns[0]->is_nullable());
                        if (key_columns[0]->size() < state->batch_size()) {
                            key_columns[0]->insert_data(nullptr, 0);
                            auto mapped = agg_method.hash_table
                                                  ->template get_null_key_data<AggregateDataPtr>();
                            for (size_t i = 0; i < _aggregate_evaluators.size(); ++i) {
                                _aggregate_evaluators[i]->insert_result_info(
                                        mapped + _offsets_of_aggregate_states[i],
                                        value_columns[i].get());
                            }
                            *eos = true;
                        }
                    } else {
                        *eos = true;
                    }
                }
            },
            _agg_data->method_variant);

    if (!mem_reuse) {
        *block = columns_with_schema;
        MutableColumns columns(block->columns());
        for (int i = 0; i < block->columns(); ++i) {
            if (i < key_size) {
                columns[i] = std::move(key_columns[i]);
            } else {
                columns[i] = std::move(value_columns[i - key_size]);
            }
        }
        block->set_columns(std::move(columns));
    }

    return Status::OK();
}

Status AggregationNode::_serialize_with_serialized_key_result(RuntimeState* state, Block* block,
                                                              bool* eos) {
    if (_spill_context.has_data) {
        return _serialize_with_serialized_key_result_with_spilt_data(state, block, eos);
    } else {
        return _serialize_with_serialized_key_result_non_spill(state, block, eos);
    }
}

Status AggregationNode::_serialize_with_serialized_key_result_with_spilt_data(RuntimeState* state,
                                                                              Block* block,
                                                                              bool* eos) {
    CHECK(!_spill_context.stream_ids.empty());
    CHECK(_spill_partition_helper != nullptr) << "_spill_partition_helper should not be null";
    _aggregate_data_container->init_once();
    while (_aggregate_data_container->iterator == _aggregate_data_container->end()) {
        if (_spill_context.read_cursor == _spill_partition_helper->partition_count) {
            break;
        }
        RETURN_IF_ERROR(_reset_hash_table());
        RETURN_IF_ERROR(_merge_spilt_data());
        _aggregate_data_container->init_once();
    }

    RETURN_IF_ERROR(_serialize_with_serialized_key_result_non_spill(state, block, eos));
    if (*eos) {
        *eos = _spill_context.read_cursor == _spill_partition_helper->partition_count;
    }
    CHECK(!block->empty() || *eos);
    return Status::OK();
}

Status AggregationNode::_serialize_with_serialized_key_result_non_spill(RuntimeState* state,
                                                                        Block* block, bool* eos) {
    SCOPED_TIMER(_serialize_result_timer);
    int key_size = _probe_expr_ctxs.size();
    int agg_size = _aggregate_evaluators.size();
    MutableColumns value_columns(agg_size);
    DataTypes value_data_types(agg_size);

    // Non-nullable key columns (ids in `_make_nullable_keys`) will be converted to nullable.
    bool mem_reuse = _make_nullable_keys.empty() && block->mem_reuse();

    MutableColumns key_columns;
    for (int i = 0; i < key_size; ++i) {
        if (mem_reuse) {
            key_columns.emplace_back(std::move(*block->get_by_position(i).column).mutate());
        } else {
            key_columns.emplace_back(_probe_expr_ctxs[i]->root()->data_type()->create_column());
        }
    }

    SCOPED_TIMER(_get_results_timer);
    std::visit(
            [&](auto&& agg_method) -> void {
                agg_method.init_iterator();
                auto& data = *agg_method.hash_table;
                const auto size = std::min(data.size(), size_t(state->batch_size()));
                using KeyType = std::decay_t<decltype(agg_method.iterator->get_first())>;
                std::vector<KeyType> keys(size);
                _values.resize(size + 1);

                size_t num_rows = 0;
                _aggregate_data_container->init_once();
                auto& iter = _aggregate_data_container->iterator;

                {
                    SCOPED_TIMER(_hash_table_iterate_timer);
                    while (iter != _aggregate_data_container->end() &&
                           num_rows < state->batch_size()) {
                        keys[num_rows] = iter.get_key();
                        _values[num_rows] = iter.get_aggregate_data();
                        ++iter;
                        ++num_rows;
                    }
                }

                {
                    SCOPED_TIMER(_insert_keys_to_column_timer);
                    agg_method.insert_keys_into_columns(keys, key_columns, num_rows);
                }

                if (iter == _aggregate_data_container->end()) {
                    if (agg_method.hash_table->has_null_key_data()) {
                        // Only a single group-by key supports wrapping a null key; the
                        // null key/value needs additional processing here.
                        DCHECK(key_columns.size() == 1);
                        DCHECK(key_columns[0]->is_nullable());
                        key_columns[0]->insert_data(nullptr, 0);
                        _values[num_rows] =
                                agg_method.hash_table
                                        ->template get_null_key_data<AggregateDataPtr>();
                        ++num_rows;
                        *eos = true;
                    } else {
                        *eos = true;
                    }
                }

                {
                    SCOPED_TIMER(_serialize_data_timer);
                    for (size_t i = 0; i < _aggregate_evaluators.size(); ++i) {
                        value_data_types[i] =
                                _aggregate_evaluators[i]->function()->get_serialized_type();
                        if (mem_reuse) {
                            value_columns[i] =
                                    std::move(*block->get_by_position(i + key_size).column)
                                            .mutate();
                        } else {
                            value_columns[i] =
                                    _aggregate_evaluators[i]->function()->create_serialize_column();
                        }
                        _aggregate_evaluators[i]->function()->serialize_to_column(
                                _values, _offsets_of_aggregate_states[i], value_columns[i],
                                num_rows);
                    }
                }
            },
            _agg_data->method_variant);

    if (!mem_reuse) {
        ColumnsWithTypeAndName columns_with_schema;
        for (int i = 0; i < key_size; ++i) {
            columns_with_schema.emplace_back(std::move(key_columns[i]),
                                             _probe_expr_ctxs[i]->root()->data_type(),
                                             _probe_expr_ctxs[i]->root()->expr_name());
        }
        for (int i = 0; i < agg_size; ++i) {
            columns_with_schema.emplace_back(std::move(value_columns[i]), value_data_types[i], "");
        }
        *block = Block(columns_with_schema);
    }

    return Status::OK();
}

Status AggregationNode::_merge_with_serialized_key(Block* block) {
    if (_reach_limit) {
        return _merge_with_serialized_key_helper<true>(block);
    } else {
        return _merge_with_serialized_key_helper<false>(block);
    }
}
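// Note on the accounting in _update_memusage_with_serialized_key() below: the
// tracker is charged only with deltas against _mem_usage_record. For example, if
// the arena plus container grows from 1MB to 3MB between two updates, consume(2MB)
// is charged and used_in_arena is advanced to 3MB, so repeated calls never
// double-count memory that was already reported.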
void AggregationNode::_update_memusage_with_serialized_key() {
    std::visit(
            [&](auto&& agg_method) -> void {
                auto& data = *agg_method.hash_table;
                auto arena_memory_usage = _agg_arena_pool->size() +
                                          _aggregate_data_container->memory_usage() -
                                          _mem_usage_record.used_in_arena;
                mem_tracker()->consume(arena_memory_usage);
                mem_tracker()->consume(data.get_buffer_size_in_bytes() -
                                       _mem_usage_record.used_in_state);
                _serialize_key_arena_memory_usage->add(arena_memory_usage);
                COUNTER_UPDATE(_hash_table_memory_usage,
                               data.get_buffer_size_in_bytes() -
                                       _mem_usage_record.used_in_state);
                _mem_usage_record.used_in_state = data.get_buffer_size_in_bytes();
                _mem_usage_record.used_in_arena =
                        _agg_arena_pool->size() + _aggregate_data_container->memory_usage();
            },
            _agg_data->method_variant);
}

void AggregationNode::_close_with_serialized_key() {
    std::visit(
            [&](auto&& agg_method) -> void {
                auto& data = *agg_method.hash_table;
                data.for_each_mapped([&](auto& mapped) {
                    if (mapped) {
                        static_cast<void>(_destroy_agg_status(mapped));
                        mapped = nullptr;
                    }
                });
                if (data.has_null_key_data()) {
                    auto st = _destroy_agg_status(
                            data.template get_null_key_data<AggregateDataPtr>());
                    if (!st) {
                        throw Exception(st.code(), st.to_string());
                    }
                }
            },
            _agg_data->method_variant);
    release_tracker();
}

void AggregationNode::release_tracker() {
    mem_tracker()->release(_mem_usage_record.used_in_state + _mem_usage_record.used_in_arena);
}

void AggregationNode::_release_mem() {
    _agg_data = nullptr;
    _aggregate_data_container = nullptr;
    _agg_profile_arena = nullptr;
    _agg_arena_pool = nullptr;
    _preagg_block.clear();

    PODArray<AggregateDataPtr> tmp_places;
    _places.swap(tmp_places);

    std::vector<char> tmp_deserialize_buffer;
    _deserialize_buffer.swap(tmp_deserialize_buffer);

    std::vector<AggregateDataPtr> tmp_values;
    _values.swap(tmp_values);
}

Status AggSpillContext::prepare_for_reading() {
    if (readers_prepared) {
        return Status::OK();
    }
    readers_prepared = true;

    readers.resize(stream_ids.size());
    auto* manager = ExecEnv::GetInstance()->block_spill_mgr();
    for (size_t i = 0; i != stream_ids.size(); ++i) {
        RETURN_IF_ERROR(manager->get_reader(stream_ids[i], readers[i], runtime_profile, true));
    }
    return Status::OK();
}

} // namespace doris::vectorized