// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "exec/partitioned_aggregation_node.h"
#include "exec/partitioned_hash_table.inline.h"
#include "exprs/expr_context.h"
#include "exprs/new_agg_fn_evaluator.h"
#include "runtime/buffered_tuple_stream3.inline.h"
#include "runtime/row_batch.h"
#include "runtime/tuple_row.h"
#include "util/runtime_profile.h"

using namespace doris;

Status PartitionedAggregationNode::ProcessBatchNoGrouping(RowBatch* batch) {
    Tuple* output_tuple = singleton_output_tuple_;
    FOREACH_ROW(batch, 0, batch_iter) {
        UpdateTuple(agg_fn_evals_.data(), output_tuple, batch_iter.get());
    }
    return Status::OK();
}

template <bool AGGREGATED_ROWS>
Status PartitionedAggregationNode::ProcessBatch(RowBatch* batch,
        PartitionedHashTableCtx* ht_ctx) {
    DCHECK(!hash_partitions_.empty());
    DCHECK(!is_streaming_preagg_);

    // Make sure that no resizes will happen when inserting individual rows into the hash
    // table of each partition by pessimistically assuming that all the rows in each batch
    // will end up in the same partition.
    // TODO: Once we have a histogram with the number of rows per partition, we will have
    // accurate resize calls.
    RETURN_IF_ERROR(CheckAndResizeHashPartitions(AGGREGATED_ROWS, batch->num_rows(), ht_ctx));

    PartitionedHashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache();
    const int cache_size = expr_vals_cache->capacity();
    const int num_rows = batch->num_rows();
    for (int group_start = 0; group_start < num_rows; group_start += cache_size) {
        EvalAndHashPrefetchGroup<AGGREGATED_ROWS>(batch, group_start, ht_ctx);

        FOREACH_ROW_LIMIT(batch, group_start, cache_size, batch_iter) {
            RETURN_IF_ERROR(ProcessRow<AGGREGATED_ROWS>(batch_iter.get(), ht_ctx));
            expr_vals_cache->NextRow();
        }
        ht_ctx->expr_results_pool_->clear();
        DCHECK(expr_vals_cache->AtEnd());
    }
    return Status::OK();
}

template <bool AGGREGATED_ROWS>
void IR_ALWAYS_INLINE PartitionedAggregationNode::EvalAndHashPrefetchGroup(
        RowBatch* batch, int start_row_idx, PartitionedHashTableCtx* ht_ctx) {
    PartitionedHashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache();
    const int cache_size = expr_vals_cache->capacity();

    expr_vals_cache->Reset();
    FOREACH_ROW_LIMIT(batch, start_row_idx, cache_size, batch_iter) {
        TupleRow* row = batch_iter.get();
        bool is_null;
        if (AGGREGATED_ROWS) {
            is_null = !ht_ctx->EvalAndHashBuild(row);
        } else {
            is_null = !ht_ctx->EvalAndHashProbe(row);
        }

        // Hoist lookups out of non-null branch to speed up non-null case.
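        // The partition index is derived from the top NUM_PARTITIONING_BITS bits of the
        // 32-bit hash, so each row maps to exactly one entry of hash_partitions_. When
        // prefetching is enabled, touching the destination bucket here is meant to overlap
        // the cache miss with the remaining per-row work, so the later FindBuildRowBucket()
        // call (in ProcessRow() or TryAddToHashTable()) is more likely to hit a warm bucket.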
        const uint32_t hash = expr_vals_cache->CurExprValuesHash();
        const uint32_t partition_idx = hash >> (32 - NUM_PARTITIONING_BITS);
        PartitionedHashTable* hash_tbl = GetHashTable(partition_idx);
        if (is_null) {
            expr_vals_cache->SetRowNull();
        } else if (config::enable_prefetch) {
            if (LIKELY(hash_tbl != NULL)) hash_tbl->PrefetchBucket(hash);
        }
        expr_vals_cache->NextRow();
    }

    expr_vals_cache->ResetForRead();
}

template <bool AGGREGATED_ROWS>
Status PartitionedAggregationNode::ProcessRow(TupleRow* row, PartitionedHashTableCtx* ht_ctx) {
    PartitionedHashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache();
    // Hoist lookups out of non-null branch to speed up non-null case.
    const uint32_t hash = expr_vals_cache->CurExprValuesHash();
    const uint32_t partition_idx = hash >> (32 - NUM_PARTITIONING_BITS);
    if (expr_vals_cache->IsRowNull()) return Status::OK();

    // To process this row, we first see if it can be aggregated or inserted into this
    // partition's hash table. If we need to insert it and that fails, due to OOM, we
    // spill the partition. The partition to spill is not necessarily dst_partition,
    // so we can try again to insert the row.
    PartitionedHashTable* hash_tbl = GetHashTable(partition_idx);
    Partition* dst_partition = hash_partitions_[partition_idx];
    DCHECK(dst_partition != nullptr);
    DCHECK_EQ(dst_partition->is_spilled(), hash_tbl == NULL);
    if (hash_tbl == NULL) {
        // This partition is already spilled, just append the row.
        return AppendSpilledRow(dst_partition, row);
    }

    DCHECK(dst_partition->aggregated_row_stream->is_pinned());
    bool found;
    // Find the appropriate bucket in the hash table. There will always be a free
    // bucket because we checked the size above.
    PartitionedHashTable::Iterator it = hash_tbl->FindBuildRowBucket(ht_ctx, &found);
    DCHECK(!it.AtEnd()) << "Hash table had no free buckets";
    if (AGGREGATED_ROWS) {
        // If the row is already an aggregate row, it cannot match anything in the
        // hash table since we process the aggregate rows first. These rows should
        // have been aggregated in the initial pass.
        DCHECK(!found);
    } else if (found) {
        // Row is already in hash table. Do the aggregation and we're done.
        UpdateTuple(dst_partition->agg_fn_evals.data(), it.GetTuple(), row);
        return Status::OK();
    }

    // If we are seeing this result row for the first time, we need to construct the
    // result row and initialize it.
    return AddIntermediateTuple<AGGREGATED_ROWS>(dst_partition, row, hash, it);
}

template <bool AGGREGATED_ROWS>
Status PartitionedAggregationNode::AddIntermediateTuple(Partition* partition, TupleRow* row,
        uint32_t hash, PartitionedHashTable::Iterator insert_it) {
    while (true) {
        DCHECK(partition->aggregated_row_stream->is_pinned());
        Tuple* intermediate_tuple = ConstructIntermediateTuple(partition->agg_fn_evals,
                partition->aggregated_row_stream.get(), &process_batch_status_);

        if (LIKELY(intermediate_tuple != NULL)) {
            UpdateTuple(partition->agg_fn_evals.data(), intermediate_tuple, row, AGGREGATED_ROWS);
            // After copying and initializing the tuple, insert it into the hash table.
            insert_it.SetTuple(intermediate_tuple, hash);
            return Status::OK();
        } else if (!process_batch_status_.ok()) {
            return std::move(process_batch_status_);
        }

        // We did not have enough memory to add intermediate_tuple to the stream.
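        // Free up memory by spilling a partition and retry. SpillPartition() may choose a
        // partition other than 'partition'; in that case 'partition' is still in memory and
        // the next loop iteration retries ConstructIntermediateTuple(). If 'partition' itself
        // got spilled, the row is appended to its spilled stream below instead.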
        RETURN_IF_ERROR(SpillPartition(AGGREGATED_ROWS));
        if (partition->is_spilled()) {
            return AppendSpilledRow(partition, row);
        }
    }
}

Status PartitionedAggregationNode::ProcessBatchStreaming(bool needs_serialize, RowBatch* in_batch,
        RowBatch* out_batch, PartitionedHashTableCtx* ht_ctx,
        int remaining_capacity[PARTITION_FANOUT]) {
    DCHECK(is_streaming_preagg_);
    DCHECK_EQ(out_batch->num_rows(), 0);
    DCHECK_LE(in_batch->num_rows(), out_batch->capacity());

    RowBatch::Iterator out_batch_iterator(out_batch, out_batch->num_rows());
    PartitionedHashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache();
    const int num_rows = in_batch->num_rows();
    const int cache_size = expr_vals_cache->capacity();
    for (int group_start = 0; group_start < num_rows; group_start += cache_size) {
        EvalAndHashPrefetchGroup<false>(in_batch, group_start, ht_ctx);

        FOREACH_ROW_LIMIT(in_batch, group_start, cache_size, in_batch_iter) {
            // Hoist lookups out of non-null branch to speed up non-null case.
            TupleRow* in_row = in_batch_iter.get();
            const uint32_t hash = expr_vals_cache->CurExprValuesHash();
            const uint32_t partition_idx = hash >> (32 - NUM_PARTITIONING_BITS);

            if (!expr_vals_cache->IsRowNull() &&
                    !TryAddToHashTable(ht_ctx, hash_partitions_[partition_idx],
                            GetHashTable(partition_idx), in_row, hash,
                            &remaining_capacity[partition_idx], &process_batch_status_)) {
                RETURN_IF_ERROR(std::move(process_batch_status_));

                // Tuple is not going into hash table, add it to the output batch.
                Tuple* intermediate_tuple = ConstructIntermediateTuple(
                        agg_fn_evals_, out_batch->tuple_data_pool(), &process_batch_status_);
                if (UNLIKELY(intermediate_tuple == NULL)) {
                    DCHECK(!process_batch_status_.ok());
                    return std::move(process_batch_status_);
                }
                UpdateTuple(agg_fn_evals_.data(), intermediate_tuple, in_row);

                out_batch_iterator.get()->set_tuple(0, intermediate_tuple);
                out_batch_iterator.next();
                out_batch->commit_last_row();
            }
            DCHECK(process_batch_status_.ok());
            expr_vals_cache->NextRow();
        }
        ht_ctx->expr_results_pool_->clear();
        DCHECK(expr_vals_cache->AtEnd());
    }

    if (needs_serialize) {
        FOREACH_ROW(out_batch, 0, out_batch_iter) {
            NewAggFnEvaluator::Serialize(agg_fn_evals_, out_batch_iter.get()->get_tuple(0));
        }
    }

    return Status::OK();
}

bool PartitionedAggregationNode::TryAddToHashTable(PartitionedHashTableCtx* ht_ctx,
        Partition* partition, PartitionedHashTable* hash_tbl, TupleRow* in_row, uint32_t hash,
        int* remaining_capacity, Status* status) {
    DCHECK(remaining_capacity != NULL);
    DCHECK_EQ(hash_tbl, partition->hash_tbl.get());
    DCHECK_GE(*remaining_capacity, 0);
    bool found;
    // This is called from ProcessBatchStreaming() so the rows are not aggregated.
    PartitionedHashTable::Iterator it = hash_tbl->FindBuildRowBucket(ht_ctx, &found);
    Tuple* intermediate_tuple;
    if (found) {
        intermediate_tuple = it.GetTuple();
    } else if (*remaining_capacity == 0) {
        return false;
    } else {
        intermediate_tuple = ConstructIntermediateTuple(
                partition->agg_fn_evals, partition->aggregated_row_stream.get(), status);
        if (LIKELY(intermediate_tuple != NULL)) {
            it.SetTuple(intermediate_tuple, hash);
            --(*remaining_capacity);
        } else {
            // Avoid repeatedly trying to add tuples when under memory pressure.
            *remaining_capacity = 0;
            return false;
        }
    }

    UpdateTuple(partition->agg_fn_evals.data(), intermediate_tuple, in_row);
    return true;
}

// Instantiate required templates.
template Status PartitionedAggregationNode::ProcessBatch<false>(RowBatch*,
        PartitionedHashTableCtx*);
template Status PartitionedAggregationNode::ProcessBatch<true>(RowBatch*,
        PartitionedHashTableCtx*);
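// Note: ProcessBatch<false> is used for input rows that have not been aggregated yet, while
// ProcessBatch<true> is used for rows that already carry an intermediate aggregation tuple
// (see the AGGREGATED_ROWS branches above). Explicitly instantiating both variants here makes
// them available to callers in other translation units.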