baidu palo

2017-08-11 17:51:21 +08:00
commit e2311f656e
1988 changed files with 586941 additions and 0 deletions
--- a/be/src/runtime/data_spliter.cpp
+++ b/be/src/runtime/data_spliter.cpp
@ -0,0 +1,333 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/data_spliter.h"
+
+#include <sstream>
+
+#include <thrift/protocol/TDebugProtocol.h>
+
+#include "exprs/expr.h"
+#include "common/object_pool.h"
+#include "runtime/runtime_state.h"
+#include "runtime/raw_value.h"
+#include "runtime/row_batch.h"
+#include "runtime/tuple_row.h"
+#include "runtime/dpp_sink.h"
+#include "runtime/load_path_mgr.h"
+#include "runtime/mem_tracker.h"
+#include "util/runtime_profile.h"
+#include "util/debug_util.h"
+#include "util/file_utils.h"
+#include "gen_cpp/DataSinks_types.h"
+
+namespace palo {
+
+// Builds a splitter for rows laid out as |row_desc|. A private ObjectPool is created
+// here; it later holds the DppSink and RowBatch objects allocated by this sink.
+DataSpliter::DataSpliter(const RowDescriptor& row_desc)
+        : _obj_pool(new ObjectPool()), _row_desc(row_desc) {}
+
+// No explicit teardown is performed in the destructor.
+DataSpliter::~DataSpliter() = default;
+
+// Ordering predicate for std::sort: two partitions are ordered by their range only.
+// Kept as a file-local function rather than an operator on PartitionInfo because
+// PartitionInfo carries other members besides the range.
+static bool compare_part_use_range(const PartitionInfo* v1, const PartitionInfo* v2) {
+    const PartitionInfo& lhs = *v1;
+    const PartitionInfo& rhs = *v2;
+    return lhs.range() < rhs.range();
+}
+
+// Fills |spliter| from the thrift sink description |t_sink|.
+//
+// Creates the partition expression contexts, one PartitionInfo per entry of
+// t_sink.partition_infos (stored sorted by range, ascending) and one RollupSchema per
+// entry of t_sink.rollup_schemas. The PartitionInfo and RollupSchema objects are added
+// to |pool|. Returns an error when |t_sink| has no partition info or when any of the
+// conversions fails.
+Status DataSpliter::from_thrift(
+        ObjectPool* pool, const TDataSplitSink& t_sink, DataSpliter* spliter) {
+    VLOG_ROW << "TDataSplitSink: " << apache::thrift::ThriftDebugString(t_sink);
+
+    // Expressions used to pick the partition of a row.
+    RETURN_IF_ERROR(Expr::create_expr_trees(
+            pool, t_sink.partition_exprs, &spliter->_partition_expr_ctxs));
+
+    // At least one partition is mandatory.
+    if (t_sink.partition_infos.empty()) {
+        return Status("Empty partition info.");
+    }
+    for (const auto& t_part : t_sink.partition_infos) {
+        PartitionInfo* part = pool->add(new PartitionInfo());
+        RETURN_IF_ERROR(PartitionInfo::from_thrift(pool, t_part, part));
+        spliter->_partition_infos.push_back(part);
+    }
+
+    // Keep the partitions in ascending range order; the binary search in
+    // binary_find_partition() walks this vector.
+    std::sort(spliter->_partition_infos.begin(),
+              spliter->_partition_infos.end(),
+              compare_part_use_range);
+
+    // Rollup schemas, keyed exactly as they are keyed in the thrift map.
+    for (const auto& entry : t_sink.rollup_schemas) {
+        RollupSchema* rollup = pool->add(new RollupSchema());
+        RETURN_IF_ERROR(RollupSchema::from_thrift(pool, entry.second, rollup));
+        spliter->_rollup_map[entry.first] = rollup;
+    }
+
+    return Status::OK;
+}
+
+// Prepares the base sink, the partition expressions, every rollup schema and every
+// partition against |_row_desc|, and creates this sink's RuntimeProfile (added to the
+// state's object pool). Stops at, and returns, the first error.
+Status DataSpliter::prepare(RuntimeState* state) {
+    RETURN_IF_ERROR(DataSink::prepare(state));
+    RETURN_IF_ERROR(Expr::prepare(
+            _partition_expr_ctxs, state, _row_desc, _expr_mem_tracker.get()));
+    for (auto& entry : _rollup_map) {
+        RETURN_IF_ERROR(entry.second->prepare(state, _row_desc, _expr_mem_tracker.get()));
+    }
+
+    // The profile is named after the fragment instance this sink belongs to.
+    std::stringstream profile_name;
+    profile_name << "DataSplitSink (dst_id=" << state->fragment_instance_id() << ")";
+    _profile = state->obj_pool()->add(
+            new RuntimeProfile(state->obj_pool(), profile_name.str()));
+
+    for (auto* part : _partition_infos) {
+        RETURN_IF_ERROR(part->prepare(state, _row_desc, _expr_mem_tracker.get()));
+    }
+    return Status::OK;
+}
+
+// Opens the partition expressions, rollup schemas and partitions, calls
+// state->create_load_dir(), and creates one DppSink per partition so that
+// _dpp_sink_vec[i] serves _partition_infos[i]. Each sink's profile is attached as a
+// child of this sink's profile. Returns the first error encountered.
+Status DataSpliter::open(RuntimeState* state) {
+    RETURN_IF_ERROR(Expr::open(_partition_expr_ctxs, state));
+    for (auto& entry : _rollup_map) {
+        RETURN_IF_ERROR(entry.second->open(state));
+    }
+
+    RETURN_IF_ERROR(state->create_load_dir());
+
+    for (auto* part : _partition_infos) {
+        RETURN_IF_ERROR(part->open(state));
+
+        // The sink is recorded before init() runs, so close() still sees it when
+        // init() fails.
+        DppSink* sink = _obj_pool->add(new DppSink(_row_desc, _rollup_map));
+        _dpp_sink_vec.push_back(sink);
+        RETURN_IF_ERROR(sink->init(state));
+
+        _profile->add_child(sink->profile(), true, nullptr);
+    }
+
+    // _split_timer is used by send(), _finish_timer by the flush in close().
+    _split_timer = ADD_TIMER(_profile, "process batch");
+    _finish_timer = ADD_TIMER(_profile, "sort time");
+
+    return Status::OK;
+}
+
+// Binary-searches _partition_infos (sorted ascending by range in from_thrift()) for
+// the partition whose range matches |key|.
+//
+// Returns the index of the matching partition, or -1 when no partition's
+// range().compare_key(key) yields 0 (including the case of an empty partition list).
+int DataSpliter::binary_find_partition(const PartRangeKey& key) const {
+    int low = 0;
+    // Convert to int before subtracting so that an empty vector gives -1 through
+    // signed arithmetic instead of relying on unsigned wrap-around plus narrowing.
+    int high = static_cast<int>(_partition_infos.size()) - 1;
+
+    // No std::endl here: the logging stream terminates the line by itself, so an
+    // explicit endl only adds an empty line and a flush for every row.
+    VLOG_ROW << "range key: " << key.debug_string();
+    while (low <= high) {
+        int mid = low + (high - low) / 2;
+        int cmp = _partition_infos[mid]->range().compare_key(key);
+        if (cmp == 0) {
+            return mid;
+        } else if (cmp < 0) {
+            // partition[mid] sorts before the key: continue in the upper half.
+            low = mid + 1;
+        } else {
+            // partition[mid] sorts after the key: continue in the lower half.
+            high = mid - 1;
+        }
+    }
+
+    return -1;
+}
+
+// Determines which partition |row| belongs to.
+//
+// On success |*info| points to the partition and |*part_index| is its position in
+// _partition_infos. Without partition expressions the first partition is always
+// chosen. Otherwise the first partition expression is evaluated on |row| (a NULL
+// value is mapped to PartRangeKey::neg_infinite()) and looked up with
+// binary_find_partition(). When no partition matches, |*part_index| is negative,
+// |*info| is left untouched and an error Status carrying the printed key is returned.
+Status DataSpliter::process_partition(
+        RuntimeState* state, TupleRow* row, PartitionInfo** info, int32_t* part_index) {
+    if (_partition_expr_ctxs.empty()) {
+        *part_index = 0;
+        *info = _partition_infos[0];
+        return Status::OK;
+    }
+
+    // Only the first partition expression takes part in the lookup.
+    ExprContext* part_ctx = _partition_expr_ctxs[0];
+    void* value = part_ctx->get_value(row);
+
+    // Turn the evaluated value into a range key.
+    PartRangeKey range_key;
+    if (value == nullptr) {
+        range_key = PartRangeKey::neg_infinite();
+    } else {
+        RETURN_IF_ERROR(PartRangeKey::from_value(
+                part_ctx->root()->type().type, value, &range_key));
+    }
+
+    *part_index = binary_find_partition(range_key);
+    if (*part_index < 0) {
+        std::stringstream msg;
+        msg << "there is no corresponding partition for this key: ";
+        part_ctx->print_value(row, &msg);
+        return Status(msg.str(), true);
+    }
+
+    *info = _partition_infos[*part_index];
+    return Status::OK;
+}
+
+// Computes the distribution bucket of |row| inside partition |part|.
+//
+// The values of the partition's distribution expressions are chained through
+// RawValue::zlib_crc32 (a NULL value is hashed as the INT value 0) and the result is
+// reduced modulo part->distributed_bucket() into |*mod|.
+//
+// Returns an error Status, leaving |*mod| untouched, when the partition reports a
+// bucket count of 0: the modulo would otherwise be a division by zero.
+Status DataSpliter::process_distribute(
+        RuntimeState* state, TupleRow* row,
+        const PartitionInfo* part, uint32_t* mod) {
+    const auto bucket_num = part->distributed_bucket();
+    if (bucket_num == 0) {
+        return Status("Invalid distribution bucket number: 0");
+    }
+
+    // Stand-in value and type used to hash NULL.
+    static const int INT_VALUE = 0;
+    static const TypeDescriptor INT_TYPE(TYPE_INT);
+
+    uint32_t hash_val = 0;
+    for (auto& ctx : part->distributed_expr_ctxs()) {
+        void* partition_val = ctx->get_value(row);
+        if (partition_val != nullptr) {
+            hash_val = RawValue::zlib_crc32(partition_val, ctx->root()->type(), hash_val);
+        } else {
+            // NULL is treated as 0 when hashing.
+            hash_val = RawValue::zlib_crc32(&INT_VALUE, INT_TYPE, hash_val);
+        }
+    }
+
+    *mod = hash_val % bucket_num;
+
+    return Status::OK;
+}
+
+// Buffers a deep copy of |row| in the RowBatch kept for tablet |desc|, creating that
+// batch (owned by _obj_pool) the first time the tablet is seen. Once the batch is
+// full it is handed to |dpp_sink| and then reset for reuse.
+Status DataSpliter::send_row(
+        RuntimeState* state, const TabletDesc& desc, TupleRow* row, DppSink* dpp_sink) {
+    RowBatch* target = nullptr;
+    auto found = _batch_map.find(desc);
+    if (found != _batch_map.end()) {
+        target = found->second;
+    } else {
+        target = _obj_pool->add(
+                new RowBatch(_row_desc, state->batch_size(), _expr_mem_tracker.get()));
+        _batch_map[desc] = target;
+    }
+
+    // Reserve a slot, deep copy the row into it using the batch's own tuple data
+    // pool, then commit the slot.
+    const int slot = target->add_row();
+    row->deep_copy(target->get_row(slot), _row_desc.tuple_descriptors(),
+                   target->tuple_data_pool(), false);
+    target->commit_last_row();
+
+    if (!target->is_full()) {
+        return Status::OK;
+    }
+
+    // Batch is full: flush it to the sink and start over with an empty batch.
+    RETURN_IF_ERROR(dpp_sink->add_batch(_obj_pool.get(), state, desc, target));
+    target->reset();
+    return Status::OK;
+}
+
+// Routes a single row: resolves its partition and bucket, records which DppSink
+// serves the resulting tablet, and buffers the row for that tablet.
+//
+// A row for which process_partition() fails is not an error for the caller: it is
+// counted as an error row, removed from the normal-row count, written to the error
+// file together with the failure message, and Status::OK is returned.
+Status DataSpliter::process_one_row(RuntimeState* state, TupleRow* row) {
+    int32_t part_index = 0;
+    PartitionInfo* part = nullptr;
+
+    // Step 1: partition lookup.
+    // TODO(lingbin): rework the return value of 'process_partition'; answering OK
+    // here although the previous status was not OK is a little inelegant.
+    const Status part_status = process_partition(state, row, &part, &part_index);
+    if (!part_status.ok()) {
+        state->set_error_row_number(state->get_error_row_number() + 1);
+        state->set_normal_row_number(state->get_normal_row_number() - 1);
+
+        state->append_error_msg_to_file(
+                print_row(row, _row_desc),
+                part_status.get_error_msg());
+        return Status::OK;
+    }
+
+    TabletDesc desc;
+    desc.partition_id = part->id();
+
+    // Step 2: bucket inside the partition.
+    RETURN_IF_ERROR(process_distribute(state, row, part, &desc.bucket_id));
+
+    // Step 3: remember the sink of this tablet; close() uses the mapping to flush
+    // partially filled batches.
+    DppSink* sink = _dpp_sink_vec[part_index];
+    _sink_map[desc] = sink;
+
+    // Step 4: buffer the row for the tablet.
+    RETURN_IF_ERROR(send_row(state, desc, row, sink));
+
+    return Status::OK;
+}
+
+// Feeds every row of |batch| through process_one_row(), accounting the time to
+// _split_timer. Stops at, and returns, the first error.
+Status DataSpliter::send(RuntimeState* state, RowBatch* batch) {
+    SCOPED_TIMER(_split_timer);
+    for (int row_idx = 0, row_count = batch->num_rows(); row_idx < row_count; ++row_idx) {
+        TupleRow* current = batch->get_row(row_idx);
+        RETURN_IF_ERROR(process_one_row(state, current));
+    }
+    return Status::OK;
+}
+
+// Flushes and shuts the sink down. A second call is a no-op returning Status::OK.
+//
+// When |close_status| is OK, every non-empty buffered batch is first handed to the
+// DppSink of its tablet (timed by _finish_timer). Regardless of |close_status|, all
+// DppSinks are then finished and the partition expressions, rollup schemas and
+// partitions are closed. Every step runs even if an earlier one failed; only the
+// first failure is logged and returned.
+Status DataSpliter::close(RuntimeState* state, Status close_status) {
+    if (_closed) {
+        return Status::OK;
+    }
+
+    bool failed = false;
+    Status first_error;
+    // Logs and keeps |st| if it is the first failure seen; later ones are ignored.
+    auto remember_failure = [&failed, &first_error](const Status& st, const char* what) {
+        if (UNLIKELY(!failed && !st.ok())) {
+            LOG(WARNING) << what << " err_msg=" << st.get_error_msg();
+            failed = true;
+            first_error = st;
+        }
+    };
+
+    if (close_status.ok()) {
+        SCOPED_TIMER(_finish_timer);
+        // Push out rows that are still sitting in partially filled batches.
+        for (const auto& entry : _batch_map) {
+            if (entry.second->num_rows() <= 0) {
+                continue;
+            }
+            DppSink* sink = _sink_map[entry.first];
+            remember_failure(
+                    sink->add_batch(_obj_pool.get(), state, entry.first, entry.second),
+                    "add_batch error");
+            entry.second->reset();
+        }
+    }
+
+    // Finish every sink.
+    for (auto* sink : _dpp_sink_vec) {
+        remember_failure(sink->finish(state), "finish dpp_sink error");
+    }
+
+    Expr::close(_partition_expr_ctxs, state);
+    for (auto& entry : _rollup_map) {
+        remember_failure(entry.second->close(state), "close rollup_map error");
+    }
+    for (auto* part : _partition_infos) {
+        remember_failure(part->close(state), "close partition_info error");
+    }
+
+    _closed = true;
+    if (failed) {
+        return first_error;
+    }
+    return Status::OK;
+}
+
+}