diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..6c01695fd2 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,16 @@ +# Use whitelist to set text on +# text means convert to LF when check in +# eol=lf means convert to LF when check out +*.cpp text eol=lf +*.cc text eol=lf +*.c text eol=lf +*.h text eol=lf +*.java text eol=lf +*.py text eol=lf +*.js text eol=lf +*.md text eol=lf +*.txt text eol=lf +*.sh text eol=lf +*.thrift text eol=lf +*.proto text eol=lf +*.conf text eol=lf diff --git a/be/src/exec/es_scan_node.cpp b/be/src/exec/es_scan_node.cpp index d70fbfc008..c3e426c695 100644 --- a/be/src/exec/es_scan_node.cpp +++ b/be/src/exec/es_scan_node.cpp @@ -1,877 +1,877 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "es_scan_node.h" - -#include -#include -#include - -#include "gen_cpp/PlanNodes_types.h" -#include "gen_cpp/Exprs_types.h" -#include "runtime/runtime_state.h" -#include "runtime/row_batch.h" -#include "runtime/string_value.h" -#include "runtime/tuple_row.h" -#include "runtime/client_cache.h" -#include "util/runtime_profile.h" -#include "util/debug_util.h" -#include "service/backend_options.h" -#include "olap/olap_common.h" -#include "olap/utils.h" -#include "exprs/expr_context.h" -#include "exprs/expr.h" -#include "exprs/in_predicate.h" -#include "exprs/slot_ref.h" - -namespace doris { - -// $0 = column type (e.g. INT) -const string ERROR_INVALID_COL_DATA = "Data source returned inconsistent column data. " - "Expected value of type $0 based on column metadata. This likely indicates a " - "problem with the data source library."; -const string ERROR_MEM_LIMIT_EXCEEDED = "DataSourceScanNode::$0() failed to allocate " - "$1 bytes for $2."; - -EsScanNode::EsScanNode( - ObjectPool* pool, - const TPlanNode& tnode, - const DescriptorTbl& descs) : - ScanNode(pool, tnode, descs), - _tuple_id(tnode.es_scan_node.tuple_id), - _scan_range_idx(0) { - if (tnode.es_scan_node.__isset.properties) { - _properties = tnode.es_scan_node.properties; - } -} - -EsScanNode::~EsScanNode() { -} - -Status EsScanNode::prepare(RuntimeState* state) { - VLOG(1) << "EsScanNode::Prepare"; - - RETURN_IF_ERROR(ScanNode::prepare(state)); - _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); - if (_tuple_desc == nullptr) { - std::stringstream ss; - ss << "es tuple descriptor is null, _tuple_id=" << _tuple_id; - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); - } - _env = state->exec_env(); - - return Status::OK(); -} - -Status EsScanNode::open(RuntimeState* state) { - VLOG(1) << "EsScanNode::Open"; - - RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); - RETURN_IF_CANCELLED(state); - SCOPED_TIMER(_runtime_profile->total_time_counter()); - RETURN_IF_ERROR(ExecNode::open(state)); - - // TExtOpenParams.row_schema - vector cols; - for (const SlotDescriptor* slot : _tuple_desc->slots()) { - TExtColumnDesc col; - col.__set_name(slot->col_name()); - col.__set_type(slot->type().to_thrift()); - cols.emplace_back(std::move(col)); - } - TExtTableSchema row_schema; - row_schema.cols = std::move(cols); - row_schema.__isset.cols = true; - - // TExtOpenParams.predicates - vector > predicates; - vector predicate_to_conjunct; - for (int i = 0; i < _conjunct_ctxs.size(); ++i) { - VLOG(1) << "conjunct: " << _conjunct_ctxs[i]->root()->debug_string(); - vector disjuncts; - if (get_disjuncts(_conjunct_ctxs[i], _conjunct_ctxs[i]->root(), disjuncts)) { - predicates.emplace_back(std::move(disjuncts)); - predicate_to_conjunct.push_back(i); - } - } - - // open every scan range - vector conjunct_accepted_times(_conjunct_ctxs.size(), 0); - for (int i = 0; i < _scan_ranges.size(); ++i) { - TEsScanRange& es_scan_range = _scan_ranges[i]; - - if (es_scan_range.es_hosts.empty()) { - std::stringstream ss; - ss << "es fail to open: hosts empty"; - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); - } - - - // TExtOpenParams - TExtOpenParams params; - params.__set_query_id(state->query_id()); - _properties["index"] = es_scan_range.index; - if (es_scan_range.__isset.type) { - _properties["type"] = es_scan_range.type; - } - _properties["shard_id"] = std::to_string(es_scan_range.shard_id); - params.__set_properties(_properties); - params.__set_row_schema(row_schema); - params.__set_batch_size(state->batch_size()); - params.__set_predicates(predicates); - TExtOpenResult result; - - // choose an es node, local is the first choice - std::string localhost = BackendOptions::get_localhost(); - bool is_success = false; - for (int j = 0; j < 2; ++j) { - for (auto& es_host : es_scan_range.es_hosts) { - if ((j == 0 && es_host.hostname != localhost) - || (j == 1 && es_host.hostname == localhost)) { - continue; - } - Status status = open_es(es_host, result, params); - if (status.ok()) { - is_success = true; - _addresses.push_back(es_host); - _scan_handles.push_back(result.scan_handle); - if (result.__isset.accepted_conjuncts) { - for (int index : result.accepted_conjuncts) { - conjunct_accepted_times[predicate_to_conjunct[index]]++; - } - } - break; - } else if (status.code() == TStatusCode::ES_SHARD_NOT_FOUND) { - // if shard not found, try other nodes - LOG(WARNING) << "shard not found on es node: " - << ", address=" << es_host - << ", scan_range_idx=" << i << ", try other nodes"; - } else { - LOG(WARNING) << "es open error: scan_range_idx=" << i - << ", address=" << es_host - << ", msg=" << status.get_error_msg(); - return status; - } - } - if (is_success) { - break; - } - } - - if (!is_success) { - std::stringstream ss; - ss << "es open error: scan_range_idx=" << i - << ", can't find shard on any node"; - return Status::InternalError(ss.str()); - } - } - - // remove those conjuncts that accepted by all scan ranges - for (int i = predicate_to_conjunct.size() - 1; i >= 0; i--) { - int conjunct_index = predicate_to_conjunct[i]; - if (conjunct_accepted_times[conjunct_index] == _scan_ranges.size()) { - _pushdown_conjunct_ctxs.push_back(*(_conjunct_ctxs.begin() + conjunct_index)); - _conjunct_ctxs.erase(_conjunct_ctxs.begin() + conjunct_index); - } - } - - for (int i = 0; i < _conjunct_ctxs.size(); ++i) { - if (!check_left_conjuncts(_conjunct_ctxs[i]->root())) { - return Status::InternalError("esquery could only be executed on es, but could not push down to es"); - } - } - - return Status::OK(); -} - -Status EsScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - VLOG(1) << "EsScanNode::GetNext"; - - RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); - RETURN_IF_CANCELLED(state); - SCOPED_TIMER(_runtime_profile->total_time_counter()); - SCOPED_TIMER(materialize_tuple_timer()); - - // create tuple - MemPool* tuple_pool = row_batch->tuple_data_pool(); - int64_t tuple_buffer_size; - uint8_t* tuple_buffer = nullptr; - RETURN_IF_ERROR(row_batch->resize_and_allocate_tuple_buffer(state, &tuple_buffer_size, &tuple_buffer)); - Tuple* tuple = reinterpret_cast(tuple_buffer); - - // get batch - TExtGetNextResult result; - RETURN_IF_ERROR(get_next_from_es(result)); - _offsets[_scan_range_idx] += result.rows.num_rows; - - // convert - VLOG(1) << "begin to convert: scan_range_idx=" << _scan_range_idx - << ", num_rows=" << result.rows.num_rows; - vector& cols = result.rows.cols; - // indexes of the next non-null value in the row batch, per column. - vector cols_next_val_idx(_tuple_desc->slots().size(), 0); - for (int row_idx = 0; row_idx < result.rows.num_rows; row_idx++) { - if (reached_limit()) { - *eos = true; - break; - } - RETURN_IF_ERROR(materialize_row(tuple_pool, tuple, cols, row_idx, cols_next_val_idx)); - TupleRow* tuple_row = row_batch->get_row(row_batch->add_row()); - tuple_row->set_tuple(0, tuple); - if (ExecNode::eval_conjuncts(_conjunct_ctxs.data(), _conjunct_ctxs.size(), tuple_row)) { - row_batch->commit_last_row(); - tuple = reinterpret_cast( - reinterpret_cast(tuple) + _tuple_desc->byte_size()); - ++_num_rows_returned; - } - } - - VLOG(1) << "finish one batch: num_rows=" << row_batch->num_rows(); - COUNTER_SET(_rows_returned_counter, _num_rows_returned); - if (result.__isset.eos && result.eos) { - VLOG(1) << "es finish one scan_range: scan_range_idx=" << _scan_range_idx; - ++_scan_range_idx; - } - if (_scan_range_idx == _scan_ranges.size()) { - *eos = true; - } - - return Status::OK(); -} - -Status EsScanNode::close(RuntimeState* state) { - if (is_closed()) return Status::OK(); - VLOG(1) << "EsScanNode::Close"; - RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); - SCOPED_TIMER(_runtime_profile->total_time_counter()); - Expr::close(_pushdown_conjunct_ctxs, state); - RETURN_IF_ERROR(ExecNode::close(state)); - for (int i = 0; i < _addresses.size(); ++i) { - TExtCloseParams params; - params.__set_scan_handle(_scan_handles[i]); - TExtCloseResult result; - -#ifndef BE_TEST - const TNetworkAddress& address = _addresses[i]; - try { - Status status; - ExtDataSourceServiceClientCache* client_cache = _env->extdatasource_client_cache(); - ExtDataSourceServiceConnection client(client_cache, address, 10000, &status); - if (!status.ok()) { - LOG(WARNING) << "es create client error: scan_range_idx=" << i - << ", address=" << address - << ", msg=" << status.get_error_msg(); - return status; - } - - try { - VLOG(1) << "es close param=" << apache::thrift::ThriftDebugString(params); - client->close(result, params); - } catch (apache::thrift::transport::TTransportException& e) { - LOG(WARNING) << "es close retrying, because: " << e.what(); - RETURN_IF_ERROR(client.reopen()); - client->close(result, params); - } - } catch (apache::thrift::TException &e) { - std::stringstream ss; - ss << "es close error: scan_range_idx=" << i - << ", msg=" << e.what(); - LOG(WARNING) << ss.str(); - return Status::ThriftRpcError(ss.str()); - } - - VLOG(1) << "es close result=" << apache::thrift::ThriftDebugString(result); - Status status(result.status); - if (!status.ok()) { - LOG(WARNING) << "es close error: : scan_range_idx=" << i - << ", msg=" << status.get_error_msg(); - return status; - } -#else - TStatus status; - result.__set_status(status); -#endif - } - - return Status::OK(); -} - -void EsScanNode::debug_string(int indentation_level, stringstream* out) const { - *out << string(indentation_level * 2, ' '); - *out << "EsScanNode(tupleid=" << _tuple_id; - *out << ")" << std::endl; - - for (int i = 0; i < _children.size(); ++i) { - _children[i]->debug_string(indentation_level + 1, out); - } -} - -Status EsScanNode::set_scan_ranges(const vector& scan_ranges) { - for (int i = 0; i < scan_ranges.size(); ++i) { - TScanRangeParams scan_range = scan_ranges[i]; - DCHECK(scan_range.scan_range.__isset.es_scan_range); - TEsScanRange es_scan_range = scan_range.scan_range.es_scan_range; - _scan_ranges.push_back(es_scan_range); - } - - _offsets.resize(scan_ranges.size(), 0); - return Status::OK(); -} - -Status EsScanNode::open_es(TNetworkAddress& address, TExtOpenResult& result, TExtOpenParams& params) { - - VLOG(1) << "es open param=" << apache::thrift::ThriftDebugString(params); -#ifndef BE_TEST - try { - ExtDataSourceServiceClientCache* client_cache = _env->extdatasource_client_cache(); - Status status; - ExtDataSourceServiceConnection client(client_cache, address, 10000, &status); - if (!status.ok()) { - std::stringstream ss; - ss << "es create client error: address=" << address - << ", msg=" << status.get_error_msg(); - return Status::InternalError(ss.str()); - } - - try { - client->open(result, params); - } catch (apache::thrift::transport::TTransportException& e) { - LOG(WARNING) << "es open retrying, because: " << e.what(); - RETURN_IF_ERROR(client.reopen()); - client->open(result, params); - } - VLOG(1) << "es open result=" << apache::thrift::ThriftDebugString(result); - return Status(result.status); - } catch (apache::thrift::TException &e) { - std::stringstream ss; - ss << "es open error: address=" << address << ", msg=" << e.what(); - return Status::InternalError(ss.str()); - } -#else - TStatus status; - result.__set_status(status); - result.__set_scan_handle("0"); - return Status(status); -#endif -} - -// legacy conjuncts must not contain match function -bool EsScanNode::check_left_conjuncts(Expr* conjunct) { - if (is_match_func(conjunct)) { - return false; - } else { - int num_children = conjunct->get_num_children(); - for (int child_idx = 0; child_idx < num_children; ++child_idx) { - if (!check_left_conjuncts(conjunct->get_child(child_idx))) { - return false; - } - } - return true; - } -} - -bool EsScanNode::ignore_cast(SlotDescriptor* slot, Expr* expr) { - if (slot->type().is_date_type() && expr->type().is_date_type()) { - return true; - } - if (slot->type().is_string_type() && expr->type().is_string_type()) { - return true; - } - return false; -} - -bool EsScanNode::get_disjuncts(ExprContext* context, Expr* conjunct, - vector& disjuncts) { - if (TExprNodeType::BINARY_PRED == conjunct->node_type()) { - if (conjunct->children().size() != 2) { - VLOG(1) << "get disjuncts fail: number of childs is not 2"; - return false; - } - SlotRef* slotRef; - TExprOpcode::type op; - Expr* expr; - if (TExprNodeType::SLOT_REF == conjunct->get_child(0)->node_type()) { - expr = conjunct->get_child(1); - slotRef = (SlotRef*)(conjunct->get_child(0)); - op = conjunct->op(); - } else if (TExprNodeType::SLOT_REF == conjunct->get_child(1)->node_type()) { - expr = conjunct->get_child(0); - slotRef = (SlotRef*)(conjunct->get_child(1)); - op = conjunct->op(); - } else { - VLOG(1) << "get disjuncts fail: no SLOT_REF child"; - return false; - } - - SlotDescriptor* slot_desc = get_slot_desc(slotRef); - if (slot_desc == nullptr) { - VLOG(1) << "get disjuncts fail: slot_desc is null"; - return false; - } - - TExtLiteral literal; - if (!to_ext_literal(context, expr, &literal)) { - VLOG(1) << "get disjuncts fail: can't get literal, node_type=" - << expr->node_type(); - return false; - } - - TExtColumnDesc columnDesc; - columnDesc.__set_name(slot_desc->col_name()); - columnDesc.__set_type(slot_desc->type().to_thrift()); - TExtBinaryPredicate binaryPredicate; - binaryPredicate.__set_col(columnDesc); - binaryPredicate.__set_op(op); - binaryPredicate.__set_value(std::move(literal)); - TExtPredicate predicate; - predicate.__set_node_type(TExprNodeType::BINARY_PRED); - predicate.__set_binary_predicate(binaryPredicate); - disjuncts.push_back(std::move(predicate)); - return true; - } else if (is_match_func(conjunct)) { - // if this is a function call expr and function name is match, then push - // down it to es - TExtFunction match_function; - match_function.__set_func_name(conjunct->fn().name.function_name); - vector query_conditions; - - - TExtLiteral literal; - if (!to_ext_literal(context, conjunct->get_child(1), &literal)) { - VLOG(1) << "get disjuncts fail: can't get literal, node_type=" - << conjunct->get_child(1)->node_type(); - return false; - } - - query_conditions.push_back(std::move(literal)); - match_function.__set_values(query_conditions); - TExtPredicate predicate; - predicate.__set_node_type(TExprNodeType::FUNCTION_CALL); - predicate.__set_ext_function(match_function); - disjuncts.push_back(std::move(predicate)); - return true; - } else if (TExprNodeType::IN_PRED == conjunct->node_type()) { - // the op code maybe FILTER_NEW_IN, it means there is function in list - // like col_a in (abs(1)) - if (TExprOpcode::FILTER_IN != conjunct->op() - && TExprOpcode::FILTER_NOT_IN != conjunct->op()) { - return false; - } - TExtInPredicate ext_in_predicate; - vector in_pred_values; - InPredicate* pred = dynamic_cast(conjunct); - ext_in_predicate.__set_is_not_in(pred->is_not_in()); - if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) { - return false; - } - - SlotRef* slot_ref = (SlotRef*)(conjunct->get_child(0)); - SlotDescriptor* slot_desc = get_slot_desc(slot_ref); - if (slot_desc == nullptr) { - return false; - } - TExtColumnDesc columnDesc; - columnDesc.__set_name(slot_desc->col_name()); - columnDesc.__set_type(slot_desc->type().to_thrift()); - ext_in_predicate.__set_col(columnDesc); - - if (pred->get_child(0)->type().type != slot_desc->type().type) { - if (!ignore_cast(slot_desc, pred->get_child(0))) { - return false; - } - } - - HybirdSetBase::IteratorBase* iter = pred->hybird_set()->begin(); - while (iter->has_next()) { - if (nullptr == iter->get_value()) { - return false; - } - TExtLiteral literal; - if (!to_ext_literal(slot_desc->type().type, const_cast(iter->get_value()), &literal)) { - VLOG(1) << "get disjuncts fail: can't get literal, node_type=" - << slot_desc->type().type; - return false; - } - in_pred_values.push_back(literal); - iter->next(); - } - ext_in_predicate.__set_values(in_pred_values); - TExtPredicate predicate; - predicate.__set_node_type(TExprNodeType::IN_PRED); - predicate.__set_in_predicate(ext_in_predicate); - disjuncts.push_back(std::move(predicate)); - return true; - } else if (TExprNodeType::COMPOUND_PRED == conjunct->node_type()) { - if (TExprOpcode::COMPOUND_OR != conjunct->op()) { - VLOG(1) << "get disjuncts fail: op is not COMPOUND_OR"; - return false; - } - if (!get_disjuncts(context, conjunct->get_child(0), disjuncts)) { - return false; - } - if (!get_disjuncts(context, conjunct->get_child(1), disjuncts)) { - return false; - } - return true; - } else { - VLOG(1) << "get disjuncts fail: node type is " << conjunct->node_type() - << ", should be BINARY_PRED or COMPOUND_PRED"; - return false; - } -} - -bool EsScanNode::is_match_func(Expr* conjunct) { - if (TExprNodeType::FUNCTION_CALL == conjunct->node_type() - && conjunct->fn().name.function_name == "esquery") { - return true; - } - return false; -} - -SlotDescriptor* EsScanNode::get_slot_desc(SlotRef* slotRef) { - std::vector slot_ids; - slotRef->get_slot_ids(&slot_ids); - SlotDescriptor* slot_desc = nullptr; - for (SlotDescriptor* slot : _tuple_desc->slots()) { - if (slot->id() == slot_ids[0]) { - slot_desc = slot; - break; - } - } - return slot_desc; -} - -bool EsScanNode::to_ext_literal(ExprContext* context, Expr* expr, TExtLiteral* literal) { - switch (expr->node_type()) { - case TExprNodeType::BOOL_LITERAL: - case TExprNodeType::INT_LITERAL: - case TExprNodeType::LARGE_INT_LITERAL: - case TExprNodeType::FLOAT_LITERAL: - case TExprNodeType::DECIMAL_LITERAL: - case TExprNodeType::STRING_LITERAL: - case TExprNodeType::DATE_LITERAL: - return to_ext_literal(expr->type().type, context->get_value(expr, NULL), literal); - default: - return false; - } -} - -bool EsScanNode::to_ext_literal(PrimitiveType slot_type, void* value, TExtLiteral* literal) { - TExprNodeType::type node_type; - switch (slot_type) { - case TYPE_BOOLEAN: { - node_type = (TExprNodeType::BOOL_LITERAL); - TBoolLiteral bool_literal; - bool_literal.__set_value(*reinterpret_cast(value)); - literal->__set_bool_literal(bool_literal); - break; - } - - case TYPE_TINYINT: { - node_type = (TExprNodeType::INT_LITERAL); - TIntLiteral int_literal; - int_literal.__set_value(*reinterpret_cast(value)); - literal->__set_int_literal(int_literal); - break; - } - case TYPE_SMALLINT: { - node_type = (TExprNodeType::INT_LITERAL); - TIntLiteral int_literal; - int_literal.__set_value(*reinterpret_cast(value)); - literal->__set_int_literal(int_literal); - break; - } - case TYPE_INT: { - node_type = (TExprNodeType::INT_LITERAL); - TIntLiteral int_literal; - int_literal.__set_value(*reinterpret_cast(value)); - literal->__set_int_literal(int_literal); - break; - } - case TYPE_BIGINT: { - node_type = (TExprNodeType::INT_LITERAL); - TIntLiteral int_literal; - int_literal.__set_value(*reinterpret_cast(value)); - literal->__set_int_literal(int_literal); - break; - } - - case TYPE_LARGEINT: { - node_type = (TExprNodeType::LARGE_INT_LITERAL); - char buf[48]; - int len = 48; - char* v = LargeIntValue::to_string(*reinterpret_cast<__int128*>(value), buf, &len); - TLargeIntLiteral large_int_literal; - large_int_literal.__set_value(v); - literal->__set_large_int_literal(large_int_literal); - break; - } - - case TYPE_FLOAT: { - node_type = (TExprNodeType::FLOAT_LITERAL); - TFloatLiteral float_literal; - float_literal.__set_value(*reinterpret_cast(value)); - literal->__set_float_literal(float_literal); - break; - } - case TYPE_DOUBLE: { - node_type = (TExprNodeType::FLOAT_LITERAL); - TFloatLiteral float_literal; - float_literal.__set_value(*reinterpret_cast(value)); - literal->__set_float_literal(float_literal); - break; - } - - case TYPE_DECIMAL: { - node_type = (TExprNodeType::DECIMAL_LITERAL); - TDecimalLiteral decimal_literal; - decimal_literal.__set_value(reinterpret_cast(value)->to_string()); - literal->__set_decimal_literal(decimal_literal); - break; - } - - case TYPE_DATE: - case TYPE_DATETIME: { - node_type = (TExprNodeType::DATE_LITERAL); - const DateTimeValue date_value = *reinterpret_cast(value); - char str[MAX_DTVALUE_STR_LEN]; - date_value.to_string(str); - TDateLiteral date_literal; - date_literal.__set_value(str); - literal->__set_date_literal(date_literal); - break; - } - - case TYPE_CHAR: - case TYPE_VARCHAR: { - node_type = (TExprNodeType::STRING_LITERAL); - TStringLiteral string_literal; - string_literal.__set_value((reinterpret_cast(value))->debug_string()); - literal->__set_string_literal(string_literal); - break; - } - - default: { - DCHECK(false) << "Invalid type."; - return false; - } - } - literal->__set_node_type(node_type); - return true; -} - -Status EsScanNode::get_next_from_es(TExtGetNextResult& result) { - TExtGetNextParams params; - params.__set_scan_handle(_scan_handles[_scan_range_idx]); - params.__set_offset(_offsets[_scan_range_idx]); - - // getNext - const TNetworkAddress &address = _addresses[_scan_range_idx]; -#ifndef BE_TEST - try { - Status create_client_status; - ExtDataSourceServiceClientCache *client_cache = _env->extdatasource_client_cache(); - ExtDataSourceServiceConnection client(client_cache, address, 10000, &create_client_status); - if (!create_client_status.ok()) { - LOG(WARNING) << "es create client error: scan_range_idx=" << _scan_range_idx - << ", address=" << address - << ", msg=" << create_client_status.get_error_msg(); - return create_client_status; - } - - try { - VLOG(1) << "es get_next param=" << apache::thrift::ThriftDebugString(params); - client->getNext(result, params); - } catch (apache::thrift::transport::TTransportException& e) { - std::stringstream ss; - ss << "es get_next error: scan_range_idx=" << _scan_range_idx - << ", msg=" << e.what(); - LOG(WARNING) << ss.str(); - RETURN_IF_ERROR(client.reopen()); - return Status::ThriftRpcError(ss.str()); - } - } catch (apache::thrift::TException &e) { - std::stringstream ss; - ss << "es get_next error: scan_range_idx=" << _scan_range_idx - << ", msg=" << e.what(); - LOG(WARNING) << ss.str(); - return Status::ThriftRpcError(ss.str()); - } -#else - TStatus status; - result.__set_status(status); - result.__set_eos(true); - TExtColumnData col_data; - std::vector is_null; - is_null.push_back(false); - col_data.__set_is_null(is_null); - std::vector int_vals; - int_vals.push_back(1); - int_vals.push_back(2); - col_data.__set_int_vals(int_vals); - std::vector cols; - cols.push_back(col_data); - TExtRowBatch rows; - rows.__set_cols(cols); - rows.__set_num_rows(2); - result.__set_rows(rows); - return Status(status); -#endif - - // check result - VLOG(1) << "es get_next result=" << apache::thrift::ThriftDebugString(result); - Status get_next_status(result.status); - if (!get_next_status.ok()) { - LOG(WARNING) << "es get_next error: scan_range_idx=" << _scan_range_idx - << ", address=" << address - << ", msg=" << get_next_status.get_error_msg(); - return get_next_status; - } - if (!result.__isset.rows || !result.rows.__isset.num_rows) { - std::stringstream ss; - ss << "es get_next error: scan_range_idx=" << _scan_range_idx - << ", msg=rows or num_rows not in result"; - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); - } - - return Status::OK(); -} - -Status EsScanNode::materialize_row(MemPool* tuple_pool, Tuple* tuple, - const vector& cols, int row_idx, - vector& cols_next_val_idx) { - tuple->init(_tuple_desc->byte_size()); - - for (int i = 0; i < _tuple_desc->slots().size(); ++i) { - const SlotDescriptor* slot_desc = _tuple_desc->slots()[i]; - - if (!slot_desc->is_materialized()) { - continue; - } - - void* slot = tuple->get_slot(slot_desc->tuple_offset()); - const TExtColumnData& col = cols[i]; - - if (col.is_null[row_idx]) { - tuple->set_null(slot_desc->null_indicator_offset()); - continue; - } else { - tuple->set_not_null(slot_desc->null_indicator_offset()); - } - - int val_idx = cols_next_val_idx[i]++; - switch (slot_desc->type().type) { - case TYPE_CHAR: - case TYPE_VARCHAR: { - if (val_idx >= col.string_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "STRING")); - } - const string& val = col.string_vals[val_idx]; - size_t val_size = val.size(); - char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size)); - if (UNLIKELY(buffer == NULL)) { - string details = strings::Substitute(ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", - val_size, "string slot"); - return tuple_pool->mem_tracker()->MemLimitExceeded(NULL, details, val_size); - } - memcpy(buffer, val.data(), val_size); - reinterpret_cast(slot)->ptr = buffer; - reinterpret_cast(slot)->len = val_size; - break; - } - case TYPE_TINYINT: - if (val_idx >= col.byte_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TINYINT")); - } - *reinterpret_cast(slot) = col.byte_vals[val_idx]; - break; - case TYPE_SMALLINT: - if (val_idx >= col.short_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "SMALLINT")); - } - *reinterpret_cast(slot) = col.short_vals[val_idx]; - break; - case TYPE_INT: - if (val_idx >= col.int_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "INT")); - } - *reinterpret_cast(slot) = col.int_vals[val_idx]; - break; - case TYPE_BIGINT: - if (val_idx >= col.long_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "BIGINT")); - } - *reinterpret_cast(slot) = col.long_vals[val_idx]; - break; - case TYPE_LARGEINT: - if (val_idx >= col.long_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "LARGEINT")); - } - *reinterpret_cast(slot) = col.long_vals[val_idx]; - break; - case TYPE_DOUBLE: - if (val_idx >= col.double_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "DOUBLE")); - } - *reinterpret_cast(slot) = col.double_vals[val_idx]; - break; - case TYPE_FLOAT: - if (val_idx >= col.double_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "FLOAT")); - } - *reinterpret_cast(slot) = col.double_vals[val_idx]; - break; - case TYPE_BOOLEAN: - if (val_idx >= col.bool_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "BOOLEAN")); - } - *reinterpret_cast(slot) = col.bool_vals[val_idx]; - break; - case TYPE_DATE: - if (val_idx >= col.long_vals.size() || - !reinterpret_cast(slot)->from_unixtime(col.long_vals[val_idx], "+08:00")) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TYPE_DATE")); - } - reinterpret_cast(slot)->cast_to_date(); - break; - case TYPE_DATETIME: { - if (val_idx >= col.long_vals.size() || - !reinterpret_cast(slot)->from_unixtime(col.long_vals[val_idx], "+08:00")) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TYPE_DATETIME")); - } - reinterpret_cast(slot)->set_type(TIME_DATETIME); - break; - } - case TYPE_DECIMAL: { - if (val_idx >= col.binary_vals.size()) { - return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "DECIMAL")); - } - const string& val = col.binary_vals[val_idx]; - *reinterpret_cast(slot) = *reinterpret_cast(&val); - break; - } - default: - DCHECK(false); - } - } - return Status::OK(); -} - -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "es_scan_node.h" + +#include +#include +#include + +#include "gen_cpp/PlanNodes_types.h" +#include "gen_cpp/Exprs_types.h" +#include "runtime/runtime_state.h" +#include "runtime/row_batch.h" +#include "runtime/string_value.h" +#include "runtime/tuple_row.h" +#include "runtime/client_cache.h" +#include "util/runtime_profile.h" +#include "util/debug_util.h" +#include "service/backend_options.h" +#include "olap/olap_common.h" +#include "olap/utils.h" +#include "exprs/expr_context.h" +#include "exprs/expr.h" +#include "exprs/in_predicate.h" +#include "exprs/slot_ref.h" + +namespace doris { + +// $0 = column type (e.g. INT) +const string ERROR_INVALID_COL_DATA = "Data source returned inconsistent column data. " + "Expected value of type $0 based on column metadata. This likely indicates a " + "problem with the data source library."; +const string ERROR_MEM_LIMIT_EXCEEDED = "DataSourceScanNode::$0() failed to allocate " + "$1 bytes for $2."; + +EsScanNode::EsScanNode( + ObjectPool* pool, + const TPlanNode& tnode, + const DescriptorTbl& descs) : + ScanNode(pool, tnode, descs), + _tuple_id(tnode.es_scan_node.tuple_id), + _scan_range_idx(0) { + if (tnode.es_scan_node.__isset.properties) { + _properties = tnode.es_scan_node.properties; + } +} + +EsScanNode::~EsScanNode() { +} + +Status EsScanNode::prepare(RuntimeState* state) { + VLOG(1) << "EsScanNode::Prepare"; + + RETURN_IF_ERROR(ScanNode::prepare(state)); + _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); + if (_tuple_desc == nullptr) { + std::stringstream ss; + ss << "es tuple descriptor is null, _tuple_id=" << _tuple_id; + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + _env = state->exec_env(); + + return Status::OK(); +} + +Status EsScanNode::open(RuntimeState* state) { + VLOG(1) << "EsScanNode::Open"; + + RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); + RETURN_IF_CANCELLED(state); + SCOPED_TIMER(_runtime_profile->total_time_counter()); + RETURN_IF_ERROR(ExecNode::open(state)); + + // TExtOpenParams.row_schema + vector cols; + for (const SlotDescriptor* slot : _tuple_desc->slots()) { + TExtColumnDesc col; + col.__set_name(slot->col_name()); + col.__set_type(slot->type().to_thrift()); + cols.emplace_back(std::move(col)); + } + TExtTableSchema row_schema; + row_schema.cols = std::move(cols); + row_schema.__isset.cols = true; + + // TExtOpenParams.predicates + vector > predicates; + vector predicate_to_conjunct; + for (int i = 0; i < _conjunct_ctxs.size(); ++i) { + VLOG(1) << "conjunct: " << _conjunct_ctxs[i]->root()->debug_string(); + vector disjuncts; + if (get_disjuncts(_conjunct_ctxs[i], _conjunct_ctxs[i]->root(), disjuncts)) { + predicates.emplace_back(std::move(disjuncts)); + predicate_to_conjunct.push_back(i); + } + } + + // open every scan range + vector conjunct_accepted_times(_conjunct_ctxs.size(), 0); + for (int i = 0; i < _scan_ranges.size(); ++i) { + TEsScanRange& es_scan_range = _scan_ranges[i]; + + if (es_scan_range.es_hosts.empty()) { + std::stringstream ss; + ss << "es fail to open: hosts empty"; + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + + + // TExtOpenParams + TExtOpenParams params; + params.__set_query_id(state->query_id()); + _properties["index"] = es_scan_range.index; + if (es_scan_range.__isset.type) { + _properties["type"] = es_scan_range.type; + } + _properties["shard_id"] = std::to_string(es_scan_range.shard_id); + params.__set_properties(_properties); + params.__set_row_schema(row_schema); + params.__set_batch_size(state->batch_size()); + params.__set_predicates(predicates); + TExtOpenResult result; + + // choose an es node, local is the first choice + std::string localhost = BackendOptions::get_localhost(); + bool is_success = false; + for (int j = 0; j < 2; ++j) { + for (auto& es_host : es_scan_range.es_hosts) { + if ((j == 0 && es_host.hostname != localhost) + || (j == 1 && es_host.hostname == localhost)) { + continue; + } + Status status = open_es(es_host, result, params); + if (status.ok()) { + is_success = true; + _addresses.push_back(es_host); + _scan_handles.push_back(result.scan_handle); + if (result.__isset.accepted_conjuncts) { + for (int index : result.accepted_conjuncts) { + conjunct_accepted_times[predicate_to_conjunct[index]]++; + } + } + break; + } else if (status.code() == TStatusCode::ES_SHARD_NOT_FOUND) { + // if shard not found, try other nodes + LOG(WARNING) << "shard not found on es node: " + << ", address=" << es_host + << ", scan_range_idx=" << i << ", try other nodes"; + } else { + LOG(WARNING) << "es open error: scan_range_idx=" << i + << ", address=" << es_host + << ", msg=" << status.get_error_msg(); + return status; + } + } + if (is_success) { + break; + } + } + + if (!is_success) { + std::stringstream ss; + ss << "es open error: scan_range_idx=" << i + << ", can't find shard on any node"; + return Status::InternalError(ss.str()); + } + } + + // remove those conjuncts that accepted by all scan ranges + for (int i = predicate_to_conjunct.size() - 1; i >= 0; i--) { + int conjunct_index = predicate_to_conjunct[i]; + if (conjunct_accepted_times[conjunct_index] == _scan_ranges.size()) { + _pushdown_conjunct_ctxs.push_back(*(_conjunct_ctxs.begin() + conjunct_index)); + _conjunct_ctxs.erase(_conjunct_ctxs.begin() + conjunct_index); + } + } + + for (int i = 0; i < _conjunct_ctxs.size(); ++i) { + if (!check_left_conjuncts(_conjunct_ctxs[i]->root())) { + return Status::InternalError("esquery could only be executed on es, but could not push down to es"); + } + } + + return Status::OK(); +} + +Status EsScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { + VLOG(1) << "EsScanNode::GetNext"; + + RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); + RETURN_IF_CANCELLED(state); + SCOPED_TIMER(_runtime_profile->total_time_counter()); + SCOPED_TIMER(materialize_tuple_timer()); + + // create tuple + MemPool* tuple_pool = row_batch->tuple_data_pool(); + int64_t tuple_buffer_size; + uint8_t* tuple_buffer = nullptr; + RETURN_IF_ERROR(row_batch->resize_and_allocate_tuple_buffer(state, &tuple_buffer_size, &tuple_buffer)); + Tuple* tuple = reinterpret_cast(tuple_buffer); + + // get batch + TExtGetNextResult result; + RETURN_IF_ERROR(get_next_from_es(result)); + _offsets[_scan_range_idx] += result.rows.num_rows; + + // convert + VLOG(1) << "begin to convert: scan_range_idx=" << _scan_range_idx + << ", num_rows=" << result.rows.num_rows; + vector& cols = result.rows.cols; + // indexes of the next non-null value in the row batch, per column. + vector cols_next_val_idx(_tuple_desc->slots().size(), 0); + for (int row_idx = 0; row_idx < result.rows.num_rows; row_idx++) { + if (reached_limit()) { + *eos = true; + break; + } + RETURN_IF_ERROR(materialize_row(tuple_pool, tuple, cols, row_idx, cols_next_val_idx)); + TupleRow* tuple_row = row_batch->get_row(row_batch->add_row()); + tuple_row->set_tuple(0, tuple); + if (ExecNode::eval_conjuncts(_conjunct_ctxs.data(), _conjunct_ctxs.size(), tuple_row)) { + row_batch->commit_last_row(); + tuple = reinterpret_cast( + reinterpret_cast(tuple) + _tuple_desc->byte_size()); + ++_num_rows_returned; + } + } + + VLOG(1) << "finish one batch: num_rows=" << row_batch->num_rows(); + COUNTER_SET(_rows_returned_counter, _num_rows_returned); + if (result.__isset.eos && result.eos) { + VLOG(1) << "es finish one scan_range: scan_range_idx=" << _scan_range_idx; + ++_scan_range_idx; + } + if (_scan_range_idx == _scan_ranges.size()) { + *eos = true; + } + + return Status::OK(); +} + +Status EsScanNode::close(RuntimeState* state) { + if (is_closed()) return Status::OK(); + VLOG(1) << "EsScanNode::Close"; + RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); + SCOPED_TIMER(_runtime_profile->total_time_counter()); + Expr::close(_pushdown_conjunct_ctxs, state); + RETURN_IF_ERROR(ExecNode::close(state)); + for (int i = 0; i < _addresses.size(); ++i) { + TExtCloseParams params; + params.__set_scan_handle(_scan_handles[i]); + TExtCloseResult result; + +#ifndef BE_TEST + const TNetworkAddress& address = _addresses[i]; + try { + Status status; + ExtDataSourceServiceClientCache* client_cache = _env->extdatasource_client_cache(); + ExtDataSourceServiceConnection client(client_cache, address, 10000, &status); + if (!status.ok()) { + LOG(WARNING) << "es create client error: scan_range_idx=" << i + << ", address=" << address + << ", msg=" << status.get_error_msg(); + return status; + } + + try { + VLOG(1) << "es close param=" << apache::thrift::ThriftDebugString(params); + client->close(result, params); + } catch (apache::thrift::transport::TTransportException& e) { + LOG(WARNING) << "es close retrying, because: " << e.what(); + RETURN_IF_ERROR(client.reopen()); + client->close(result, params); + } + } catch (apache::thrift::TException &e) { + std::stringstream ss; + ss << "es close error: scan_range_idx=" << i + << ", msg=" << e.what(); + LOG(WARNING) << ss.str(); + return Status::ThriftRpcError(ss.str()); + } + + VLOG(1) << "es close result=" << apache::thrift::ThriftDebugString(result); + Status status(result.status); + if (!status.ok()) { + LOG(WARNING) << "es close error: : scan_range_idx=" << i + << ", msg=" << status.get_error_msg(); + return status; + } +#else + TStatus status; + result.__set_status(status); +#endif + } + + return Status::OK(); +} + +void EsScanNode::debug_string(int indentation_level, stringstream* out) const { + *out << string(indentation_level * 2, ' '); + *out << "EsScanNode(tupleid=" << _tuple_id; + *out << ")" << std::endl; + + for (int i = 0; i < _children.size(); ++i) { + _children[i]->debug_string(indentation_level + 1, out); + } +} + +Status EsScanNode::set_scan_ranges(const vector& scan_ranges) { + for (int i = 0; i < scan_ranges.size(); ++i) { + TScanRangeParams scan_range = scan_ranges[i]; + DCHECK(scan_range.scan_range.__isset.es_scan_range); + TEsScanRange es_scan_range = scan_range.scan_range.es_scan_range; + _scan_ranges.push_back(es_scan_range); + } + + _offsets.resize(scan_ranges.size(), 0); + return Status::OK(); +} + +Status EsScanNode::open_es(TNetworkAddress& address, TExtOpenResult& result, TExtOpenParams& params) { + + VLOG(1) << "es open param=" << apache::thrift::ThriftDebugString(params); +#ifndef BE_TEST + try { + ExtDataSourceServiceClientCache* client_cache = _env->extdatasource_client_cache(); + Status status; + ExtDataSourceServiceConnection client(client_cache, address, 10000, &status); + if (!status.ok()) { + std::stringstream ss; + ss << "es create client error: address=" << address + << ", msg=" << status.get_error_msg(); + return Status::InternalError(ss.str()); + } + + try { + client->open(result, params); + } catch (apache::thrift::transport::TTransportException& e) { + LOG(WARNING) << "es open retrying, because: " << e.what(); + RETURN_IF_ERROR(client.reopen()); + client->open(result, params); + } + VLOG(1) << "es open result=" << apache::thrift::ThriftDebugString(result); + return Status(result.status); + } catch (apache::thrift::TException &e) { + std::stringstream ss; + ss << "es open error: address=" << address << ", msg=" << e.what(); + return Status::InternalError(ss.str()); + } +#else + TStatus status; + result.__set_status(status); + result.__set_scan_handle("0"); + return Status(status); +#endif +} + +// legacy conjuncts must not contain match function +bool EsScanNode::check_left_conjuncts(Expr* conjunct) { + if (is_match_func(conjunct)) { + return false; + } else { + int num_children = conjunct->get_num_children(); + for (int child_idx = 0; child_idx < num_children; ++child_idx) { + if (!check_left_conjuncts(conjunct->get_child(child_idx))) { + return false; + } + } + return true; + } +} + +bool EsScanNode::ignore_cast(SlotDescriptor* slot, Expr* expr) { + if (slot->type().is_date_type() && expr->type().is_date_type()) { + return true; + } + if (slot->type().is_string_type() && expr->type().is_string_type()) { + return true; + } + return false; +} + +bool EsScanNode::get_disjuncts(ExprContext* context, Expr* conjunct, + vector& disjuncts) { + if (TExprNodeType::BINARY_PRED == conjunct->node_type()) { + if (conjunct->children().size() != 2) { + VLOG(1) << "get disjuncts fail: number of childs is not 2"; + return false; + } + SlotRef* slotRef; + TExprOpcode::type op; + Expr* expr; + if (TExprNodeType::SLOT_REF == conjunct->get_child(0)->node_type()) { + expr = conjunct->get_child(1); + slotRef = (SlotRef*)(conjunct->get_child(0)); + op = conjunct->op(); + } else if (TExprNodeType::SLOT_REF == conjunct->get_child(1)->node_type()) { + expr = conjunct->get_child(0); + slotRef = (SlotRef*)(conjunct->get_child(1)); + op = conjunct->op(); + } else { + VLOG(1) << "get disjuncts fail: no SLOT_REF child"; + return false; + } + + SlotDescriptor* slot_desc = get_slot_desc(slotRef); + if (slot_desc == nullptr) { + VLOG(1) << "get disjuncts fail: slot_desc is null"; + return false; + } + + TExtLiteral literal; + if (!to_ext_literal(context, expr, &literal)) { + VLOG(1) << "get disjuncts fail: can't get literal, node_type=" + << expr->node_type(); + return false; + } + + TExtColumnDesc columnDesc; + columnDesc.__set_name(slot_desc->col_name()); + columnDesc.__set_type(slot_desc->type().to_thrift()); + TExtBinaryPredicate binaryPredicate; + binaryPredicate.__set_col(columnDesc); + binaryPredicate.__set_op(op); + binaryPredicate.__set_value(std::move(literal)); + TExtPredicate predicate; + predicate.__set_node_type(TExprNodeType::BINARY_PRED); + predicate.__set_binary_predicate(binaryPredicate); + disjuncts.push_back(std::move(predicate)); + return true; + } else if (is_match_func(conjunct)) { + // if this is a function call expr and function name is match, then push + // down it to es + TExtFunction match_function; + match_function.__set_func_name(conjunct->fn().name.function_name); + vector query_conditions; + + + TExtLiteral literal; + if (!to_ext_literal(context, conjunct->get_child(1), &literal)) { + VLOG(1) << "get disjuncts fail: can't get literal, node_type=" + << conjunct->get_child(1)->node_type(); + return false; + } + + query_conditions.push_back(std::move(literal)); + match_function.__set_values(query_conditions); + TExtPredicate predicate; + predicate.__set_node_type(TExprNodeType::FUNCTION_CALL); + predicate.__set_ext_function(match_function); + disjuncts.push_back(std::move(predicate)); + return true; + } else if (TExprNodeType::IN_PRED == conjunct->node_type()) { + // the op code maybe FILTER_NEW_IN, it means there is function in list + // like col_a in (abs(1)) + if (TExprOpcode::FILTER_IN != conjunct->op() + && TExprOpcode::FILTER_NOT_IN != conjunct->op()) { + return false; + } + TExtInPredicate ext_in_predicate; + vector in_pred_values; + InPredicate* pred = dynamic_cast(conjunct); + ext_in_predicate.__set_is_not_in(pred->is_not_in()); + if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) { + return false; + } + + SlotRef* slot_ref = (SlotRef*)(conjunct->get_child(0)); + SlotDescriptor* slot_desc = get_slot_desc(slot_ref); + if (slot_desc == nullptr) { + return false; + } + TExtColumnDesc columnDesc; + columnDesc.__set_name(slot_desc->col_name()); + columnDesc.__set_type(slot_desc->type().to_thrift()); + ext_in_predicate.__set_col(columnDesc); + + if (pred->get_child(0)->type().type != slot_desc->type().type) { + if (!ignore_cast(slot_desc, pred->get_child(0))) { + return false; + } + } + + HybirdSetBase::IteratorBase* iter = pred->hybird_set()->begin(); + while (iter->has_next()) { + if (nullptr == iter->get_value()) { + return false; + } + TExtLiteral literal; + if (!to_ext_literal(slot_desc->type().type, const_cast(iter->get_value()), &literal)) { + VLOG(1) << "get disjuncts fail: can't get literal, node_type=" + << slot_desc->type().type; + return false; + } + in_pred_values.push_back(literal); + iter->next(); + } + ext_in_predicate.__set_values(in_pred_values); + TExtPredicate predicate; + predicate.__set_node_type(TExprNodeType::IN_PRED); + predicate.__set_in_predicate(ext_in_predicate); + disjuncts.push_back(std::move(predicate)); + return true; + } else if (TExprNodeType::COMPOUND_PRED == conjunct->node_type()) { + if (TExprOpcode::COMPOUND_OR != conjunct->op()) { + VLOG(1) << "get disjuncts fail: op is not COMPOUND_OR"; + return false; + } + if (!get_disjuncts(context, conjunct->get_child(0), disjuncts)) { + return false; + } + if (!get_disjuncts(context, conjunct->get_child(1), disjuncts)) { + return false; + } + return true; + } else { + VLOG(1) << "get disjuncts fail: node type is " << conjunct->node_type() + << ", should be BINARY_PRED or COMPOUND_PRED"; + return false; + } +} + +bool EsScanNode::is_match_func(Expr* conjunct) { + if (TExprNodeType::FUNCTION_CALL == conjunct->node_type() + && conjunct->fn().name.function_name == "esquery") { + return true; + } + return false; +} + +SlotDescriptor* EsScanNode::get_slot_desc(SlotRef* slotRef) { + std::vector slot_ids; + slotRef->get_slot_ids(&slot_ids); + SlotDescriptor* slot_desc = nullptr; + for (SlotDescriptor* slot : _tuple_desc->slots()) { + if (slot->id() == slot_ids[0]) { + slot_desc = slot; + break; + } + } + return slot_desc; +} + +bool EsScanNode::to_ext_literal(ExprContext* context, Expr* expr, TExtLiteral* literal) { + switch (expr->node_type()) { + case TExprNodeType::BOOL_LITERAL: + case TExprNodeType::INT_LITERAL: + case TExprNodeType::LARGE_INT_LITERAL: + case TExprNodeType::FLOAT_LITERAL: + case TExprNodeType::DECIMAL_LITERAL: + case TExprNodeType::STRING_LITERAL: + case TExprNodeType::DATE_LITERAL: + return to_ext_literal(expr->type().type, context->get_value(expr, NULL), literal); + default: + return false; + } +} + +bool EsScanNode::to_ext_literal(PrimitiveType slot_type, void* value, TExtLiteral* literal) { + TExprNodeType::type node_type; + switch (slot_type) { + case TYPE_BOOLEAN: { + node_type = (TExprNodeType::BOOL_LITERAL); + TBoolLiteral bool_literal; + bool_literal.__set_value(*reinterpret_cast(value)); + literal->__set_bool_literal(bool_literal); + break; + } + + case TYPE_TINYINT: { + node_type = (TExprNodeType::INT_LITERAL); + TIntLiteral int_literal; + int_literal.__set_value(*reinterpret_cast(value)); + literal->__set_int_literal(int_literal); + break; + } + case TYPE_SMALLINT: { + node_type = (TExprNodeType::INT_LITERAL); + TIntLiteral int_literal; + int_literal.__set_value(*reinterpret_cast(value)); + literal->__set_int_literal(int_literal); + break; + } + case TYPE_INT: { + node_type = (TExprNodeType::INT_LITERAL); + TIntLiteral int_literal; + int_literal.__set_value(*reinterpret_cast(value)); + literal->__set_int_literal(int_literal); + break; + } + case TYPE_BIGINT: { + node_type = (TExprNodeType::INT_LITERAL); + TIntLiteral int_literal; + int_literal.__set_value(*reinterpret_cast(value)); + literal->__set_int_literal(int_literal); + break; + } + + case TYPE_LARGEINT: { + node_type = (TExprNodeType::LARGE_INT_LITERAL); + char buf[48]; + int len = 48; + char* v = LargeIntValue::to_string(*reinterpret_cast<__int128*>(value), buf, &len); + TLargeIntLiteral large_int_literal; + large_int_literal.__set_value(v); + literal->__set_large_int_literal(large_int_literal); + break; + } + + case TYPE_FLOAT: { + node_type = (TExprNodeType::FLOAT_LITERAL); + TFloatLiteral float_literal; + float_literal.__set_value(*reinterpret_cast(value)); + literal->__set_float_literal(float_literal); + break; + } + case TYPE_DOUBLE: { + node_type = (TExprNodeType::FLOAT_LITERAL); + TFloatLiteral float_literal; + float_literal.__set_value(*reinterpret_cast(value)); + literal->__set_float_literal(float_literal); + break; + } + + case TYPE_DECIMAL: { + node_type = (TExprNodeType::DECIMAL_LITERAL); + TDecimalLiteral decimal_literal; + decimal_literal.__set_value(reinterpret_cast(value)->to_string()); + literal->__set_decimal_literal(decimal_literal); + break; + } + + case TYPE_DATE: + case TYPE_DATETIME: { + node_type = (TExprNodeType::DATE_LITERAL); + const DateTimeValue date_value = *reinterpret_cast(value); + char str[MAX_DTVALUE_STR_LEN]; + date_value.to_string(str); + TDateLiteral date_literal; + date_literal.__set_value(str); + literal->__set_date_literal(date_literal); + break; + } + + case TYPE_CHAR: + case TYPE_VARCHAR: { + node_type = (TExprNodeType::STRING_LITERAL); + TStringLiteral string_literal; + string_literal.__set_value((reinterpret_cast(value))->debug_string()); + literal->__set_string_literal(string_literal); + break; + } + + default: { + DCHECK(false) << "Invalid type."; + return false; + } + } + literal->__set_node_type(node_type); + return true; +} + +Status EsScanNode::get_next_from_es(TExtGetNextResult& result) { + TExtGetNextParams params; + params.__set_scan_handle(_scan_handles[_scan_range_idx]); + params.__set_offset(_offsets[_scan_range_idx]); + + // getNext + const TNetworkAddress &address = _addresses[_scan_range_idx]; +#ifndef BE_TEST + try { + Status create_client_status; + ExtDataSourceServiceClientCache *client_cache = _env->extdatasource_client_cache(); + ExtDataSourceServiceConnection client(client_cache, address, 10000, &create_client_status); + if (!create_client_status.ok()) { + LOG(WARNING) << "es create client error: scan_range_idx=" << _scan_range_idx + << ", address=" << address + << ", msg=" << create_client_status.get_error_msg(); + return create_client_status; + } + + try { + VLOG(1) << "es get_next param=" << apache::thrift::ThriftDebugString(params); + client->getNext(result, params); + } catch (apache::thrift::transport::TTransportException& e) { + std::stringstream ss; + ss << "es get_next error: scan_range_idx=" << _scan_range_idx + << ", msg=" << e.what(); + LOG(WARNING) << ss.str(); + RETURN_IF_ERROR(client.reopen()); + return Status::ThriftRpcError(ss.str()); + } + } catch (apache::thrift::TException &e) { + std::stringstream ss; + ss << "es get_next error: scan_range_idx=" << _scan_range_idx + << ", msg=" << e.what(); + LOG(WARNING) << ss.str(); + return Status::ThriftRpcError(ss.str()); + } +#else + TStatus status; + result.__set_status(status); + result.__set_eos(true); + TExtColumnData col_data; + std::vector is_null; + is_null.push_back(false); + col_data.__set_is_null(is_null); + std::vector int_vals; + int_vals.push_back(1); + int_vals.push_back(2); + col_data.__set_int_vals(int_vals); + std::vector cols; + cols.push_back(col_data); + TExtRowBatch rows; + rows.__set_cols(cols); + rows.__set_num_rows(2); + result.__set_rows(rows); + return Status(status); +#endif + + // check result + VLOG(1) << "es get_next result=" << apache::thrift::ThriftDebugString(result); + Status get_next_status(result.status); + if (!get_next_status.ok()) { + LOG(WARNING) << "es get_next error: scan_range_idx=" << _scan_range_idx + << ", address=" << address + << ", msg=" << get_next_status.get_error_msg(); + return get_next_status; + } + if (!result.__isset.rows || !result.rows.__isset.num_rows) { + std::stringstream ss; + ss << "es get_next error: scan_range_idx=" << _scan_range_idx + << ", msg=rows or num_rows not in result"; + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + + return Status::OK(); +} + +Status EsScanNode::materialize_row(MemPool* tuple_pool, Tuple* tuple, + const vector& cols, int row_idx, + vector& cols_next_val_idx) { + tuple->init(_tuple_desc->byte_size()); + + for (int i = 0; i < _tuple_desc->slots().size(); ++i) { + const SlotDescriptor* slot_desc = _tuple_desc->slots()[i]; + + if (!slot_desc->is_materialized()) { + continue; + } + + void* slot = tuple->get_slot(slot_desc->tuple_offset()); + const TExtColumnData& col = cols[i]; + + if (col.is_null[row_idx]) { + tuple->set_null(slot_desc->null_indicator_offset()); + continue; + } else { + tuple->set_not_null(slot_desc->null_indicator_offset()); + } + + int val_idx = cols_next_val_idx[i]++; + switch (slot_desc->type().type) { + case TYPE_CHAR: + case TYPE_VARCHAR: { + if (val_idx >= col.string_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "STRING")); + } + const string& val = col.string_vals[val_idx]; + size_t val_size = val.size(); + char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size)); + if (UNLIKELY(buffer == NULL)) { + string details = strings::Substitute(ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", + val_size, "string slot"); + return tuple_pool->mem_tracker()->MemLimitExceeded(NULL, details, val_size); + } + memcpy(buffer, val.data(), val_size); + reinterpret_cast(slot)->ptr = buffer; + reinterpret_cast(slot)->len = val_size; + break; + } + case TYPE_TINYINT: + if (val_idx >= col.byte_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TINYINT")); + } + *reinterpret_cast(slot) = col.byte_vals[val_idx]; + break; + case TYPE_SMALLINT: + if (val_idx >= col.short_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "SMALLINT")); + } + *reinterpret_cast(slot) = col.short_vals[val_idx]; + break; + case TYPE_INT: + if (val_idx >= col.int_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "INT")); + } + *reinterpret_cast(slot) = col.int_vals[val_idx]; + break; + case TYPE_BIGINT: + if (val_idx >= col.long_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "BIGINT")); + } + *reinterpret_cast(slot) = col.long_vals[val_idx]; + break; + case TYPE_LARGEINT: + if (val_idx >= col.long_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "LARGEINT")); + } + *reinterpret_cast(slot) = col.long_vals[val_idx]; + break; + case TYPE_DOUBLE: + if (val_idx >= col.double_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "DOUBLE")); + } + *reinterpret_cast(slot) = col.double_vals[val_idx]; + break; + case TYPE_FLOAT: + if (val_idx >= col.double_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "FLOAT")); + } + *reinterpret_cast(slot) = col.double_vals[val_idx]; + break; + case TYPE_BOOLEAN: + if (val_idx >= col.bool_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "BOOLEAN")); + } + *reinterpret_cast(slot) = col.bool_vals[val_idx]; + break; + case TYPE_DATE: + if (val_idx >= col.long_vals.size() || + !reinterpret_cast(slot)->from_unixtime(col.long_vals[val_idx], "+08:00")) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TYPE_DATE")); + } + reinterpret_cast(slot)->cast_to_date(); + break; + case TYPE_DATETIME: { + if (val_idx >= col.long_vals.size() || + !reinterpret_cast(slot)->from_unixtime(col.long_vals[val_idx], "+08:00")) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "TYPE_DATETIME")); + } + reinterpret_cast(slot)->set_type(TIME_DATETIME); + break; + } + case TYPE_DECIMAL: { + if (val_idx >= col.binary_vals.size()) { + return Status::InternalError(strings::Substitute(ERROR_INVALID_COL_DATA, "DECIMAL")); + } + const string& val = col.binary_vals[val_idx]; + *reinterpret_cast(slot) = *reinterpret_cast(&val); + break; + } + default: + DCHECK(false); + } + } + return Status::OK(); +} + +} diff --git a/be/src/exec/es_scan_node.h b/be/src/exec/es_scan_node.h index 810917d9ff..de871a8731 100644 --- a/be/src/exec/es_scan_node.h +++ b/be/src/exec/es_scan_node.h @@ -1,92 +1,92 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef BDG_PALO_BE_SRC_QUERY_EXEC_ES_SCAN_NODE_H -#define BDG_PALO_BE_SRC_QUERY_EXEC_ES_SCAN_NODE_H - -#include -#include - -#include "runtime/descriptors.h" -#include "runtime/tuple.h" -#include "exec/scan_node.h" -#include "exprs/slot_ref.h" -#include "runtime/exec_env.h" -#include "gen_cpp/TExtDataSourceService.h" -#include "gen_cpp/PaloExternalDataSourceService_types.h" - -namespace doris { - -class TupleDescriptor; -class RuntimeState; -class Status; - -class EsScanNode : public ScanNode { -public: - EsScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); - ~EsScanNode(); - - virtual Status prepare(RuntimeState* state) override; - virtual Status open(RuntimeState* state) override; - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; - virtual Status close(RuntimeState* state) override; - virtual Status set_scan_ranges(const std::vector& scan_ranges) override; - -protected: - // Write debug string of this into out. - virtual void debug_string(int indentation_level, std::stringstream* out) const; - -private: - Status open_es(TNetworkAddress& address, TExtOpenResult& result, TExtOpenParams& params); - Status materialize_row(MemPool* tuple_pool, Tuple* tuple, - const vector& cols, int next_row_idx, - vector& cols_next_val_idx); - Status get_next_from_es(TExtGetNextResult& result); - - bool get_disjuncts(ExprContext* context, Expr* conjunct, vector& disjuncts); - bool to_ext_literal(ExprContext* context, Expr* expr, TExtLiteral* literal); - bool to_ext_literal(PrimitiveType node_type, void* value, TExtLiteral* literal); - bool ignore_cast(SlotDescriptor* slot, Expr* expr); - - bool is_match_func(Expr* conjunct); - - SlotDescriptor* get_slot_desc(SlotRef* slotRef); - - // check if open result meets condition - // 1. check if left conjuncts contain "match" function, since match function could only be executed on es - bool check_left_conjuncts(Expr* conjunct); - -private: - TupleId _tuple_id; - std::map _properties; - const TupleDescriptor* _tuple_desc; - ExecEnv* _env; - std::vector _scan_ranges; - - // scan range's iterator, used in get_next() - int _scan_range_idx; - - // store every scan range's netaddress/handle/offset - std::vector _addresses; - std::vector _scan_handles; - std::vector _offsets; - std::vector _pushdown_conjunct_ctxs; -}; - -} - -#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef BDG_PALO_BE_SRC_QUERY_EXEC_ES_SCAN_NODE_H +#define BDG_PALO_BE_SRC_QUERY_EXEC_ES_SCAN_NODE_H + +#include +#include + +#include "runtime/descriptors.h" +#include "runtime/tuple.h" +#include "exec/scan_node.h" +#include "exprs/slot_ref.h" +#include "runtime/exec_env.h" +#include "gen_cpp/TExtDataSourceService.h" +#include "gen_cpp/PaloExternalDataSourceService_types.h" + +namespace doris { + +class TupleDescriptor; +class RuntimeState; +class Status; + +class EsScanNode : public ScanNode { +public: + EsScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); + ~EsScanNode(); + + virtual Status prepare(RuntimeState* state) override; + virtual Status open(RuntimeState* state) override; + virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; + virtual Status close(RuntimeState* state) override; + virtual Status set_scan_ranges(const std::vector& scan_ranges) override; + +protected: + // Write debug string of this into out. + virtual void debug_string(int indentation_level, std::stringstream* out) const; + +private: + Status open_es(TNetworkAddress& address, TExtOpenResult& result, TExtOpenParams& params); + Status materialize_row(MemPool* tuple_pool, Tuple* tuple, + const vector& cols, int next_row_idx, + vector& cols_next_val_idx); + Status get_next_from_es(TExtGetNextResult& result); + + bool get_disjuncts(ExprContext* context, Expr* conjunct, vector& disjuncts); + bool to_ext_literal(ExprContext* context, Expr* expr, TExtLiteral* literal); + bool to_ext_literal(PrimitiveType node_type, void* value, TExtLiteral* literal); + bool ignore_cast(SlotDescriptor* slot, Expr* expr); + + bool is_match_func(Expr* conjunct); + + SlotDescriptor* get_slot_desc(SlotRef* slotRef); + + // check if open result meets condition + // 1. check if left conjuncts contain "match" function, since match function could only be executed on es + bool check_left_conjuncts(Expr* conjunct); + +private: + TupleId _tuple_id; + std::map _properties; + const TupleDescriptor* _tuple_desc; + ExecEnv* _env; + std::vector _scan_ranges; + + // scan range's iterator, used in get_next() + int _scan_range_idx; + + // store every scan range's netaddress/handle/offset + std::vector _addresses; + std::vector _scan_handles; + std::vector _offsets; + std::vector _pushdown_conjunct_ctxs; +}; + +} + +#endif diff --git a/be/src/gutil/cpu.cc b/be/src/gutil/cpu.cc index f43664aee7..c02f5e5949 100644 --- a/be/src/gutil/cpu.cc +++ b/be/src/gutil/cpu.cc @@ -1,286 +1,286 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "gutil/cpu.h" - -#include -#include - -#include "gutil/integral_types.h" - -#if defined(__x86_64__) -#if defined(_MSC_VER) -#include -#include // For _xgetbv() -#endif -#endif - -namespace base { - -CPU::CPU() - : signature_(0), - type_(0), - family_(0), - model_(0), - stepping_(0), - ext_model_(0), - ext_family_(0), - has_mmx_(false), - has_sse_(false), - has_sse2_(false), - has_sse3_(false), - has_ssse3_(false), - has_sse41_(false), - has_sse42_(false), - has_avx_(false), - has_avx2_(false), - has_aesni_(false), - has_non_stop_time_stamp_counter_(false), - has_broken_neon_(false), - cpu_vendor_("unknown") { - Initialize(); -} - -namespace { - -#if defined(__x86_64__) -#ifndef _MSC_VER - -#if defined(__pic__) && defined(__i386__) - -void __cpuid(int cpu_info[4], int info_type) { - __asm__ volatile ( - "mov %%ebx, %%edi\n" - "cpuid\n" - "xchg %%edi, %%ebx\n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type) - ); -} - -#else - -void __cpuid(int cpu_info[4], int info_type) { - __asm__ volatile ( - "cpuid\n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type), "c"(0) - ); -} - -#endif - -// _xgetbv returns the value of an Intel Extended Control Register (XCR). -// Currently only XCR0 is defined by Intel so |xcr| should always be zero. -uint64 _xgetbv(uint32 xcr) { - uint32 eax, edx; - - __asm__ volatile ( - "xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); - return (static_cast(edx) << 32) | eax; -} - -#endif // !_MSC_VER -#endif // __x86_64__ - -#if defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) -class LazyCpuInfoValue { - public: - LazyCpuInfoValue() : has_broken_neon_(false) { - // This function finds the value from /proc/cpuinfo under the key "model - // name" or "Processor". "model name" is used in Linux 3.8 and later (3.7 - // and later for arm64) and is shown once per CPU. "Processor" is used in - // earler versions and is shown only once at the top of /proc/cpuinfo - // regardless of the number CPUs. - const char kModelNamePrefix[] = "model name\t: "; - const char kProcessorPrefix[] = "Processor\t: "; - - // This function also calculates whether we believe that this CPU has a - // broken NEON unit based on these fields from cpuinfo: - unsigned implementer = 0, architecture = 0, variant = 0, part = 0, - revision = 0; - const struct { - const char key[17]; - unsigned int* result; - } kUnsignedValues[] = { - {"CPU implementer", &implementer}, - {"CPU architecture", &architecture}, - {"CPU variant", &variant}, - {"CPU part", &part}, - {"CPU revision", &revision}, - }; - - std::string contents; - ReadFileToString(FilePath("/proc/cpuinfo"), &contents); - DCHECK(!contents.empty()); - if (contents.empty()) { - return; - } - - std::istringstream iss(contents); - std::string line; - while (std::getline(iss, line)) { - if (brand_.empty() && - (line.compare(0, strlen(kModelNamePrefix), kModelNamePrefix) == 0 || - line.compare(0, strlen(kProcessorPrefix), kProcessorPrefix) == 0)) { - brand_.assign(line.substr(strlen(kModelNamePrefix))); - } - - for (size_t i = 0; i < arraysize(kUnsignedValues); i++) { - const char *key = kUnsignedValues[i].key; - const size_t len = strlen(key); - - if (line.compare(0, len, key) == 0 && - line.size() >= len + 1 && - (line[len] == '\t' || line[len] == ' ' || line[len] == ':')) { - size_t colon_pos = line.find(':', len); - if (colon_pos == std::string::npos) { - continue; - } - - const StringPiece line_sp(line); - StringPiece value_sp = line_sp.substr(colon_pos + 1); - while (!value_sp.empty() && - (value_sp[0] == ' ' || value_sp[0] == '\t')) { - value_sp = value_sp.substr(1); - } - - // The string may have leading "0x" or not, so we use strtoul to - // handle that. - char* endptr; - std::string value(value_sp.as_string()); - unsigned long int result = strtoul(value.c_str(), &endptr, 0); - if (*endptr == 0 && result <= UINT_MAX) { - *kUnsignedValues[i].result = result; - } - } - } - } - - has_broken_neon_ = - implementer == 0x51 && - architecture == 7 && - variant == 1 && - part == 0x4d && - revision == 0; - } - - const std::string& brand() const { return brand_; } - bool has_broken_neon() const { return has_broken_neon_; } - - private: - std::string brand_; - bool has_broken_neon_; - DISALLOW_COPY_AND_ASSIGN(LazyCpuInfoValue); -}; - -base::LazyInstance::Leaky g_lazy_cpuinfo = - LAZY_INSTANCE_INITIALIZER; - -#endif // defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || - // defined(OS_LINUX)) - -} // anonymous namespace - -void CPU::Initialize() { -#if defined(__x86_64__) - int cpu_info[4] = {-1}; - char cpu_string[48]; - - // __cpuid with an InfoType argument of 0 returns the number of - // valid Ids in CPUInfo[0] and the CPU identification string in - // the other three array elements. The CPU identification string is - // not in linear order. The code below arranges the information - // in a human readable form. The human readable order is CPUInfo[1] | - // CPUInfo[3] | CPUInfo[2]. CPUInfo[2] and CPUInfo[3] are swapped - // before using memcpy to copy these three array elements to cpu_string. - __cpuid(cpu_info, 0); - int num_ids = cpu_info[0]; - std::swap(cpu_info[2], cpu_info[3]); - memcpy(cpu_string, &cpu_info[1], 3 * sizeof(cpu_info[1])); - cpu_vendor_.assign(cpu_string, 3 * sizeof(cpu_info[1])); - - // Interpret CPU feature information. - if (num_ids > 0) { - int cpu_info7[4] = {0}; - __cpuid(cpu_info, 1); - if (num_ids >= 7) { - __cpuid(cpu_info7, 7); - } - signature_ = cpu_info[0]; - stepping_ = cpu_info[0] & 0xf; - model_ = ((cpu_info[0] >> 4) & 0xf) + ((cpu_info[0] >> 12) & 0xf0); - family_ = (cpu_info[0] >> 8) & 0xf; - type_ = (cpu_info[0] >> 12) & 0x3; - ext_model_ = (cpu_info[0] >> 16) & 0xf; - ext_family_ = (cpu_info[0] >> 20) & 0xff; - has_mmx_ = (cpu_info[3] & 0x00800000) != 0; - has_sse_ = (cpu_info[3] & 0x02000000) != 0; - has_sse2_ = (cpu_info[3] & 0x04000000) != 0; - has_sse3_ = (cpu_info[2] & 0x00000001) != 0; - has_ssse3_ = (cpu_info[2] & 0x00000200) != 0; - has_sse41_ = (cpu_info[2] & 0x00080000) != 0; - has_sse42_ = (cpu_info[2] & 0x00100000) != 0; - // AVX instructions will generate an illegal instruction exception unless - // a) they are supported by the CPU, - // b) XSAVE is supported by the CPU and - // c) XSAVE is enabled by the kernel. - // See http://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled - // - // In addition, we have observed some crashes with the xgetbv instruction - // even after following Intel's example code. (See crbug.com/375968.) - // Because of that, we also test the XSAVE bit because its description in - // the CPUID documentation suggests that it signals xgetbv support. - has_avx_ = - (cpu_info[2] & 0x10000000) != 0 && - (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ && - (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ && - (_xgetbv(0) & 6) == 6 /* XSAVE enabled by kernel */; - has_aesni_ = (cpu_info[2] & 0x02000000) != 0; - has_avx2_ = has_avx_ && (cpu_info7[1] & 0x00000020) != 0; - } - - // Get the brand string of the cpu. - __cpuid(cpu_info, 0x80000000); - const int parameter_end = 0x80000004; - int max_parameter = cpu_info[0]; - - if (cpu_info[0] >= parameter_end) { - char* cpu_string_ptr = cpu_string; - - for (int parameter = 0x80000002; parameter <= parameter_end && - cpu_string_ptr < &cpu_string[sizeof(cpu_string)]; parameter++) { - __cpuid(cpu_info, parameter); - memcpy(cpu_string_ptr, cpu_info, sizeof(cpu_info)); - cpu_string_ptr += sizeof(cpu_info); - } - cpu_brand_.assign(cpu_string, cpu_string_ptr - cpu_string); - } - - const int parameter_containing_non_stop_time_stamp_counter = 0x80000007; - if (max_parameter >= parameter_containing_non_stop_time_stamp_counter) { - __cpuid(cpu_info, parameter_containing_non_stop_time_stamp_counter); - has_non_stop_time_stamp_counter_ = (cpu_info[3] & (1 << 8)) != 0; - } -#elif defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) - cpu_brand_.assign(g_lazy_cpuinfo.Get().brand()); - has_broken_neon_ = g_lazy_cpuinfo.Get().has_broken_neon(); -#else - #error unknown architecture -#endif -} - -CPU::IntelMicroArchitecture CPU::GetIntelMicroArchitecture() const { - if (has_avx2()) return AVX2; - if (has_avx()) return AVX; - if (has_sse42()) return SSE42; - if (has_sse41()) return SSE41; - if (has_ssse3()) return SSSE3; - if (has_sse3()) return SSE3; - if (has_sse2()) return SSE2; - if (has_sse()) return SSE; - return PENTIUM; -} - -} // namespace base +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "gutil/cpu.h" + +#include +#include + +#include "gutil/integral_types.h" + +#if defined(__x86_64__) +#if defined(_MSC_VER) +#include +#include // For _xgetbv() +#endif +#endif + +namespace base { + +CPU::CPU() + : signature_(0), + type_(0), + family_(0), + model_(0), + stepping_(0), + ext_model_(0), + ext_family_(0), + has_mmx_(false), + has_sse_(false), + has_sse2_(false), + has_sse3_(false), + has_ssse3_(false), + has_sse41_(false), + has_sse42_(false), + has_avx_(false), + has_avx2_(false), + has_aesni_(false), + has_non_stop_time_stamp_counter_(false), + has_broken_neon_(false), + cpu_vendor_("unknown") { + Initialize(); +} + +namespace { + +#if defined(__x86_64__) +#ifndef _MSC_VER + +#if defined(__pic__) && defined(__i386__) + +void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} + +#else + +void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "cpuid\n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type), "c"(0) + ); +} + +#endif + +// _xgetbv returns the value of an Intel Extended Control Register (XCR). +// Currently only XCR0 is defined by Intel so |xcr| should always be zero. +uint64 _xgetbv(uint32 xcr) { + uint32 eax, edx; + + __asm__ volatile ( + "xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (static_cast(edx) << 32) | eax; +} + +#endif // !_MSC_VER +#endif // __x86_64__ + +#if defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) +class LazyCpuInfoValue { + public: + LazyCpuInfoValue() : has_broken_neon_(false) { + // This function finds the value from /proc/cpuinfo under the key "model + // name" or "Processor". "model name" is used in Linux 3.8 and later (3.7 + // and later for arm64) and is shown once per CPU. "Processor" is used in + // earler versions and is shown only once at the top of /proc/cpuinfo + // regardless of the number CPUs. + const char kModelNamePrefix[] = "model name\t: "; + const char kProcessorPrefix[] = "Processor\t: "; + + // This function also calculates whether we believe that this CPU has a + // broken NEON unit based on these fields from cpuinfo: + unsigned implementer = 0, architecture = 0, variant = 0, part = 0, + revision = 0; + const struct { + const char key[17]; + unsigned int* result; + } kUnsignedValues[] = { + {"CPU implementer", &implementer}, + {"CPU architecture", &architecture}, + {"CPU variant", &variant}, + {"CPU part", &part}, + {"CPU revision", &revision}, + }; + + std::string contents; + ReadFileToString(FilePath("/proc/cpuinfo"), &contents); + DCHECK(!contents.empty()); + if (contents.empty()) { + return; + } + + std::istringstream iss(contents); + std::string line; + while (std::getline(iss, line)) { + if (brand_.empty() && + (line.compare(0, strlen(kModelNamePrefix), kModelNamePrefix) == 0 || + line.compare(0, strlen(kProcessorPrefix), kProcessorPrefix) == 0)) { + brand_.assign(line.substr(strlen(kModelNamePrefix))); + } + + for (size_t i = 0; i < arraysize(kUnsignedValues); i++) { + const char *key = kUnsignedValues[i].key; + const size_t len = strlen(key); + + if (line.compare(0, len, key) == 0 && + line.size() >= len + 1 && + (line[len] == '\t' || line[len] == ' ' || line[len] == ':')) { + size_t colon_pos = line.find(':', len); + if (colon_pos == std::string::npos) { + continue; + } + + const StringPiece line_sp(line); + StringPiece value_sp = line_sp.substr(colon_pos + 1); + while (!value_sp.empty() && + (value_sp[0] == ' ' || value_sp[0] == '\t')) { + value_sp = value_sp.substr(1); + } + + // The string may have leading "0x" or not, so we use strtoul to + // handle that. + char* endptr; + std::string value(value_sp.as_string()); + unsigned long int result = strtoul(value.c_str(), &endptr, 0); + if (*endptr == 0 && result <= UINT_MAX) { + *kUnsignedValues[i].result = result; + } + } + } + } + + has_broken_neon_ = + implementer == 0x51 && + architecture == 7 && + variant == 1 && + part == 0x4d && + revision == 0; + } + + const std::string& brand() const { return brand_; } + bool has_broken_neon() const { return has_broken_neon_; } + + private: + std::string brand_; + bool has_broken_neon_; + DISALLOW_COPY_AND_ASSIGN(LazyCpuInfoValue); +}; + +base::LazyInstance::Leaky g_lazy_cpuinfo = + LAZY_INSTANCE_INITIALIZER; + +#endif // defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || + // defined(OS_LINUX)) + +} // anonymous namespace + +void CPU::Initialize() { +#if defined(__x86_64__) + int cpu_info[4] = {-1}; + char cpu_string[48]; + + // __cpuid with an InfoType argument of 0 returns the number of + // valid Ids in CPUInfo[0] and the CPU identification string in + // the other three array elements. The CPU identification string is + // not in linear order. The code below arranges the information + // in a human readable form. The human readable order is CPUInfo[1] | + // CPUInfo[3] | CPUInfo[2]. CPUInfo[2] and CPUInfo[3] are swapped + // before using memcpy to copy these three array elements to cpu_string. + __cpuid(cpu_info, 0); + int num_ids = cpu_info[0]; + std::swap(cpu_info[2], cpu_info[3]); + memcpy(cpu_string, &cpu_info[1], 3 * sizeof(cpu_info[1])); + cpu_vendor_.assign(cpu_string, 3 * sizeof(cpu_info[1])); + + // Interpret CPU feature information. + if (num_ids > 0) { + int cpu_info7[4] = {0}; + __cpuid(cpu_info, 1); + if (num_ids >= 7) { + __cpuid(cpu_info7, 7); + } + signature_ = cpu_info[0]; + stepping_ = cpu_info[0] & 0xf; + model_ = ((cpu_info[0] >> 4) & 0xf) + ((cpu_info[0] >> 12) & 0xf0); + family_ = (cpu_info[0] >> 8) & 0xf; + type_ = (cpu_info[0] >> 12) & 0x3; + ext_model_ = (cpu_info[0] >> 16) & 0xf; + ext_family_ = (cpu_info[0] >> 20) & 0xff; + has_mmx_ = (cpu_info[3] & 0x00800000) != 0; + has_sse_ = (cpu_info[3] & 0x02000000) != 0; + has_sse2_ = (cpu_info[3] & 0x04000000) != 0; + has_sse3_ = (cpu_info[2] & 0x00000001) != 0; + has_ssse3_ = (cpu_info[2] & 0x00000200) != 0; + has_sse41_ = (cpu_info[2] & 0x00080000) != 0; + has_sse42_ = (cpu_info[2] & 0x00100000) != 0; + // AVX instructions will generate an illegal instruction exception unless + // a) they are supported by the CPU, + // b) XSAVE is supported by the CPU and + // c) XSAVE is enabled by the kernel. + // See http://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled + // + // In addition, we have observed some crashes with the xgetbv instruction + // even after following Intel's example code. (See crbug.com/375968.) + // Because of that, we also test the XSAVE bit because its description in + // the CPUID documentation suggests that it signals xgetbv support. + has_avx_ = + (cpu_info[2] & 0x10000000) != 0 && + (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ && + (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ && + (_xgetbv(0) & 6) == 6 /* XSAVE enabled by kernel */; + has_aesni_ = (cpu_info[2] & 0x02000000) != 0; + has_avx2_ = has_avx_ && (cpu_info7[1] & 0x00000020) != 0; + } + + // Get the brand string of the cpu. + __cpuid(cpu_info, 0x80000000); + const int parameter_end = 0x80000004; + int max_parameter = cpu_info[0]; + + if (cpu_info[0] >= parameter_end) { + char* cpu_string_ptr = cpu_string; + + for (int parameter = 0x80000002; parameter <= parameter_end && + cpu_string_ptr < &cpu_string[sizeof(cpu_string)]; parameter++) { + __cpuid(cpu_info, parameter); + memcpy(cpu_string_ptr, cpu_info, sizeof(cpu_info)); + cpu_string_ptr += sizeof(cpu_info); + } + cpu_brand_.assign(cpu_string, cpu_string_ptr - cpu_string); + } + + const int parameter_containing_non_stop_time_stamp_counter = 0x80000007; + if (max_parameter >= parameter_containing_non_stop_time_stamp_counter) { + __cpuid(cpu_info, parameter_containing_non_stop_time_stamp_counter); + has_non_stop_time_stamp_counter_ = (cpu_info[3] & (1 << 8)) != 0; + } +#elif defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) + cpu_brand_.assign(g_lazy_cpuinfo.Get().brand()); + has_broken_neon_ = g_lazy_cpuinfo.Get().has_broken_neon(); +#else + #error unknown architecture +#endif +} + +CPU::IntelMicroArchitecture CPU::GetIntelMicroArchitecture() const { + if (has_avx2()) return AVX2; + if (has_avx()) return AVX; + if (has_sse42()) return SSE42; + if (has_sse41()) return SSE41; + if (has_ssse3()) return SSSE3; + if (has_sse3()) return SSE3; + if (has_sse2()) return SSE2; + if (has_sse()) return SSE; + return PENTIUM; +} + +} // namespace base diff --git a/be/src/gutil/cpu.h b/be/src/gutil/cpu.h index b401867c3c..65498140d1 100644 --- a/be/src/gutil/cpu.h +++ b/be/src/gutil/cpu.h @@ -1,90 +1,90 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef BASE_CPU_H_ -#define BASE_CPU_H_ - -#include - -namespace base { - -// Query information about the processor. -class CPU { - public: - // Constructor - CPU(); - - enum IntelMicroArchitecture { - PENTIUM, - SSE, - SSE2, - SSE3, - SSSE3, - SSE41, - SSE42, - AVX, - AVX2, - MAX_INTEL_MICRO_ARCHITECTURE - }; - - // Accessors for CPU information. - const std::string& vendor_name() const { return cpu_vendor_; } - int signature() const { return signature_; } - int stepping() const { return stepping_; } - int model() const { return model_; } - int family() const { return family_; } - int type() const { return type_; } - int extended_model() const { return ext_model_; } - int extended_family() const { return ext_family_; } - bool has_mmx() const { return has_mmx_; } - bool has_sse() const { return has_sse_; } - bool has_sse2() const { return has_sse2_; } - bool has_sse3() const { return has_sse3_; } - bool has_ssse3() const { return has_ssse3_; } - bool has_sse41() const { return has_sse41_; } - bool has_sse42() const { return has_sse42_; } - bool has_avx() const { return has_avx_; } - bool has_avx2() const { return has_avx2_; } - bool has_aesni() const { return has_aesni_; } - bool has_non_stop_time_stamp_counter() const { - return has_non_stop_time_stamp_counter_; - } - // has_broken_neon is only valid on ARM chips. If true, it indicates that we - // believe that the NEON unit on the current CPU is flawed and cannot execute - // some code. See https://code.google.com/p/chromium/issues/detail?id=341598 - bool has_broken_neon() const { return has_broken_neon_; } - - IntelMicroArchitecture GetIntelMicroArchitecture() const; - const std::string& cpu_brand() const { return cpu_brand_; } - - private: - // Query the processor for CPUID information. - void Initialize(); - - int signature_; // raw form of type, family, model, and stepping - int type_; // process type - int family_; // family of the processor - int model_; // model of processor - int stepping_; // processor revision number - int ext_model_; - int ext_family_; - bool has_mmx_; - bool has_sse_; - bool has_sse2_; - bool has_sse3_; - bool has_ssse3_; - bool has_sse41_; - bool has_sse42_; - bool has_avx_; - bool has_avx2_; - bool has_aesni_; - bool has_non_stop_time_stamp_counter_; - bool has_broken_neon_; - std::string cpu_vendor_; - std::string cpu_brand_; -}; - -} // namespace base - -#endif // BASE_CPU_H_ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_CPU_H_ +#define BASE_CPU_H_ + +#include + +namespace base { + +// Query information about the processor. +class CPU { + public: + // Constructor + CPU(); + + enum IntelMicroArchitecture { + PENTIUM, + SSE, + SSE2, + SSE3, + SSSE3, + SSE41, + SSE42, + AVX, + AVX2, + MAX_INTEL_MICRO_ARCHITECTURE + }; + + // Accessors for CPU information. + const std::string& vendor_name() const { return cpu_vendor_; } + int signature() const { return signature_; } + int stepping() const { return stepping_; } + int model() const { return model_; } + int family() const { return family_; } + int type() const { return type_; } + int extended_model() const { return ext_model_; } + int extended_family() const { return ext_family_; } + bool has_mmx() const { return has_mmx_; } + bool has_sse() const { return has_sse_; } + bool has_sse2() const { return has_sse2_; } + bool has_sse3() const { return has_sse3_; } + bool has_ssse3() const { return has_ssse3_; } + bool has_sse41() const { return has_sse41_; } + bool has_sse42() const { return has_sse42_; } + bool has_avx() const { return has_avx_; } + bool has_avx2() const { return has_avx2_; } + bool has_aesni() const { return has_aesni_; } + bool has_non_stop_time_stamp_counter() const { + return has_non_stop_time_stamp_counter_; + } + // has_broken_neon is only valid on ARM chips. If true, it indicates that we + // believe that the NEON unit on the current CPU is flawed and cannot execute + // some code. See https://code.google.com/p/chromium/issues/detail?id=341598 + bool has_broken_neon() const { return has_broken_neon_; } + + IntelMicroArchitecture GetIntelMicroArchitecture() const; + const std::string& cpu_brand() const { return cpu_brand_; } + + private: + // Query the processor for CPUID information. + void Initialize(); + + int signature_; // raw form of type, family, model, and stepping + int type_; // process type + int family_; // family of the processor + int model_; // model of processor + int stepping_; // processor revision number + int ext_model_; + int ext_family_; + bool has_mmx_; + bool has_sse_; + bool has_sse2_; + bool has_sse3_; + bool has_ssse3_; + bool has_sse41_; + bool has_sse42_; + bool has_avx_; + bool has_avx2_; + bool has_aesni_; + bool has_non_stop_time_stamp_counter_; + bool has_broken_neon_; + std::string cpu_vendor_; + std::string cpu_brand_; +}; + +} // namespace base + +#endif // BASE_CPU_H_ diff --git a/be/src/olap/rowset/rowset_writer_context.h b/be/src/olap/rowset/rowset_writer_context.h index e4737e402b..5bfa28c348 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -1,74 +1,74 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H -#define DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H - -#include "gen_cpp/olap_file.pb.h" -#include "olap/data_dir.h" -#include "olap/tablet_schema.h" - -namespace doris { - -class RowsetWriterContextBuilder; -using RowsetWriterContextBuilderSharedPtr = std::shared_ptr; - -struct RowsetWriterContext { - RowsetWriterContext() : - tablet_id(0), - tablet_schema_hash(0), - partition_id(0), - rowset_type(ALPHA_ROWSET), - rowset_path_prefix(""), - tablet_schema(nullptr), - rowset_state(PREPARED), - data_dir(nullptr), - version(Version(0, 0)), - version_hash(0), - txn_id(0), - tablet_uid(0, 0) { - load_id.set_hi(0); - load_id.set_lo(0); - } - RowsetId rowset_id; - int64_t tablet_id; - int64_t tablet_schema_hash; - int64_t partition_id; - RowsetTypePB rowset_type; - std::string rowset_path_prefix; - const TabletSchema* tablet_schema; - // PREPARED/COMMITTED for pending rowset - // VISIBLE for non-pending rowset - RowsetStatePB rowset_state; - DataDir* data_dir; - // properties for non-pending rowset - Version version; - VersionHash version_hash; - - // properties for pending rowset - int64_t txn_id; - PUniqueId load_id; - TabletUid tablet_uid; - // segment file use uint32 to represent row number, therefore the maximum is UINT32_MAX. - // the default is set to INT32_MAX to avoid overflow issue when casting from uint32_t to int. - // test cases can change this value to control flush timing - uint32_t max_rows_per_segment = INT32_MAX; -}; - -} // namespace doris - -#endif // DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H +#define DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H + +#include "gen_cpp/olap_file.pb.h" +#include "olap/data_dir.h" +#include "olap/tablet_schema.h" + +namespace doris { + +class RowsetWriterContextBuilder; +using RowsetWriterContextBuilderSharedPtr = std::shared_ptr; + +struct RowsetWriterContext { + RowsetWriterContext() : + tablet_id(0), + tablet_schema_hash(0), + partition_id(0), + rowset_type(ALPHA_ROWSET), + rowset_path_prefix(""), + tablet_schema(nullptr), + rowset_state(PREPARED), + data_dir(nullptr), + version(Version(0, 0)), + version_hash(0), + txn_id(0), + tablet_uid(0, 0) { + load_id.set_hi(0); + load_id.set_lo(0); + } + RowsetId rowset_id; + int64_t tablet_id; + int64_t tablet_schema_hash; + int64_t partition_id; + RowsetTypePB rowset_type; + std::string rowset_path_prefix; + const TabletSchema* tablet_schema; + // PREPARED/COMMITTED for pending rowset + // VISIBLE for non-pending rowset + RowsetStatePB rowset_state; + DataDir* data_dir; + // properties for non-pending rowset + Version version; + VersionHash version_hash; + + // properties for pending rowset + int64_t txn_id; + PUniqueId load_id; + TabletUid tablet_uid; + // segment file use uint32 to represent row number, therefore the maximum is UINT32_MAX. + // the default is set to INT32_MAX to avoid overflow issue when casting from uint32_t to int. + // test cases can change this value to control flush timing + uint32_t max_rows_per_segment = INT32_MAX; +}; + +} // namespace doris + +#endif // DORIS_BE_SRC_OLAP_ROWSET_ROWSET_WRITER_CONTEXT_H diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_page.h b/be/src/olap/rowset/segment_v2/bitshuffle_page.h index 6a9bd1ae98..1573d681f0 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_page.h +++ b/be/src/olap/rowset/segment_v2/bitshuffle_page.h @@ -1,342 +1,342 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "util/coding.h" -#include "util/faststring.h" -#include "gutil/port.h" -#include "olap/olap_common.h" -#include "olap/types.h" -#include "olap/rowset/segment_v2/options.h" -#include "olap/rowset/segment_v2/page_builder.h" -#include "olap/rowset/segment_v2/page_decoder.h" -#include "olap/rowset/segment_v2/common.h" -#include "olap/rowset/segment_v2/bitshuffle_wrapper.h" - -namespace doris { -namespace segment_v2 { - -enum { - BITSHUFFLE_PAGE_HEADER_SIZE = 16 -}; - -void warn_with_bitshuffle_error(int64_t val); - -// BitshufflePageBuilder bitshuffles and compresses the bits of fixed -// size type blocks with lz4. -// -// The page format is as follows: -// -// 1. Header: (16 bytes total) -// -// [32-bit] -// The number of elements encoded in the page. -// -// [32-bit] -// The post-compression size of the page, including this header. -// -// [32-bit] -// Padding is needed to meet the requirements of the bitshuffle -// library such that the input/output is a multiple of 8. Some -// ignored elements are appended to the end of the page if necessary -// to meet this requirement. -// -// This header field is the post-padding element count. -// -// [32-bit] -// The size of the elements, in bytes, as actually encoded. In the -// case that all of the data in a page can fit into a smaller -// integer type, then we may choose to encode that smaller type -// to save CPU costs. -// -// This is currently only implemented in the UINT32 page type. -// -// NOTE: all on-disk ints are encoded little-endian -// -// 2. Element data -// -// The header is followed by the bitshuffle-compressed element data. -// -template -class BitshufflePageBuilder : public PageBuilder { -public: - BitshufflePageBuilder(const PageBuilderOptions& options) : - _options(options), - _count(0), - _remain_element_capacity(0), - _finished(false) { - reset(); - } - - bool is_page_full() override { - return _remain_element_capacity == 0; - } - - Status add(const uint8_t* vals, size_t* count) override { - DCHECK(!_finished); - int to_add = std::min(_remain_element_capacity, *count); - _data.append(vals, to_add * SIZE_OF_TYPE); - _count += to_add; - _remain_element_capacity -= to_add; - // return added number through count - *count = to_add; - return Status::OK(); - } - - Slice finish() override { - return _finish(SIZE_OF_TYPE); - } - - void reset() override { - auto block_size = _options.data_page_size; - _count = 0; - _data.clear(); - _data.reserve(block_size); - DCHECK_EQ(reinterpret_cast(_data.data()) & (alignof(CppType) - 1), 0) - << "buffer must be naturally-aligned"; - _buffer.clear(); - _buffer.resize(BITSHUFFLE_PAGE_HEADER_SIZE); - _finished = false; - _remain_element_capacity = block_size / SIZE_OF_TYPE; - } - - size_t count() const { - return _count; - } - - uint64_t size() const override { - return _buffer.size(); - } - - // this api will release the memory ownership of encoded data - // Note: - // release() should be called after finish - // reset() should be called after this function before reuse the builder - void release() override { - uint8_t* ret = _buffer.release(); - (void)ret; - } - -private: - Slice _finish(int final_size_of_type) { - _data.resize(final_size_of_type * _count); - - // Do padding so that the input num of element is multiple of 8. - int num_elems_after_padding = ALIGN_UP(_count, 8); - int padding_elems = num_elems_after_padding - _count; - int padding_bytes = padding_elems * final_size_of_type; - for (int i = 0; i < padding_bytes; i++) { - _data.push_back(0); - } - - _buffer.resize(BITSHUFFLE_PAGE_HEADER_SIZE + - bitshuffle::compress_lz4_bound(num_elems_after_padding, final_size_of_type, 0)); - - encode_fixed32_le(&_buffer[0], _count); - int64_t bytes = bitshuffle::compress_lz4(_data.data(), &_buffer[BITSHUFFLE_PAGE_HEADER_SIZE], - num_elems_after_padding, final_size_of_type, 0); - if (PREDICT_FALSE(bytes < 0)) { - // This means the bitshuffle function fails. - // Ideally, this should not happen. - warn_with_bitshuffle_error(bytes); - // It does not matter what will be returned here, - // since we have logged fatal in warn_with_bitshuffle_error(). - return Slice(); - } - encode_fixed32_le(&_buffer[4], BITSHUFFLE_PAGE_HEADER_SIZE + bytes); - encode_fixed32_le(&_buffer[8], num_elems_after_padding); - encode_fixed32_le(&_buffer[12], final_size_of_type); - _finished = true; - return Slice(_buffer.data(), BITSHUFFLE_PAGE_HEADER_SIZE + bytes); - } - - typedef typename TypeTraits::CppType CppType; - - CppType cell(int idx) const { - DCHECK_GE(idx, 0); - CppType ret; - memcpy(&ret, &_data[idx * SIZE_OF_TYPE], sizeof(CppType)); - return ret; - } - - enum { - SIZE_OF_TYPE = TypeTraits::size - }; - PageBuilderOptions _options; - uint32_t _count; - int _remain_element_capacity; - bool _finished; - faststring _data; - faststring _buffer; -}; - -template -class BitShufflePageDecoder : public PageDecoder { -public: - BitShufflePageDecoder(Slice data, const PageDecoderOptions& options) : _data(data), - _options(options), - _parsed(false), - _num_elements(0), - _compressed_size(0), - _num_element_after_padding(0), - _size_of_element(0), - _cur_index(0) { } - - Status init() override { - CHECK(!_parsed); - if (_data.size < BITSHUFFLE_PAGE_HEADER_SIZE) { - std::stringstream ss; - ss << "file corrupton: invalid data size:" << _data.size << ", header size:" << BITSHUFFLE_PAGE_HEADER_SIZE; - return Status::InternalError(ss.str()); - } - _num_elements = decode_fixed32_le((const uint8_t*)&_data[0]); - _compressed_size = decode_fixed32_le((const uint8_t*)&_data[4]); - if (_compressed_size != _data.size) { - std::stringstream ss; - ss << "Size information unmatched, _compressed_size:" << _compressed_size - << ", _num_elements:" << _num_elements - << ", data size:" << _data.size; - return Status::InternalError(ss.str()); - } - _num_element_after_padding = decode_fixed32_le((const uint8_t*)&_data[8]); - if (_num_element_after_padding != ALIGN_UP(_num_elements, 8)) { - std::stringstream ss; - ss << "num of element information corrupted," - << " _num_element_after_padding:" << _num_element_after_padding - << ", _num_elements:" << _num_elements; - return Status::InternalError(ss.str()); - } - _size_of_element = decode_fixed32_le((const uint8_t*)&_data[12]); - switch (_size_of_element) { - case 1: - case 2: - case 3: - case 4: - case 8: - case 12: - case 16: - break; - default: - std::stringstream ss; - ss << "invalid size_of_elem:" << _size_of_element; - return Status::InternalError(ss.str()); - } - - // Currently, only the UINT32 block encoder supports expanding size: - if (UNLIKELY(Type != OLAP_FIELD_TYPE_UNSIGNED_INT && _size_of_element != SIZE_OF_TYPE)) { - std::stringstream ss; - ss << "invalid size info. size of element:" << _size_of_element - << ", SIZE_OF_TYPE:" << SIZE_OF_TYPE - << ", type:" << Type; - return Status::InternalError(ss.str()); - } - if (UNLIKELY(_size_of_element > SIZE_OF_TYPE)) { - std::stringstream ss; - ss << "invalid size info. size of element:" << _size_of_element - << ", SIZE_OF_TYPE:" << SIZE_OF_TYPE; - return Status::InternalError(ss.str()); - } - - RETURN_IF_ERROR(_decode()); - _parsed = true; - return Status::OK(); - } - - Status seek_to_position_in_page(size_t pos) override { - DCHECK(_parsed) << "Must call init()"; - if (PREDICT_FALSE(_num_elements == 0)) { - DCHECK_EQ(0, pos); - return Status::InvalidArgument("invalid pos"); - } - - DCHECK_LE(pos, _num_elements); - _cur_index = pos; - return Status::OK(); - } - - Status next_batch(size_t* n, ColumnBlockView* dst) override { - DCHECK(_parsed); - if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) { - *n = 0; - return Status::OK(); - } - - size_t max_fetch = std::min(*n, static_cast(_num_elements - _cur_index)); - _copy_next_values(max_fetch, dst->data()); - *n = max_fetch; - _cur_index += max_fetch; - - return Status::OK(); - } - - size_t count() const override { - return _num_elements; - } - - size_t current_index() const override { - return _cur_index; - } - -private: - void _copy_next_values(size_t n, void* data) { - memcpy(data, &_decoded[_cur_index * SIZE_OF_TYPE], n * SIZE_OF_TYPE); - } - - Status _decode() { - if (_num_elements > 0) { - int64_t bytes; - _decoded.resize(_num_element_after_padding * _size_of_element); - char* in = const_cast(&_data[BITSHUFFLE_PAGE_HEADER_SIZE]); - bytes = bitshuffle::decompress_lz4(in, _decoded.data(), _num_element_after_padding, - _size_of_element, 0); - if (PREDICT_FALSE(bytes < 0)) { - // Ideally, this should not happen. - warn_with_bitshuffle_error(bytes); - return Status::RuntimeError("Unshuffle Process failed"); - } - } - return Status::OK(); - } - - typedef typename TypeTraits::CppType CppType; - - enum { - SIZE_OF_TYPE = TypeTraits::size - }; - - Slice _data; - PageDecoderOptions _options; - bool _parsed; - size_t _num_elements; - size_t _compressed_size; - size_t _num_element_after_padding; - - int _size_of_element; - size_t _cur_index; - faststring _decoded; -}; - -} // namespace segment_v2 -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "util/coding.h" +#include "util/faststring.h" +#include "gutil/port.h" +#include "olap/olap_common.h" +#include "olap/types.h" +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/page_builder.h" +#include "olap/rowset/segment_v2/page_decoder.h" +#include "olap/rowset/segment_v2/common.h" +#include "olap/rowset/segment_v2/bitshuffle_wrapper.h" + +namespace doris { +namespace segment_v2 { + +enum { + BITSHUFFLE_PAGE_HEADER_SIZE = 16 +}; + +void warn_with_bitshuffle_error(int64_t val); + +// BitshufflePageBuilder bitshuffles and compresses the bits of fixed +// size type blocks with lz4. +// +// The page format is as follows: +// +// 1. Header: (16 bytes total) +// +// [32-bit] +// The number of elements encoded in the page. +// +// [32-bit] +// The post-compression size of the page, including this header. +// +// [32-bit] +// Padding is needed to meet the requirements of the bitshuffle +// library such that the input/output is a multiple of 8. Some +// ignored elements are appended to the end of the page if necessary +// to meet this requirement. +// +// This header field is the post-padding element count. +// +// [32-bit] +// The size of the elements, in bytes, as actually encoded. In the +// case that all of the data in a page can fit into a smaller +// integer type, then we may choose to encode that smaller type +// to save CPU costs. +// +// This is currently only implemented in the UINT32 page type. +// +// NOTE: all on-disk ints are encoded little-endian +// +// 2. Element data +// +// The header is followed by the bitshuffle-compressed element data. +// +template +class BitshufflePageBuilder : public PageBuilder { +public: + BitshufflePageBuilder(const PageBuilderOptions& options) : + _options(options), + _count(0), + _remain_element_capacity(0), + _finished(false) { + reset(); + } + + bool is_page_full() override { + return _remain_element_capacity == 0; + } + + Status add(const uint8_t* vals, size_t* count) override { + DCHECK(!_finished); + int to_add = std::min(_remain_element_capacity, *count); + _data.append(vals, to_add * SIZE_OF_TYPE); + _count += to_add; + _remain_element_capacity -= to_add; + // return added number through count + *count = to_add; + return Status::OK(); + } + + Slice finish() override { + return _finish(SIZE_OF_TYPE); + } + + void reset() override { + auto block_size = _options.data_page_size; + _count = 0; + _data.clear(); + _data.reserve(block_size); + DCHECK_EQ(reinterpret_cast(_data.data()) & (alignof(CppType) - 1), 0) + << "buffer must be naturally-aligned"; + _buffer.clear(); + _buffer.resize(BITSHUFFLE_PAGE_HEADER_SIZE); + _finished = false; + _remain_element_capacity = block_size / SIZE_OF_TYPE; + } + + size_t count() const { + return _count; + } + + uint64_t size() const override { + return _buffer.size(); + } + + // this api will release the memory ownership of encoded data + // Note: + // release() should be called after finish + // reset() should be called after this function before reuse the builder + void release() override { + uint8_t* ret = _buffer.release(); + (void)ret; + } + +private: + Slice _finish(int final_size_of_type) { + _data.resize(final_size_of_type * _count); + + // Do padding so that the input num of element is multiple of 8. + int num_elems_after_padding = ALIGN_UP(_count, 8); + int padding_elems = num_elems_after_padding - _count; + int padding_bytes = padding_elems * final_size_of_type; + for (int i = 0; i < padding_bytes; i++) { + _data.push_back(0); + } + + _buffer.resize(BITSHUFFLE_PAGE_HEADER_SIZE + + bitshuffle::compress_lz4_bound(num_elems_after_padding, final_size_of_type, 0)); + + encode_fixed32_le(&_buffer[0], _count); + int64_t bytes = bitshuffle::compress_lz4(_data.data(), &_buffer[BITSHUFFLE_PAGE_HEADER_SIZE], + num_elems_after_padding, final_size_of_type, 0); + if (PREDICT_FALSE(bytes < 0)) { + // This means the bitshuffle function fails. + // Ideally, this should not happen. + warn_with_bitshuffle_error(bytes); + // It does not matter what will be returned here, + // since we have logged fatal in warn_with_bitshuffle_error(). + return Slice(); + } + encode_fixed32_le(&_buffer[4], BITSHUFFLE_PAGE_HEADER_SIZE + bytes); + encode_fixed32_le(&_buffer[8], num_elems_after_padding); + encode_fixed32_le(&_buffer[12], final_size_of_type); + _finished = true; + return Slice(_buffer.data(), BITSHUFFLE_PAGE_HEADER_SIZE + bytes); + } + + typedef typename TypeTraits::CppType CppType; + + CppType cell(int idx) const { + DCHECK_GE(idx, 0); + CppType ret; + memcpy(&ret, &_data[idx * SIZE_OF_TYPE], sizeof(CppType)); + return ret; + } + + enum { + SIZE_OF_TYPE = TypeTraits::size + }; + PageBuilderOptions _options; + uint32_t _count; + int _remain_element_capacity; + bool _finished; + faststring _data; + faststring _buffer; +}; + +template +class BitShufflePageDecoder : public PageDecoder { +public: + BitShufflePageDecoder(Slice data, const PageDecoderOptions& options) : _data(data), + _options(options), + _parsed(false), + _num_elements(0), + _compressed_size(0), + _num_element_after_padding(0), + _size_of_element(0), + _cur_index(0) { } + + Status init() override { + CHECK(!_parsed); + if (_data.size < BITSHUFFLE_PAGE_HEADER_SIZE) { + std::stringstream ss; + ss << "file corrupton: invalid data size:" << _data.size << ", header size:" << BITSHUFFLE_PAGE_HEADER_SIZE; + return Status::InternalError(ss.str()); + } + _num_elements = decode_fixed32_le((const uint8_t*)&_data[0]); + _compressed_size = decode_fixed32_le((const uint8_t*)&_data[4]); + if (_compressed_size != _data.size) { + std::stringstream ss; + ss << "Size information unmatched, _compressed_size:" << _compressed_size + << ", _num_elements:" << _num_elements + << ", data size:" << _data.size; + return Status::InternalError(ss.str()); + } + _num_element_after_padding = decode_fixed32_le((const uint8_t*)&_data[8]); + if (_num_element_after_padding != ALIGN_UP(_num_elements, 8)) { + std::stringstream ss; + ss << "num of element information corrupted," + << " _num_element_after_padding:" << _num_element_after_padding + << ", _num_elements:" << _num_elements; + return Status::InternalError(ss.str()); + } + _size_of_element = decode_fixed32_le((const uint8_t*)&_data[12]); + switch (_size_of_element) { + case 1: + case 2: + case 3: + case 4: + case 8: + case 12: + case 16: + break; + default: + std::stringstream ss; + ss << "invalid size_of_elem:" << _size_of_element; + return Status::InternalError(ss.str()); + } + + // Currently, only the UINT32 block encoder supports expanding size: + if (UNLIKELY(Type != OLAP_FIELD_TYPE_UNSIGNED_INT && _size_of_element != SIZE_OF_TYPE)) { + std::stringstream ss; + ss << "invalid size info. size of element:" << _size_of_element + << ", SIZE_OF_TYPE:" << SIZE_OF_TYPE + << ", type:" << Type; + return Status::InternalError(ss.str()); + } + if (UNLIKELY(_size_of_element > SIZE_OF_TYPE)) { + std::stringstream ss; + ss << "invalid size info. size of element:" << _size_of_element + << ", SIZE_OF_TYPE:" << SIZE_OF_TYPE; + return Status::InternalError(ss.str()); + } + + RETURN_IF_ERROR(_decode()); + _parsed = true; + return Status::OK(); + } + + Status seek_to_position_in_page(size_t pos) override { + DCHECK(_parsed) << "Must call init()"; + if (PREDICT_FALSE(_num_elements == 0)) { + DCHECK_EQ(0, pos); + return Status::InvalidArgument("invalid pos"); + } + + DCHECK_LE(pos, _num_elements); + _cur_index = pos; + return Status::OK(); + } + + Status next_batch(size_t* n, ColumnBlockView* dst) override { + DCHECK(_parsed); + if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) { + *n = 0; + return Status::OK(); + } + + size_t max_fetch = std::min(*n, static_cast(_num_elements - _cur_index)); + _copy_next_values(max_fetch, dst->data()); + *n = max_fetch; + _cur_index += max_fetch; + + return Status::OK(); + } + + size_t count() const override { + return _num_elements; + } + + size_t current_index() const override { + return _cur_index; + } + +private: + void _copy_next_values(size_t n, void* data) { + memcpy(data, &_decoded[_cur_index * SIZE_OF_TYPE], n * SIZE_OF_TYPE); + } + + Status _decode() { + if (_num_elements > 0) { + int64_t bytes; + _decoded.resize(_num_element_after_padding * _size_of_element); + char* in = const_cast(&_data[BITSHUFFLE_PAGE_HEADER_SIZE]); + bytes = bitshuffle::decompress_lz4(in, _decoded.data(), _num_element_after_padding, + _size_of_element, 0); + if (PREDICT_FALSE(bytes < 0)) { + // Ideally, this should not happen. + warn_with_bitshuffle_error(bytes); + return Status::RuntimeError("Unshuffle Process failed"); + } + } + return Status::OK(); + } + + typedef typename TypeTraits::CppType CppType; + + enum { + SIZE_OF_TYPE = TypeTraits::size + }; + + Slice _data; + PageDecoderOptions _options; + bool _parsed; + size_t _num_elements; + size_t _compressed_size; + size_t _num_element_after_padding; + + int _size_of_element; + size_t _cur_index; + faststring _decoded; +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp index 36ceb8ce39..22c280ae1a 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp +++ b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp @@ -1,81 +1,81 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "olap/rowset/segment_v2/bitshuffle_wrapper.h" - -// Include the bitshuffle header once to get the default (non-AVX2) -// symbols. -#include - -#include "gutil/cpu.h" - -// Include the bitshuffle header again, but this time importing the -// AVX2-compiled symbols by defining some macros. -#undef BITSHUFFLE_H -#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_avx2 -#define bshuf_compress_lz4 bshuf_compress_lz4_avx2 -#define bshuf_decompress_lz4 bshuf_decompress_lz4_avx2 -#include // NOLINT(*) -#undef bshuf_compress_lz4_bound -#undef bshuf_compress_lz4 -#undef bshuf_decompress_lz4 - -using base::CPU; - -namespace doris { -namespace bitshuffle { - -// Function pointers which will be assigned the correct implementation -// for the runtime architecture. -namespace { -decltype(&bshuf_compress_lz4_bound) g_bshuf_compress_lz4_bound; -decltype(&bshuf_compress_lz4) g_bshuf_compress_lz4; -decltype(&bshuf_decompress_lz4) g_bshuf_decompress_lz4; -} // anonymous namespace - -// When this translation unit is initialized, figure out the current CPU and -// assign the correct function for this architecture. -// -// This avoids an expensive 'cpuid' call in the hot path, and also avoids -// the cost of a 'std::once' call. -__attribute__((constructor)) -void SelectBitshuffleFunctions() { - if (CPU().has_avx2()) { - g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_avx2; - g_bshuf_compress_lz4 = bshuf_compress_lz4_avx2; - g_bshuf_decompress_lz4 = bshuf_decompress_lz4_avx2; - } else { - g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound; - g_bshuf_compress_lz4 = bshuf_compress_lz4; - g_bshuf_decompress_lz4 = bshuf_decompress_lz4; - } -} - -int64_t compress_lz4(void* in, void* out, size_t size, - size_t elem_size, size_t block_size) { - return g_bshuf_compress_lz4(in, out, size, elem_size, block_size); -} -int64_t decompress_lz4(void* in, void* out, size_t size, - size_t elem_size, size_t block_size) { - return g_bshuf_decompress_lz4(in, out, size, elem_size, block_size); -} -size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size) { - return g_bshuf_compress_lz4_bound(size, elem_size, block_size); -} - -} // namespace bitshuffle -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/bitshuffle_wrapper.h" + +// Include the bitshuffle header once to get the default (non-AVX2) +// symbols. +#include + +#include "gutil/cpu.h" + +// Include the bitshuffle header again, but this time importing the +// AVX2-compiled symbols by defining some macros. +#undef BITSHUFFLE_H +#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_avx2 +#define bshuf_compress_lz4 bshuf_compress_lz4_avx2 +#define bshuf_decompress_lz4 bshuf_decompress_lz4_avx2 +#include // NOLINT(*) +#undef bshuf_compress_lz4_bound +#undef bshuf_compress_lz4 +#undef bshuf_decompress_lz4 + +using base::CPU; + +namespace doris { +namespace bitshuffle { + +// Function pointers which will be assigned the correct implementation +// for the runtime architecture. +namespace { +decltype(&bshuf_compress_lz4_bound) g_bshuf_compress_lz4_bound; +decltype(&bshuf_compress_lz4) g_bshuf_compress_lz4; +decltype(&bshuf_decompress_lz4) g_bshuf_decompress_lz4; +} // anonymous namespace + +// When this translation unit is initialized, figure out the current CPU and +// assign the correct function for this architecture. +// +// This avoids an expensive 'cpuid' call in the hot path, and also avoids +// the cost of a 'std::once' call. +__attribute__((constructor)) +void SelectBitshuffleFunctions() { + if (CPU().has_avx2()) { + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_avx2; + g_bshuf_compress_lz4 = bshuf_compress_lz4_avx2; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4_avx2; + } else { + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound; + g_bshuf_compress_lz4 = bshuf_compress_lz4; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4; + } +} + +int64_t compress_lz4(void* in, void* out, size_t size, + size_t elem_size, size_t block_size) { + return g_bshuf_compress_lz4(in, out, size, elem_size, block_size); +} +int64_t decompress_lz4(void* in, void* out, size_t size, + size_t elem_size, size_t block_size) { + return g_bshuf_decompress_lz4(in, out, size, elem_size, block_size); +} +size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size) { + return g_bshuf_compress_lz4_bound(size, elem_size, block_size); +} + +} // namespace bitshuffle +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h index 38c1e7231f..4846438130 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h +++ b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.h @@ -1,34 +1,34 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -// This namespace has wrappers for the Bitshuffle library which do runtime dispatch to -// either AVX2-accelerated or regular SSE2 implementations based on the available CPU. -namespace doris { -namespace bitshuffle { - -// See for documentation on these functions. -size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size); -int64_t compress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); -int64_t decompress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); - -} // namespace bitshuffle -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +// This namespace has wrappers for the Bitshuffle library which do runtime dispatch to +// either AVX2-accelerated or regular SSE2 implementations based on the available CPU. +namespace doris { +namespace bitshuffle { + +// See for documentation on these functions. +size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size); +int64_t compress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); +int64_t decompress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); + +} // namespace bitshuffle +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/page_builder.h b/be/src/olap/rowset/segment_v2/page_builder.h index 4ef0701588..c2cc0eb813 100644 --- a/be/src/olap/rowset/segment_v2/page_builder.h +++ b/be/src/olap/rowset/segment_v2/page_builder.h @@ -1,87 +1,87 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "gutil/macros.h" -#include "util/slice.h" -#include "common/status.h" -#include "olap/rowset/segment_v2/common.h" - -namespace doris { -namespace segment_v2 { - -// PageBuilder is used to build page -// Page is a data management unit, including: -// 1. Data Page: store encoded and compressed data -// 2. BloomFilter Page: store bloom filter of data -// 3. Ordinal Index Page: store ordinal index of data -// 4. Short Key Index Page: store short key index of data -// 5. Bitmap Index Page: store bitmap index of data -class PageBuilder { -public: - PageBuilder() { } - - virtual ~PageBuilder() { } - - // Used by column writer to determine whether the current page is full. - // Column writer depends on the result to decide whether to flush current page. - virtual bool is_page_full() = 0; - - // Add a sequence of values to the page. - // The number of values actually added will be returned through count, which may be less - // than requested if the page is full. - // - // vals size should be decided according to the page build type - virtual doris::Status add(const uint8_t* vals, size_t* count) = 0; - - // Get the dictionary page for dictionary encoding mode column. - virtual Status get_dictionary_page(Slice* dictionary_page) { - return Status::NotSupported("get_dictionary_page not implemented"); - } - - // Return a Slice which represents the encoded data of current page. - // - // This Slice points to internal data of this builder. - virtual Slice finish() = 0; - - // Reset the internal state of the page builder. - // - // Any data previously returned by finish may be invalidated by this call. - virtual void reset() = 0; - - // Return the number of entries that have been added to the page. - virtual size_t count() const = 0; - - // Return the total bytes of pageBuilder that have been added to the page. - virtual uint64_t size() const = 0; - - // This api is for release the resource owned by builder - // It means it will transfer the ownership of some resource to other. - // This api is always called after finish - // and should be followed by reset() before reuse the builder - virtual void release() = 0; - -private: - DISALLOW_COPY_AND_ASSIGN(PageBuilder); -}; - -} // namespace segment_v2 -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "gutil/macros.h" +#include "util/slice.h" +#include "common/status.h" +#include "olap/rowset/segment_v2/common.h" + +namespace doris { +namespace segment_v2 { + +// PageBuilder is used to build page +// Page is a data management unit, including: +// 1. Data Page: store encoded and compressed data +// 2. BloomFilter Page: store bloom filter of data +// 3. Ordinal Index Page: store ordinal index of data +// 4. Short Key Index Page: store short key index of data +// 5. Bitmap Index Page: store bitmap index of data +class PageBuilder { +public: + PageBuilder() { } + + virtual ~PageBuilder() { } + + // Used by column writer to determine whether the current page is full. + // Column writer depends on the result to decide whether to flush current page. + virtual bool is_page_full() = 0; + + // Add a sequence of values to the page. + // The number of values actually added will be returned through count, which may be less + // than requested if the page is full. + // + // vals size should be decided according to the page build type + virtual doris::Status add(const uint8_t* vals, size_t* count) = 0; + + // Get the dictionary page for dictionary encoding mode column. + virtual Status get_dictionary_page(Slice* dictionary_page) { + return Status::NotSupported("get_dictionary_page not implemented"); + } + + // Return a Slice which represents the encoded data of current page. + // + // This Slice points to internal data of this builder. + virtual Slice finish() = 0; + + // Reset the internal state of the page builder. + // + // Any data previously returned by finish may be invalidated by this call. + virtual void reset() = 0; + + // Return the number of entries that have been added to the page. + virtual size_t count() const = 0; + + // Return the total bytes of pageBuilder that have been added to the page. + virtual uint64_t size() const = 0; + + // This api is for release the resource owned by builder + // It means it will transfer the ownership of some resource to other. + // This api is always called after finish + // and should be followed by reset() before reuse the builder + virtual void release() = 0; + +private: + DISALLOW_COPY_AND_ASSIGN(PageBuilder); +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/page_decoder.h b/be/src/olap/rowset/segment_v2/page_decoder.h index a6e4c47ef2..490f45f498 100644 --- a/be/src/olap/rowset/segment_v2/page_decoder.h +++ b/be/src/olap/rowset/segment_v2/page_decoder.h @@ -1,79 +1,79 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "olap/column_block.h" // for ColumnBlockView -#include "olap/rowset/segment_v2/common.h" // for rowid_t -#include "common/status.h" // for Status - -namespace doris { -namespace segment_v2 { - -// PageDecoder is used to decode page. -class PageDecoder { -public: - PageDecoder() { } - - virtual ~PageDecoder() { } - - // Call this to do some preparation for decoder. - // eg: parse data page header - virtual Status init() = 0; - - // Seek the decoder to the given positional index of the page. - // For example, seek_to_position_in_page(0) seeks to the first - // stored entry. - // - // It is an error to call this with a value larger than Count(). - // Doing so has undefined results. - virtual Status seek_to_position_in_page(size_t pos) = 0; - - // Seek the decoder forward by a given number of rows, or to the end - // of the page. This is primarily used to skip over data. - // - // Return the step skipped. - virtual size_t seek_forward(size_t n) { - size_t step = std::min(n, count() - current_index()); - DCHECK_GE(step, 0); - seek_to_position_in_page(current_index() + step); - return step; - } - - // Fetch the next vector of values from the page into 'column_vector_view'. - // The output vector must have space for up to n cells. - // - // Return the size of read entries . - // - // In the case that the values are themselves references - // to other memory (eg Slices), the referred-to memory is - // allocated in the column_vector_view's mem_pool. - virtual Status next_batch(size_t* n, ColumnBlockView* dst) = 0; - - // Return the number of elements in this page. - virtual size_t count() const = 0; - - // Return the position within the page of the currently seeked - // entry (ie the entry that will next be returned by next_vector()) - virtual size_t current_index() const = 0; - -private: - DISALLOW_COPY_AND_ASSIGN(PageDecoder); -}; - -} // namespace segment_v2 -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "olap/column_block.h" // for ColumnBlockView +#include "olap/rowset/segment_v2/common.h" // for rowid_t +#include "common/status.h" // for Status + +namespace doris { +namespace segment_v2 { + +// PageDecoder is used to decode page. +class PageDecoder { +public: + PageDecoder() { } + + virtual ~PageDecoder() { } + + // Call this to do some preparation for decoder. + // eg: parse data page header + virtual Status init() = 0; + + // Seek the decoder to the given positional index of the page. + // For example, seek_to_position_in_page(0) seeks to the first + // stored entry. + // + // It is an error to call this with a value larger than Count(). + // Doing so has undefined results. + virtual Status seek_to_position_in_page(size_t pos) = 0; + + // Seek the decoder forward by a given number of rows, or to the end + // of the page. This is primarily used to skip over data. + // + // Return the step skipped. + virtual size_t seek_forward(size_t n) { + size_t step = std::min(n, count() - current_index()); + DCHECK_GE(step, 0); + seek_to_position_in_page(current_index() + step); + return step; + } + + // Fetch the next vector of values from the page into 'column_vector_view'. + // The output vector must have space for up to n cells. + // + // Return the size of read entries . + // + // In the case that the values are themselves references + // to other memory (eg Slices), the referred-to memory is + // allocated in the column_vector_view's mem_pool. + virtual Status next_batch(size_t* n, ColumnBlockView* dst) = 0; + + // Return the number of elements in this page. + virtual size_t count() const = 0; + + // Return the position within the page of the currently seeked + // entry (ie the entry that will next be returned by next_vector()) + virtual size_t current_index() const = 0; + +private: + DISALLOW_COPY_AND_ASSIGN(PageDecoder); +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/rle_page.h b/be/src/olap/rowset/segment_v2/rle_page.h index 36817b7cfd..46fb197312 100644 --- a/be/src/olap/rowset/segment_v2/rle_page.h +++ b/be/src/olap/rowset/segment_v2/rle_page.h @@ -1,256 +1,256 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "olap/rowset/segment_v2/page_builder.h" // for PageBuilder -#include "olap/rowset/segment_v2/page_decoder.h" // for PageDecoder -#include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions/PageDecoderOptions -#include "olap/rowset/segment_v2/common.h" // for rowid_t -#include "util/rle_encoding.h" // for RleEncoder/RleDecoder -#include "util/coding.h" // for encode_fixed32_le/decode_fixed32_le - -namespace doris { -namespace segment_v2 { - -enum { - RLE_PAGE_HEADER_SIZE = 4 -}; - -// RLE builder for generic integer and bool types. What is missing is some way -// to enforce that this can only be instantiated for INT and BOOL types. -// -// The page format is as follows: -// -// 1. Header: (4 bytes total) -// -// [32-bit] -// The number of elements encoded in the page. -// -// NOTE: all on-disk ints are encoded little-endian -// -// 2. Element data -// -// The header is followed by the rle-encoded element data. -// -// This Rle encoding algorithm is only effective for repeated INT type and bool type, -// It is not good for sequence number or random number. BitshufflePage is recommended -// for these case. -// -// TODO(hkp): optimize rle algorithm -template -class RlePageBuilder : public PageBuilder { -public: - RlePageBuilder(const PageBuilderOptions& options) : - _options(options), - _count(0), - _finished(false), - _bit_width(0), - _rle_encoder(nullptr) { - switch(Type) { - case OLAP_FIELD_TYPE_BOOL: { - _bit_width = 1; - break; - } - default: { - _bit_width = SIZE_OF_TYPE * 8; - break; - } - } - _rle_encoder = new RleEncoder(&_buf, _bit_width); - reset(); - } - - ~RlePageBuilder() { - delete _rle_encoder; - } - - bool is_page_full() override { - return _rle_encoder->len() >= _options.data_page_size; - } - - Status add(const uint8_t* vals, size_t* count) override { - DCHECK(!_finished); - DCHECK_EQ(reinterpret_cast(vals) & (alignof(CppType) - 1), 0) - << "Pointer passed to Add() must be naturally-aligned"; - - const CppType* new_vals = reinterpret_cast(vals); - for (int i = 0; i < *count; ++i) { - _rle_encoder->Put(new_vals[i]); - } - - _count += *count; - return Status::OK(); - } - - Slice finish() override { - _finished = true; - // here should Flush first and then encode the count header - // or it will lead to a bug if the header is less than 8 byte and the data is small - _rle_encoder->Flush(); - encode_fixed32_le(&_buf[0], _count); - return Slice(_buf.data(), _buf.size()); - } - - void reset() override { - _count = 0; - _rle_encoder->Clear(); - _rle_encoder->Reserve(RLE_PAGE_HEADER_SIZE, 0); - } - - size_t count() const override { - return _count; - } - - uint64_t size() const override { - return _rle_encoder->len(); - } - - // this api will release the memory ownership of encoded data - // Note: - // release() should be called after finish - // reset() should be called after this function before reuse the builder - void release() override { - uint8_t* ret = _buf.release(); - (void)ret; - } - -private: - typedef typename TypeTraits::CppType CppType; - enum { - SIZE_OF_TYPE = TypeTraits::size - }; - - PageBuilderOptions _options; - size_t _count; - bool _finished; - int _bit_width; - RleEncoder* _rle_encoder; - faststring _buf; -}; - -template -class RlePageDecoder : public PageDecoder { -public: - RlePageDecoder(Slice slice, const PageDecoderOptions& options) : - _data(slice), - _options(options), - _parsed(false), - _num_elements(0), - _cur_index(0), - _bit_width(0) { } - - Status init() override { - CHECK(!_parsed); - - if (_data.size < RLE_PAGE_HEADER_SIZE) { - return Status::Corruption( - "not enough bytes for header in RleBitMapBlockDecoder"); - } - _num_elements = decode_fixed32_le((const uint8_t*)&_data[0]); - - _parsed = true; - - switch(Type) { - case OLAP_FIELD_TYPE_BOOL: { - _bit_width = 1; - break; - } - default: { - _bit_width = SIZE_OF_TYPE * 8; - break; - } - } - - _rle_decoder = RleDecoder((uint8_t*)_data.data + RLE_PAGE_HEADER_SIZE, - _data.size - RLE_PAGE_HEADER_SIZE, _bit_width); - - seek_to_position_in_page(0); - return Status::OK(); - } - - Status seek_to_position_in_page(size_t pos) override { - DCHECK(_parsed) << "Must call init()"; - DCHECK_LE(pos, _num_elements) << "Tried to seek to " << pos << " which is > number of elements (" - << _num_elements << ") in the block!"; - // If the block is empty (e.g. the column is filled with nulls), there is no data to seek. - if (PREDICT_FALSE(_num_elements == 0)) { - return Status::OK(); - } - if (_cur_index == pos) { - // No need to seek. - return Status::OK(); - } else if (_cur_index < pos) { - uint nskip = pos - _cur_index; - _rle_decoder.Skip(nskip); - } else { - _rle_decoder = RleDecoder((uint8_t*)_data.data + RLE_PAGE_HEADER_SIZE, - _data.size - RLE_PAGE_HEADER_SIZE, _bit_width); - _rle_decoder.Skip(pos); - } - _cur_index = pos; - return Status::OK(); - } - - Status next_batch(size_t* n, ColumnBlockView* dst) override { - DCHECK(_parsed); - if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) { - *n = 0; - return Status::OK(); - } - - size_t to_fetch = std::min(*n, static_cast(_num_elements - _cur_index)); - size_t remaining = to_fetch; - uint8_t* data_ptr = dst->data(); - bool result = false; - while (remaining > 0) { - result = _rle_decoder.Get(reinterpret_cast(data_ptr)); - DCHECK(result); - remaining--; - data_ptr += SIZE_OF_TYPE; - } - - _cur_index += to_fetch; - *n = to_fetch; - return Status::OK(); - } - - size_t count() const override { - return _num_elements; - } - - size_t current_index() const override { - return _cur_index; - } - -private: - typedef typename TypeTraits::CppType CppType; - enum { - SIZE_OF_TYPE = TypeTraits::size - }; - - Slice _data; - PageDecoderOptions _options; - bool _parsed; - uint32_t _num_elements; - size_t _cur_index; - int _bit_width; - RleDecoder _rle_decoder; -}; - -} // namespace segment_v2 -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "olap/rowset/segment_v2/page_builder.h" // for PageBuilder +#include "olap/rowset/segment_v2/page_decoder.h" // for PageDecoder +#include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions/PageDecoderOptions +#include "olap/rowset/segment_v2/common.h" // for rowid_t +#include "util/rle_encoding.h" // for RleEncoder/RleDecoder +#include "util/coding.h" // for encode_fixed32_le/decode_fixed32_le + +namespace doris { +namespace segment_v2 { + +enum { + RLE_PAGE_HEADER_SIZE = 4 +}; + +// RLE builder for generic integer and bool types. What is missing is some way +// to enforce that this can only be instantiated for INT and BOOL types. +// +// The page format is as follows: +// +// 1. Header: (4 bytes total) +// +// [32-bit] +// The number of elements encoded in the page. +// +// NOTE: all on-disk ints are encoded little-endian +// +// 2. Element data +// +// The header is followed by the rle-encoded element data. +// +// This Rle encoding algorithm is only effective for repeated INT type and bool type, +// It is not good for sequence number or random number. BitshufflePage is recommended +// for these case. +// +// TODO(hkp): optimize rle algorithm +template +class RlePageBuilder : public PageBuilder { +public: + RlePageBuilder(const PageBuilderOptions& options) : + _options(options), + _count(0), + _finished(false), + _bit_width(0), + _rle_encoder(nullptr) { + switch(Type) { + case OLAP_FIELD_TYPE_BOOL: { + _bit_width = 1; + break; + } + default: { + _bit_width = SIZE_OF_TYPE * 8; + break; + } + } + _rle_encoder = new RleEncoder(&_buf, _bit_width); + reset(); + } + + ~RlePageBuilder() { + delete _rle_encoder; + } + + bool is_page_full() override { + return _rle_encoder->len() >= _options.data_page_size; + } + + Status add(const uint8_t* vals, size_t* count) override { + DCHECK(!_finished); + DCHECK_EQ(reinterpret_cast(vals) & (alignof(CppType) - 1), 0) + << "Pointer passed to Add() must be naturally-aligned"; + + const CppType* new_vals = reinterpret_cast(vals); + for (int i = 0; i < *count; ++i) { + _rle_encoder->Put(new_vals[i]); + } + + _count += *count; + return Status::OK(); + } + + Slice finish() override { + _finished = true; + // here should Flush first and then encode the count header + // or it will lead to a bug if the header is less than 8 byte and the data is small + _rle_encoder->Flush(); + encode_fixed32_le(&_buf[0], _count); + return Slice(_buf.data(), _buf.size()); + } + + void reset() override { + _count = 0; + _rle_encoder->Clear(); + _rle_encoder->Reserve(RLE_PAGE_HEADER_SIZE, 0); + } + + size_t count() const override { + return _count; + } + + uint64_t size() const override { + return _rle_encoder->len(); + } + + // this api will release the memory ownership of encoded data + // Note: + // release() should be called after finish + // reset() should be called after this function before reuse the builder + void release() override { + uint8_t* ret = _buf.release(); + (void)ret; + } + +private: + typedef typename TypeTraits::CppType CppType; + enum { + SIZE_OF_TYPE = TypeTraits::size + }; + + PageBuilderOptions _options; + size_t _count; + bool _finished; + int _bit_width; + RleEncoder* _rle_encoder; + faststring _buf; +}; + +template +class RlePageDecoder : public PageDecoder { +public: + RlePageDecoder(Slice slice, const PageDecoderOptions& options) : + _data(slice), + _options(options), + _parsed(false), + _num_elements(0), + _cur_index(0), + _bit_width(0) { } + + Status init() override { + CHECK(!_parsed); + + if (_data.size < RLE_PAGE_HEADER_SIZE) { + return Status::Corruption( + "not enough bytes for header in RleBitMapBlockDecoder"); + } + _num_elements = decode_fixed32_le((const uint8_t*)&_data[0]); + + _parsed = true; + + switch(Type) { + case OLAP_FIELD_TYPE_BOOL: { + _bit_width = 1; + break; + } + default: { + _bit_width = SIZE_OF_TYPE * 8; + break; + } + } + + _rle_decoder = RleDecoder((uint8_t*)_data.data + RLE_PAGE_HEADER_SIZE, + _data.size - RLE_PAGE_HEADER_SIZE, _bit_width); + + seek_to_position_in_page(0); + return Status::OK(); + } + + Status seek_to_position_in_page(size_t pos) override { + DCHECK(_parsed) << "Must call init()"; + DCHECK_LE(pos, _num_elements) << "Tried to seek to " << pos << " which is > number of elements (" + << _num_elements << ") in the block!"; + // If the block is empty (e.g. the column is filled with nulls), there is no data to seek. + if (PREDICT_FALSE(_num_elements == 0)) { + return Status::OK(); + } + if (_cur_index == pos) { + // No need to seek. + return Status::OK(); + } else if (_cur_index < pos) { + uint nskip = pos - _cur_index; + _rle_decoder.Skip(nskip); + } else { + _rle_decoder = RleDecoder((uint8_t*)_data.data + RLE_PAGE_HEADER_SIZE, + _data.size - RLE_PAGE_HEADER_SIZE, _bit_width); + _rle_decoder.Skip(pos); + } + _cur_index = pos; + return Status::OK(); + } + + Status next_batch(size_t* n, ColumnBlockView* dst) override { + DCHECK(_parsed); + if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) { + *n = 0; + return Status::OK(); + } + + size_t to_fetch = std::min(*n, static_cast(_num_elements - _cur_index)); + size_t remaining = to_fetch; + uint8_t* data_ptr = dst->data(); + bool result = false; + while (remaining > 0) { + result = _rle_decoder.Get(reinterpret_cast(data_ptr)); + DCHECK(result); + remaining--; + data_ptr += SIZE_OF_TYPE; + } + + _cur_index += to_fetch; + *n = to_fetch; + return Status::OK(); + } + + size_t count() const override { + return _num_elements; + } + + size_t current_index() const override { + return _cur_index; + } + +private: + typedef typename TypeTraits::CppType CppType; + enum { + SIZE_OF_TYPE = TypeTraits::size + }; + + Slice _data; + PageDecoderOptions _options; + bool _parsed; + uint32_t _num_elements; + size_t _cur_index; + int _bit_width; + RleDecoder _rle_decoder; +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/udf/CMakeLists.txt b/be/src/udf/CMakeLists.txt index 1587d0176f..c8a5b05d67 100755 --- a/be/src/udf/CMakeLists.txt +++ b/be/src/udf/CMakeLists.txt @@ -15,43 +15,43 @@ # specific language governing permissions and limitations # under the License. -# where to put generated libraries +# where to put generated libraries set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/udf") - -# where to put generated binaries -set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/udf") - -# Build this library twice. Once to be linked into the main Doris. This version -# can have dependencies on our other libs. The second version is shipped as part -# of the UDF sdk, which can't use other libs. -add_library(Udf udf.cpp udf_ir.cpp) -add_library(DorisUdf udf.cpp udf_ir.cpp) -set_target_properties(DorisUdf PROPERTIES COMPILE_FLAGS "-DDORIS_UDF_SDK_BUILD") - -# We can't use the normal link list since we want to pick up libDorisUdf (the external -# library) rather than the interal libUdf. -set (UDF_TEST_LINK_LIBS - -Wl,--start-group - Common - GlobalFlags - DorisUdf - Runtime - Util - -Wl,--end-group -# Below are all external dependencies. They should some after the doris libs. - ${Boost_LIBRARIES} - glogstatic - gflagsstatic - -lboost_date_time - gtest) - +set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/udf") + +# where to put generated binaries +set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/udf") + +# Build this library twice. Once to be linked into the main Doris. This version +# can have dependencies on our other libs. The second version is shipped as part +# of the UDF sdk, which can't use other libs. +add_library(Udf udf.cpp udf_ir.cpp) +add_library(DorisUdf udf.cpp udf_ir.cpp) +set_target_properties(DorisUdf PROPERTIES COMPILE_FLAGS "-DDORIS_UDF_SDK_BUILD") + +# We can't use the normal link list since we want to pick up libDorisUdf (the external +# library) rather than the interal libUdf. +set (UDF_TEST_LINK_LIBS + -Wl,--start-group + Common + GlobalFlags + DorisUdf + Runtime + Util + -Wl,--end-group +# Below are all external dependencies. They should some after the doris libs. + ${Boost_LIBRARIES} + glogstatic + gflagsstatic + -lboost_date_time + gtest) + set_target_properties(DorisUdf PROPERTIES PUBLIC_HEADER "udf.h;uda_test_harness.h") INSTALL(TARGETS DorisUdf ARCHIVE DESTINATION ${OUTPUT_DIR}/udf LIBRARY DESTINATION ${OUTPUT_DIR}/udf/lib PUBLIC_HEADER DESTINATION ${OUTPUT_DIR}/udf/include) -#ADD_BE_TEST(udf_test) -#ADD_BE_TEST(uda_test) +#ADD_BE_TEST(udf_test) +#ADD_BE_TEST(uda_test) diff --git a/be/src/util/alignment.h b/be/src/util/alignment.h index e1cc759d71..43802805fb 100644 --- a/be/src/util/alignment.h +++ b/be/src/util/alignment.h @@ -1,26 +1,26 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// -// Macros for dealing with memory alignment. -#pragma once - -// Round down 'x' to the nearest 'align' boundary -#define ALIGN_DOWN(x, align) ((x) & (~(align) + 1)) - -// Round up 'x' to the nearest 'align' boundary -#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~(align) + 1)) - +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Macros for dealing with memory alignment. +#pragma once + +// Round down 'x' to the nearest 'align' boundary +#define ALIGN_DOWN(x, align) ((x) & (~(align) + 1)) + +// Round up 'x' to the nearest 'align' boundary +#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~(align) + 1)) + diff --git a/be/src/util/bit_stream_utils.h b/be/src/util/bit_stream_utils.h index 220c8cb4f1..cc463c346f 100644 --- a/be/src/util/bit_stream_utils.h +++ b/be/src/util/bit_stream_utils.h @@ -1,149 +1,149 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#pragma once - -#include "gutil/port.h" -#include "util/bit_util.h" -#include "util/faststring.h" - -using doris::BitUtil; - -namespace doris { - -// Utility class to write bit/byte streams. This class can write data to either be -// bit packed or byte aligned (and a single stream that has a mix of both). -class BitWriter { - public: - // buffer: buffer to write bits to. - explicit BitWriter(faststring *buffer) - : buffer_(buffer) { - Clear(); - } - - void Clear() { - buffered_values_ = 0; - byte_offset_ = 0; - bit_offset_ = 0; - buffer_->clear(); - } - - // Returns a pointer to the underlying buffer - faststring *buffer() const { return buffer_; } - - // The number of current bytes written, including the current byte (i.e. may include a - // fraction of a byte). Includes buffered values. - int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); } - - // Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit - // packed. - void PutValue(uint64_t v, int num_bits); - - // Writes v to the next aligned byte using num_bits. If T is larger than num_bits, the - // extra high-order bits will be ignored. - template - void PutAligned(T v, int num_bits); - - // Write a Vlq encoded int to the buffer. The value is written byte aligned. - // For more details on vlq: en.wikipedia.org/wiki/Variable-length_quantity - void PutVlqInt(int32_t v); - - // Get the index to the next aligned byte and advance the underlying buffer by num_bytes. - size_t GetByteIndexAndAdvance(int num_bytes) { - uint8_t* ptr = GetNextBytePtr(num_bytes); - return ptr - buffer_->data(); - } - - // Get a pointer to the next aligned byte and advance the underlying buffer by num_bytes. - uint8_t* GetNextBytePtr(int num_bytes); - - // Flushes all buffered values to the buffer. Call this when done writing to the buffer. - // If 'align' is true, buffered_values_ is reset and any future writes will be written - // to the next byte boundary. - void Flush(bool align = false); - - private: - // Bit-packed values are initially written to this variable before being memcpy'd to - // buffer_. This is faster than writing values byte by byte directly to buffer_. - uint64_t buffered_values_; - - faststring *buffer_; - int byte_offset_; // Offset in buffer_ - int bit_offset_; // Offset in buffered_values_ -}; - -// Utility class to read bit/byte stream. This class can read bits or bytes -// that are either byte aligned or not. It also has utilities to read multiple -// bytes in one read (e.g. encoded int). -class BitReader { - public: - // 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. - BitReader(const uint8_t* buffer, int buffer_len); - - BitReader() : buffer_(NULL), max_bytes_(0) {} - - // Gets the next value from the buffer. Returns true if 'v' could be read or false if - // there are not enough bytes left. num_bits must be <= 32. - template - bool GetValue(int num_bits, T* v); - - // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a - // little-endian native type and big enough to store 'num_bytes'. The value is assumed - // to be byte-aligned so the stream will be advanced to the start of the next byte - // before 'v' is read. Returns false if there are not enough bytes left. - template - bool GetAligned(int num_bytes, T* v); - - // Reads a vlq encoded int from the stream. The encoded int must start at the - // beginning of a byte. Return false if there were not enough bytes in the buffer. - bool GetVlqInt(int32_t* v); - - // Returns the number of bytes left in the stream, not including the current byte (i.e., - // there may be an additional fraction of a byte). - int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } - - // Current position in the stream, by bit. - int position() const { return byte_offset_ * 8 + bit_offset_; } - - // Rewind the stream by 'num_bits' bits - void Rewind(int num_bits); - - // Seek to a specific bit in the buffer - void SeekToBit(uint stream_position); - - // Maximum byte length of a vlq encoded int - static const int MAX_VLQ_BYTE_LEN = 5; - - bool is_initialized() const { return buffer_ != NULL; } - - private: - // Used by SeekToBit() and GetValue() to fetch the - // the next word into buffer_. - void BufferValues(); - - const uint8_t* buffer_; - int max_bytes_; - - // Bytes are memcpy'd from buffer_ and values are read from this variable. This is - // faster than reading values byte by byte directly from buffer_. - uint64_t buffered_values_; - - int byte_offset_; // Offset in buffer_ - int bit_offset_; // Offset in buffered_values_ -}; - -} // namespace doris - +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include "gutil/port.h" +#include "util/bit_util.h" +#include "util/faststring.h" + +using doris::BitUtil; + +namespace doris { + +// Utility class to write bit/byte streams. This class can write data to either be +// bit packed or byte aligned (and a single stream that has a mix of both). +class BitWriter { + public: + // buffer: buffer to write bits to. + explicit BitWriter(faststring *buffer) + : buffer_(buffer) { + Clear(); + } + + void Clear() { + buffered_values_ = 0; + byte_offset_ = 0; + bit_offset_ = 0; + buffer_->clear(); + } + + // Returns a pointer to the underlying buffer + faststring *buffer() const { return buffer_; } + + // The number of current bytes written, including the current byte (i.e. may include a + // fraction of a byte). Includes buffered values. + int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); } + + // Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit + // packed. + void PutValue(uint64_t v, int num_bits); + + // Writes v to the next aligned byte using num_bits. If T is larger than num_bits, the + // extra high-order bits will be ignored. + template + void PutAligned(T v, int num_bits); + + // Write a Vlq encoded int to the buffer. The value is written byte aligned. + // For more details on vlq: en.wikipedia.org/wiki/Variable-length_quantity + void PutVlqInt(int32_t v); + + // Get the index to the next aligned byte and advance the underlying buffer by num_bytes. + size_t GetByteIndexAndAdvance(int num_bytes) { + uint8_t* ptr = GetNextBytePtr(num_bytes); + return ptr - buffer_->data(); + } + + // Get a pointer to the next aligned byte and advance the underlying buffer by num_bytes. + uint8_t* GetNextBytePtr(int num_bytes); + + // Flushes all buffered values to the buffer. Call this when done writing to the buffer. + // If 'align' is true, buffered_values_ is reset and any future writes will be written + // to the next byte boundary. + void Flush(bool align = false); + + private: + // Bit-packed values are initially written to this variable before being memcpy'd to + // buffer_. This is faster than writing values byte by byte directly to buffer_. + uint64_t buffered_values_; + + faststring *buffer_; + int byte_offset_; // Offset in buffer_ + int bit_offset_; // Offset in buffered_values_ +}; + +// Utility class to read bit/byte stream. This class can read bits or bytes +// that are either byte aligned or not. It also has utilities to read multiple +// bytes in one read (e.g. encoded int). +class BitReader { + public: + // 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. + BitReader(const uint8_t* buffer, int buffer_len); + + BitReader() : buffer_(NULL), max_bytes_(0) {} + + // Gets the next value from the buffer. Returns true if 'v' could be read or false if + // there are not enough bytes left. num_bits must be <= 32. + template + bool GetValue(int num_bits, T* v); + + // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a + // little-endian native type and big enough to store 'num_bytes'. The value is assumed + // to be byte-aligned so the stream will be advanced to the start of the next byte + // before 'v' is read. Returns false if there are not enough bytes left. + template + bool GetAligned(int num_bytes, T* v); + + // Reads a vlq encoded int from the stream. The encoded int must start at the + // beginning of a byte. Return false if there were not enough bytes in the buffer. + bool GetVlqInt(int32_t* v); + + // Returns the number of bytes left in the stream, not including the current byte (i.e., + // there may be an additional fraction of a byte). + int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } + + // Current position in the stream, by bit. + int position() const { return byte_offset_ * 8 + bit_offset_; } + + // Rewind the stream by 'num_bits' bits + void Rewind(int num_bits); + + // Seek to a specific bit in the buffer + void SeekToBit(uint stream_position); + + // Maximum byte length of a vlq encoded int + static const int MAX_VLQ_BYTE_LEN = 5; + + bool is_initialized() const { return buffer_ != NULL; } + + private: + // Used by SeekToBit() and GetValue() to fetch the + // the next word into buffer_. + void BufferValues(); + + const uint8_t* buffer_; + int max_bytes_; + + // Bytes are memcpy'd from buffer_ and values are read from this variable. This is + // faster than reading values byte by byte directly from buffer_. + uint64_t buffered_values_; + + int byte_offset_; // Offset in buffer_ + int bit_offset_; // Offset in buffered_values_ +}; + +} // namespace doris + diff --git a/be/src/util/bit_stream_utils.inline.h b/be/src/util/bit_stream_utils.inline.h index 1bbabd789b..deac875ce2 100644 --- a/be/src/util/bit_stream_utils.inline.h +++ b/be/src/util/bit_stream_utils.inline.h @@ -1,213 +1,213 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#ifndef IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H -#define IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H - -#include - -#include "glog/logging.h" -#include "util/bit_stream_utils.h" -#include "util/alignment.h" - -using doris::BitUtil; - -namespace doris { - -inline void BitWriter::PutValue(uint64_t v, int num_bits) { - DCHECK_LE(num_bits, 64); - // Truncate the higher-order bits. This is necessary to - // support signed values. - v &= ~0ULL >> (64 - num_bits); - - - buffered_values_ |= v << bit_offset_; - bit_offset_ += num_bits; - - if (PREDICT_FALSE(bit_offset_ >= 64)) { - // Flush buffered_values_ and write out bits of v that did not fit - buffer_->reserve(ALIGN_UP(byte_offset_ + 8, 8)); - buffer_->resize(byte_offset_ + 8); - DCHECK_LE(byte_offset_ + 8, buffer_->capacity()); - memcpy(buffer_->data() + byte_offset_, &buffered_values_, 8); - buffered_values_ = 0; - byte_offset_ += 8; - bit_offset_ -= 64; - buffered_values_ = BitUtil::ShiftRightZeroOnOverflow(v, (num_bits - bit_offset_)); - } - DCHECK_LT(bit_offset_, 64); -} - -inline void BitWriter::Flush(bool align) { - int num_bytes = BitUtil::Ceil(bit_offset_, 8); - buffer_->reserve(ALIGN_UP(byte_offset_ + num_bytes, 8)); - buffer_->resize(byte_offset_ + num_bytes); - DCHECK_LE(byte_offset_ + num_bytes, buffer_->capacity()); - memcpy(buffer_->data() + byte_offset_, &buffered_values_, num_bytes); - - if (align) { - buffered_values_ = 0; - byte_offset_ += num_bytes; - bit_offset_ = 0; - } -} - -inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) { - Flush(/* align */ true); - buffer_->reserve(ALIGN_UP(byte_offset_ + num_bytes, 8)); - buffer_->resize(byte_offset_ + num_bytes); - uint8_t* ptr = buffer_->data() + byte_offset_; - byte_offset_ += num_bytes; - DCHECK_LE(byte_offset_, buffer_->capacity()); - return ptr; -} - -template -inline void BitWriter::PutAligned(T val, int num_bytes) { - DCHECK_LE(num_bytes, sizeof(T)); - uint8_t* ptr = GetNextBytePtr(num_bytes); - memcpy(ptr, &val, num_bytes); -} - -inline void BitWriter::PutVlqInt(int32_t v) { - while ((v & 0xFFFFFF80) != 0L) { - PutAligned((v & 0x7F) | 0x80, 1); - v >>= 7; - } - PutAligned(v & 0x7F, 1); -} - - -inline BitReader::BitReader(const uint8_t* buffer, int buffer_len) - : buffer_(buffer), - max_bytes_(buffer_len), - buffered_values_(0), - byte_offset_(0), - bit_offset_(0) { - int num_bytes = std::min(8, max_bytes_); - memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); -} - -inline void BitReader::BufferValues() { - int bytes_remaining = max_bytes_ - byte_offset_; - if (PREDICT_TRUE(bytes_remaining >= 8)) { - memcpy(&buffered_values_, buffer_ + byte_offset_, 8); - } else { - memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); - } -} - -template -inline bool BitReader::GetValue(int num_bits, T* v) { - DCHECK_LE(num_bits, 64); - DCHECK_LE(num_bits, sizeof(T) * 8); - - if (PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false; - - *v = BitUtil::TrailingBits(buffered_values_, bit_offset_ + num_bits) >> bit_offset_; - - bit_offset_ += num_bits; - if (bit_offset_ >= 64) { - byte_offset_ += 8; - bit_offset_ -= 64; - BufferValues(); - // Read bits of v that crossed into new buffered_values_ - *v |= BitUtil::ShiftLeftZeroOnOverflow( - BitUtil::TrailingBits(buffered_values_, bit_offset_), - (num_bits - bit_offset_)); - } - DCHECK_LE(bit_offset_, 64); - return true; -} - -inline void BitReader::Rewind(int num_bits) { - bit_offset_ -= num_bits; - if (bit_offset_ >= 0) { - return; - } - while (bit_offset_ < 0) { - int seek_back = std::min(byte_offset_, 8); - byte_offset_ -= seek_back; - bit_offset_ += seek_back * 8; - } - // This should only be executed *if* rewinding by 'num_bits' - // make the existing buffered_values_ invalid - DCHECK_GE(byte_offset_, 0); // Check for underflow - memcpy(&buffered_values_, buffer_ + byte_offset_, 8); -} - -inline void BitReader::SeekToBit(uint stream_position) { - DCHECK_LE(stream_position, max_bytes_ * 8); - - int delta = static_cast(stream_position) - position(); - if (delta == 0) { - return; - } else if (delta < 0) { - Rewind(position() - stream_position); - } else { - bit_offset_ += delta; - while (bit_offset_ >= 64) { - byte_offset_ +=8; - bit_offset_ -= 64; - if (bit_offset_ < 64) { - // This should only be executed if seeking to - // 'stream_position' makes the existing buffered_values_ - // invalid. - BufferValues(); - } - } - } -} - -template -inline bool BitReader::GetAligned(int num_bytes, T* v) { - DCHECK_LE(num_bytes, sizeof(T)); - int bytes_read = BitUtil::Ceil(bit_offset_, 8); - if (PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false; - - // Advance byte_offset to next unread byte and read num_bytes - byte_offset_ += bytes_read; - memcpy(v, buffer_ + byte_offset_, num_bytes); - byte_offset_ += num_bytes; - - // Reset buffered_values_ - bit_offset_ = 0; - int bytes_remaining = max_bytes_ - byte_offset_; - if (PREDICT_TRUE(bytes_remaining >= 8)) { - memcpy(&buffered_values_, buffer_ + byte_offset_, 8); - } else { - memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); - } - return true; -} - -inline bool BitReader::GetVlqInt(int32_t* v) { - *v = 0; - int shift = 0; - int num_bytes = 0; - uint8_t byte = 0; - do { - if (!GetAligned(1, &byte)) return false; - *v |= (byte & 0x7F) << shift; - shift += 7; - DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN); - } while ((byte & 0x80) != 0); - return true; -} - -} // namespace doris - -#endif +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H +#define IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H + +#include + +#include "glog/logging.h" +#include "util/bit_stream_utils.h" +#include "util/alignment.h" + +using doris::BitUtil; + +namespace doris { + +inline void BitWriter::PutValue(uint64_t v, int num_bits) { + DCHECK_LE(num_bits, 64); + // Truncate the higher-order bits. This is necessary to + // support signed values. + v &= ~0ULL >> (64 - num_bits); + + + buffered_values_ |= v << bit_offset_; + bit_offset_ += num_bits; + + if (PREDICT_FALSE(bit_offset_ >= 64)) { + // Flush buffered_values_ and write out bits of v that did not fit + buffer_->reserve(ALIGN_UP(byte_offset_ + 8, 8)); + buffer_->resize(byte_offset_ + 8); + DCHECK_LE(byte_offset_ + 8, buffer_->capacity()); + memcpy(buffer_->data() + byte_offset_, &buffered_values_, 8); + buffered_values_ = 0; + byte_offset_ += 8; + bit_offset_ -= 64; + buffered_values_ = BitUtil::ShiftRightZeroOnOverflow(v, (num_bits - bit_offset_)); + } + DCHECK_LT(bit_offset_, 64); +} + +inline void BitWriter::Flush(bool align) { + int num_bytes = BitUtil::Ceil(bit_offset_, 8); + buffer_->reserve(ALIGN_UP(byte_offset_ + num_bytes, 8)); + buffer_->resize(byte_offset_ + num_bytes); + DCHECK_LE(byte_offset_ + num_bytes, buffer_->capacity()); + memcpy(buffer_->data() + byte_offset_, &buffered_values_, num_bytes); + + if (align) { + buffered_values_ = 0; + byte_offset_ += num_bytes; + bit_offset_ = 0; + } +} + +inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) { + Flush(/* align */ true); + buffer_->reserve(ALIGN_UP(byte_offset_ + num_bytes, 8)); + buffer_->resize(byte_offset_ + num_bytes); + uint8_t* ptr = buffer_->data() + byte_offset_; + byte_offset_ += num_bytes; + DCHECK_LE(byte_offset_, buffer_->capacity()); + return ptr; +} + +template +inline void BitWriter::PutAligned(T val, int num_bytes) { + DCHECK_LE(num_bytes, sizeof(T)); + uint8_t* ptr = GetNextBytePtr(num_bytes); + memcpy(ptr, &val, num_bytes); +} + +inline void BitWriter::PutVlqInt(int32_t v) { + while ((v & 0xFFFFFF80) != 0L) { + PutAligned((v & 0x7F) | 0x80, 1); + v >>= 7; + } + PutAligned(v & 0x7F, 1); +} + + +inline BitReader::BitReader(const uint8_t* buffer, int buffer_len) + : buffer_(buffer), + max_bytes_(buffer_len), + buffered_values_(0), + byte_offset_(0), + bit_offset_(0) { + int num_bytes = std::min(8, max_bytes_); + memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); +} + +inline void BitReader::BufferValues() { + int bytes_remaining = max_bytes_ - byte_offset_; + if (PREDICT_TRUE(bytes_remaining >= 8)) { + memcpy(&buffered_values_, buffer_ + byte_offset_, 8); + } else { + memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); + } +} + +template +inline bool BitReader::GetValue(int num_bits, T* v) { + DCHECK_LE(num_bits, 64); + DCHECK_LE(num_bits, sizeof(T) * 8); + + if (PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false; + + *v = BitUtil::TrailingBits(buffered_values_, bit_offset_ + num_bits) >> bit_offset_; + + bit_offset_ += num_bits; + if (bit_offset_ >= 64) { + byte_offset_ += 8; + bit_offset_ -= 64; + BufferValues(); + // Read bits of v that crossed into new buffered_values_ + *v |= BitUtil::ShiftLeftZeroOnOverflow( + BitUtil::TrailingBits(buffered_values_, bit_offset_), + (num_bits - bit_offset_)); + } + DCHECK_LE(bit_offset_, 64); + return true; +} + +inline void BitReader::Rewind(int num_bits) { + bit_offset_ -= num_bits; + if (bit_offset_ >= 0) { + return; + } + while (bit_offset_ < 0) { + int seek_back = std::min(byte_offset_, 8); + byte_offset_ -= seek_back; + bit_offset_ += seek_back * 8; + } + // This should only be executed *if* rewinding by 'num_bits' + // make the existing buffered_values_ invalid + DCHECK_GE(byte_offset_, 0); // Check for underflow + memcpy(&buffered_values_, buffer_ + byte_offset_, 8); +} + +inline void BitReader::SeekToBit(uint stream_position) { + DCHECK_LE(stream_position, max_bytes_ * 8); + + int delta = static_cast(stream_position) - position(); + if (delta == 0) { + return; + } else if (delta < 0) { + Rewind(position() - stream_position); + } else { + bit_offset_ += delta; + while (bit_offset_ >= 64) { + byte_offset_ +=8; + bit_offset_ -= 64; + if (bit_offset_ < 64) { + // This should only be executed if seeking to + // 'stream_position' makes the existing buffered_values_ + // invalid. + BufferValues(); + } + } + } +} + +template +inline bool BitReader::GetAligned(int num_bytes, T* v) { + DCHECK_LE(num_bytes, sizeof(T)); + int bytes_read = BitUtil::Ceil(bit_offset_, 8); + if (PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false; + + // Advance byte_offset to next unread byte and read num_bytes + byte_offset_ += bytes_read; + memcpy(v, buffer_ + byte_offset_, num_bytes); + byte_offset_ += num_bytes; + + // Reset buffered_values_ + bit_offset_ = 0; + int bytes_remaining = max_bytes_ - byte_offset_; + if (PREDICT_TRUE(bytes_remaining >= 8)) { + memcpy(&buffered_values_, buffer_ + byte_offset_, 8); + } else { + memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); + } + return true; +} + +inline bool BitReader::GetVlqInt(int32_t* v) { + *v = 0; + int shift = 0; + int num_bytes = 0; + uint8_t byte = 0; + do { + if (!GetAligned(1, &byte)) return false; + *v |= (byte & 0x7F) << shift; + shift += 7; + DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN); + } while ((byte & 0x80) != 0); + return true; +} + +} // namespace doris + +#endif diff --git a/be/src/util/faststring.cc b/be/src/util/faststring.cc index 30febe9705..49f868704c 100644 --- a/be/src/util/faststring.cc +++ b/be/src/util/faststring.cc @@ -1,72 +1,72 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "util/faststring.h" - -#include -#include - -namespace doris { - -void faststring::GrowByAtLeast(size_t count) { - // Not enough space, need to reserve more. - // Don't reserve exactly enough space for the new string -- that makes it - // too easy to write perf bugs where you get O(n^2) append. - // Instead, alwayhs expand by at least 50%. - - size_t to_reserve = len_ + count; - if (len_ + count < len_ * 3 / 2) { - to_reserve = len_ * 3 / 2; - } - GrowArray(to_reserve); -} - -void faststring::GrowArray(size_t newcapacity) { - DCHECK_GE(newcapacity, capacity_); - std::unique_ptr newdata(new uint8_t[newcapacity]); - if (len_ > 0) { - memcpy(&newdata[0], &data_[0], len_); - } - capacity_ = newcapacity; - if (data_ != initial_data_) { - delete[] data_; - } else { - ASAN_POISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); - } - - data_ = newdata.release(); - ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); -} - -void faststring::ShrinkToFitInternal() { - DCHECK_NE(data_, initial_data_); - if (len_ <= kInitialCapacity) { - ASAN_UNPOISON_MEMORY_REGION(initial_data_, len_); - memcpy(initial_data_, &data_[0], len_); - delete[] data_; - data_ = initial_data_; - capacity_ = kInitialCapacity; - } else { - std::unique_ptr newdata(new uint8_t[len_]); - memcpy(&newdata[0], &data_[0], len_); - delete[] data_; - data_ = newdata.release(); - capacity_ = len_; - } -} - -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/faststring.h" + +#include +#include + +namespace doris { + +void faststring::GrowByAtLeast(size_t count) { + // Not enough space, need to reserve more. + // Don't reserve exactly enough space for the new string -- that makes it + // too easy to write perf bugs where you get O(n^2) append. + // Instead, alwayhs expand by at least 50%. + + size_t to_reserve = len_ + count; + if (len_ + count < len_ * 3 / 2) { + to_reserve = len_ * 3 / 2; + } + GrowArray(to_reserve); +} + +void faststring::GrowArray(size_t newcapacity) { + DCHECK_GE(newcapacity, capacity_); + std::unique_ptr newdata(new uint8_t[newcapacity]); + if (len_ > 0) { + memcpy(&newdata[0], &data_[0], len_); + } + capacity_ = newcapacity; + if (data_ != initial_data_) { + delete[] data_; + } else { + ASAN_POISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); + } + + data_ = newdata.release(); + ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); +} + +void faststring::ShrinkToFitInternal() { + DCHECK_NE(data_, initial_data_); + if (len_ <= kInitialCapacity) { + ASAN_UNPOISON_MEMORY_REGION(initial_data_, len_); + memcpy(initial_data_, &data_[0], len_); + delete[] data_; + data_ = initial_data_; + capacity_ = kInitialCapacity; + } else { + std::unique_ptr newdata(new uint8_t[len_]); + memcpy(&newdata[0], &data_[0], len_); + delete[] data_; + data_ = newdata.release(); + capacity_ = len_; + } +} + +} // namespace doris diff --git a/be/src/util/faststring.h b/be/src/util/faststring.h index f3892f4170..98dfbb7828 100644 --- a/be/src/util/faststring.h +++ b/be/src/util/faststring.h @@ -1,257 +1,257 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "gutil/dynamic_annotations.h" -#include "gutil/macros.h" -#include "gutil/port.h" -#include "gutil/strings/fastmem.h" - -namespace doris { - -// A faststring is similar to a std::string, except that it is faster for many -// common use cases (in particular, resize() will fill with uninitialized data -// instead of memsetting to \0) -class faststring { - public: - enum { - kInitialCapacity = 32 - }; - - faststring() : - data_(initial_data_), - len_(0), - capacity_(kInitialCapacity) { - } - - // Construct a string with the given capacity, in bytes. - explicit faststring(size_t capacity) - : data_(initial_data_), - len_(0), - capacity_(kInitialCapacity) { - if (capacity > capacity_) { - data_ = new uint8_t[capacity]; - capacity_ = capacity; - } - ASAN_POISON_MEMORY_REGION(data_, capacity_); - } - - ~faststring() { - ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); - if (data_ != initial_data_) { - delete[] data_; - } - } - - // Reset the valid length of the string to 0. - // - // This does not free up any memory. The capacity of the string remains unchanged. - void clear() { - resize(0); - ASAN_POISON_MEMORY_REGION(data_, capacity_); - } - - // Resize the string to the given length. - // If the new length is larger than the old length, the capacity is expanded as necessary. - // - // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are - // not cleared. - void resize(size_t newsize) { - if (newsize > capacity_) { - reserve(newsize); - } - len_ = newsize; - ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); - ASAN_UNPOISON_MEMORY_REGION(data_, len_); - } - - // Releases the underlying array; after this, the buffer is left empty. - // - // NOTE: the data pointer returned by release() is not necessarily the pointer - uint8_t *release() { - uint8_t *ret = data_; - if (ret == initial_data_) { - ret = new uint8_t[len_]; - memcpy(ret, data_, len_); - } - len_ = 0; - capacity_ = kInitialCapacity; - data_ = initial_data_; - ASAN_POISON_MEMORY_REGION(data_, capacity_); - return ret; - } - - // Reserve space for the given total amount of data. If the current capacity is already - // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory). - // - // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory - // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning. - void reserve(size_t newcapacity) { - if (PREDICT_TRUE(newcapacity <= capacity_)) return; - GrowArray(newcapacity); - } - - // Append the given data to the string, resizing capacity as necessary. - void append(const void *src_v, size_t count) { - const uint8_t *src = reinterpret_cast(src_v); - EnsureRoomForAppend(count); - ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count); - - // appending short values is common enough that this - // actually helps, according to benchmarks. In theory - // memcpy_inlined should already be just as good, but this - // was ~20% faster for reading a large prefix-coded string file - // where each string was only a few chars different - if (count <= 4) { - uint8_t *p = &data_[len_]; - for (int i = 0; i < count; i++) { - *p++ = *src++; - } - } else { - strings::memcpy_inlined(&data_[len_], src, count); - } - len_ += count; - } - - // Append the given string to this string. - void append(const std::string &str) { - append(str.data(), str.size()); - } - - // Append the given character to this string. - void push_back(const char byte) { - EnsureRoomForAppend(1); - ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1); - data_[len_] = byte; - len_++; - } - - // Return the valid length of this string. - size_t length() const { - return len_; - } - - // Return the valid length of this string (identical to length()) - size_t size() const { - return len_; - } - - // Return the allocated capacity of this string. - size_t capacity() const { - return capacity_; - } - - // Return a pointer to the data in this string. Note that this pointer - // may be invalidated by any later non-const operation. - const uint8_t *data() const { - return &data_[0]; - } - - // Return a pointer to the data in this string. Note that this pointer - // may be invalidated by any later non-const operation. - uint8_t *data() { - return &data_[0]; - } - - // Return the given element of this string. Note that this does not perform - // any bounds checking. - const uint8_t &at(size_t i) const { - return data_[i]; - } - - // Return the given element of this string. Note that this does not perform - // any bounds checking. - const uint8_t &operator[](size_t i) const { - return data_[i]; - } - - // Return the given element of this string. Note that this does not perform - // any bounds checking. - uint8_t &operator[](size_t i) { - return data_[i]; - } - - // Reset the contents of this string by copying 'len' bytes from 'src'. - void assign_copy(const uint8_t *src, size_t len) { - // Reset length so that the first resize doesn't need to copy the current - // contents of the array. - len_ = 0; - resize(len); - memcpy(data(), src, len); - } - - // Reset the contents of this string by copying from the given std::string. - void assign_copy(const std::string &str) { - assign_copy(reinterpret_cast(str.c_str()), - str.size()); - } - - // Reallocates the internal storage to fit only the current data. - // - // This may revert to using internal storage if the current length is shorter than - // kInitialCapacity. Note that, in that case, after this call, capacity() will return - // a capacity larger than the data length. - // - // Any pointers within this instance are invalidated. - void shrink_to_fit() { - if (data_ == initial_data_ || capacity_ == len_) return; - ShrinkToFitInternal(); - } - - // Return a copy of this string as a std::string. - std::string ToString() const { - return std::string(reinterpret_cast(data()), - len_); - } - - private: - DISALLOW_COPY_AND_ASSIGN(faststring); - - // If necessary, expand the buffer to fit at least 'count' more bytes. - // If the array has to be grown, it is grown by at least 50%. - void EnsureRoomForAppend(size_t count) { - if (PREDICT_TRUE(len_ + count <= capacity_)) { - return; - } - - // Call the non-inline slow path - this reduces the number of instructions - // on the hot path. - GrowByAtLeast(count); - } - - // The slow path of MakeRoomFor. Grows the buffer by either - // 'count' bytes, or 50%, whichever is more. - void GrowByAtLeast(size_t count); - - // Grow the array to the given capacity, which must be more than - // the current capacity. - void GrowArray(size_t newcapacity); - - void ShrinkToFitInternal(); - - uint8_t* data_; - uint8_t initial_data_[kInitialCapacity]; - size_t len_; - size_t capacity_; -}; - -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "gutil/dynamic_annotations.h" +#include "gutil/macros.h" +#include "gutil/port.h" +#include "gutil/strings/fastmem.h" + +namespace doris { + +// A faststring is similar to a std::string, except that it is faster for many +// common use cases (in particular, resize() will fill with uninitialized data +// instead of memsetting to \0) +class faststring { + public: + enum { + kInitialCapacity = 32 + }; + + faststring() : + data_(initial_data_), + len_(0), + capacity_(kInitialCapacity) { + } + + // Construct a string with the given capacity, in bytes. + explicit faststring(size_t capacity) + : data_(initial_data_), + len_(0), + capacity_(kInitialCapacity) { + if (capacity > capacity_) { + data_ = new uint8_t[capacity]; + capacity_ = capacity; + } + ASAN_POISON_MEMORY_REGION(data_, capacity_); + } + + ~faststring() { + ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); + if (data_ != initial_data_) { + delete[] data_; + } + } + + // Reset the valid length of the string to 0. + // + // This does not free up any memory. The capacity of the string remains unchanged. + void clear() { + resize(0); + ASAN_POISON_MEMORY_REGION(data_, capacity_); + } + + // Resize the string to the given length. + // If the new length is larger than the old length, the capacity is expanded as necessary. + // + // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are + // not cleared. + void resize(size_t newsize) { + if (newsize > capacity_) { + reserve(newsize); + } + len_ = newsize; + ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); + ASAN_UNPOISON_MEMORY_REGION(data_, len_); + } + + // Releases the underlying array; after this, the buffer is left empty. + // + // NOTE: the data pointer returned by release() is not necessarily the pointer + uint8_t *release() { + uint8_t *ret = data_; + if (ret == initial_data_) { + ret = new uint8_t[len_]; + memcpy(ret, data_, len_); + } + len_ = 0; + capacity_ = kInitialCapacity; + data_ = initial_data_; + ASAN_POISON_MEMORY_REGION(data_, capacity_); + return ret; + } + + // Reserve space for the given total amount of data. If the current capacity is already + // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory). + // + // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory + // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning. + void reserve(size_t newcapacity) { + if (PREDICT_TRUE(newcapacity <= capacity_)) return; + GrowArray(newcapacity); + } + + // Append the given data to the string, resizing capacity as necessary. + void append(const void *src_v, size_t count) { + const uint8_t *src = reinterpret_cast(src_v); + EnsureRoomForAppend(count); + ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count); + + // appending short values is common enough that this + // actually helps, according to benchmarks. In theory + // memcpy_inlined should already be just as good, but this + // was ~20% faster for reading a large prefix-coded string file + // where each string was only a few chars different + if (count <= 4) { + uint8_t *p = &data_[len_]; + for (int i = 0; i < count; i++) { + *p++ = *src++; + } + } else { + strings::memcpy_inlined(&data_[len_], src, count); + } + len_ += count; + } + + // Append the given string to this string. + void append(const std::string &str) { + append(str.data(), str.size()); + } + + // Append the given character to this string. + void push_back(const char byte) { + EnsureRoomForAppend(1); + ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1); + data_[len_] = byte; + len_++; + } + + // Return the valid length of this string. + size_t length() const { + return len_; + } + + // Return the valid length of this string (identical to length()) + size_t size() const { + return len_; + } + + // Return the allocated capacity of this string. + size_t capacity() const { + return capacity_; + } + + // Return a pointer to the data in this string. Note that this pointer + // may be invalidated by any later non-const operation. + const uint8_t *data() const { + return &data_[0]; + } + + // Return a pointer to the data in this string. Note that this pointer + // may be invalidated by any later non-const operation. + uint8_t *data() { + return &data_[0]; + } + + // Return the given element of this string. Note that this does not perform + // any bounds checking. + const uint8_t &at(size_t i) const { + return data_[i]; + } + + // Return the given element of this string. Note that this does not perform + // any bounds checking. + const uint8_t &operator[](size_t i) const { + return data_[i]; + } + + // Return the given element of this string. Note that this does not perform + // any bounds checking. + uint8_t &operator[](size_t i) { + return data_[i]; + } + + // Reset the contents of this string by copying 'len' bytes from 'src'. + void assign_copy(const uint8_t *src, size_t len) { + // Reset length so that the first resize doesn't need to copy the current + // contents of the array. + len_ = 0; + resize(len); + memcpy(data(), src, len); + } + + // Reset the contents of this string by copying from the given std::string. + void assign_copy(const std::string &str) { + assign_copy(reinterpret_cast(str.c_str()), + str.size()); + } + + // Reallocates the internal storage to fit only the current data. + // + // This may revert to using internal storage if the current length is shorter than + // kInitialCapacity. Note that, in that case, after this call, capacity() will return + // a capacity larger than the data length. + // + // Any pointers within this instance are invalidated. + void shrink_to_fit() { + if (data_ == initial_data_ || capacity_ == len_) return; + ShrinkToFitInternal(); + } + + // Return a copy of this string as a std::string. + std::string ToString() const { + return std::string(reinterpret_cast(data()), + len_); + } + + private: + DISALLOW_COPY_AND_ASSIGN(faststring); + + // If necessary, expand the buffer to fit at least 'count' more bytes. + // If the array has to be grown, it is grown by at least 50%. + void EnsureRoomForAppend(size_t count) { + if (PREDICT_TRUE(len_ + count <= capacity_)) { + return; + } + + // Call the non-inline slow path - this reduces the number of instructions + // on the hot path. + GrowByAtLeast(count); + } + + // The slow path of MakeRoomFor. Grows the buffer by either + // 'count' bytes, or 50%, whichever is more. + void GrowByAtLeast(size_t count); + + // Grow the array to the given capacity, which must be more than + // the current capacity. + void GrowArray(size_t newcapacity); + + void ShrinkToFitInternal(); + + uint8_t* data_; + uint8_t initial_data_[kInitialCapacity]; + size_t len_; + size_t capacity_; +}; + +} // namespace doris diff --git a/be/src/util/rle_encoding.h b/be/src/util/rle_encoding.h index 8538bc6460..26b03e1b1f 100644 --- a/be/src/util/rle_encoding.h +++ b/be/src/util/rle_encoding.h @@ -1,521 +1,521 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#pragma once - -#include - -#include "gutil/port.h" -#include "util/bit_stream_utils.inline.h" -#include "util/bit_util.h" - -namespace doris { - -// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs -// are sufficiently long, RLE is used, otherwise, the values are just bit-packed -// (literal encoding). -// For both types of runs, there is a byte-aligned indicator which encodes the length -// of the run and the type of the run. -// This encoding has the benefit that when there aren't any long enough runs, values -// are always decoded at fixed (can be precomputed) bit offsets OR both the value and -// the run length are byte aligned. This allows for very efficient decoding -// implementations. -// The encoding is: -// encoded-block := run* -// run := literal-run | repeated-run -// literal-run := literal-indicator < literal bytes > -// repeated-run := repeated-indicator < repeated value. padded to byte boundary > -// literal-indicator := varint_encode( number_of_groups << 1 | 1) -// repeated-indicator := varint_encode( number_of_repetitions << 1 ) -// -// Each run is preceded by a varint. The varint's least significant bit is -// used to indicate whether the run is a literal run or a repeated run. The rest -// of the varint is used to determine the length of the run (eg how many times the -// value repeats). -// -// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode -// in groups of 8), so that no matter the bit-width of the value, the sequence will end -// on a byte boundary without padding. -// Given that we know it is a multiple of 8, we store the number of 8-groups rather than -// the actual number of encoded ints. (This means that the total number of encoded values -// can not be determined from the encoded data, since the number of values in the last -// group may not be a multiple of 8). -// There is a break-even point when it is more storage efficient to do run length -// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes -// for both the repeated encoding or the literal encoding. This value can always -// be computed based on the bit-width. -// TODO: think about how to use this for strings. The bit packing isn't quite the same. -// -// Examples with bit-width 1 (eg encoding booleans): -// ---------------------------------------- -// 100 1s followed by 100 0s: -// <1, padded to 1 byte> <0, padded to 1 byte> -// - (total 4 bytes) -// -// alternating 1s and 0s (200 total): -// 200 ints = 25 groups of 8 -// <25 bytes of values, bitpacked> -// (total 26 bytes, 1 byte overhead) -// - -// Decoder class for RLE encoded data. -// -// NOTE: the encoded format does not have any length prefix or any other way of -// indicating that the encoded sequence ends at a certain point, so the Decoder -// methods may return some extra bits at the end before the read methods start -// to return 0/false. -template -class RleDecoder { - public: - // Create a decoder object. buffer/buffer_len is the decoded data. - // bit_width is the width of each value (before encoding). - RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) - : bit_reader_(buffer, buffer_len), - bit_width_(bit_width), - current_value_(0), - repeat_count_(0), - literal_count_(0), - rewind_state_(CANT_REWIND) { - DCHECK_GE(bit_width_, 1); - DCHECK_LE(bit_width_, 64); - } - - RleDecoder() {} - - // Skip n values, and returns the number of non-zero entries skipped. - size_t Skip(size_t to_skip); - - // Gets the next value. Returns false if there are no more. - bool Get(T* val); - - // Seek to the previous value. - void RewindOne(); - - // Gets the next run of the same 'val'. Returns 0 if there is no - // more data to be decoded. Will return a run of at most 'max_run' - // values. If there are more values than this, the next call to - // GetNextRun will return more from the same run. - size_t GetNextRun(T* val, size_t max_run); - - private: - bool ReadHeader(); - - enum RewindState { - REWIND_LITERAL, - REWIND_RUN, - CANT_REWIND - }; - - BitReader bit_reader_; - int bit_width_; - uint64_t current_value_; - uint32_t repeat_count_; - uint32_t literal_count_; - RewindState rewind_state_; -}; - -// Class to incrementally build the rle data. -// The encoding has two modes: encoding repeated runs and literal runs. -// If the run is sufficiently short, it is more efficient to encode as a literal run. -// This class does so by buffering 8 values at a time. If they are not all the same -// they are added to the literal run. If they are the same, they are added to the -// repeated run. When we switch modes, the previous run is flushed out. -template -class RleEncoder { - public: - // buffer: buffer to write bits to. - // bit_width: max number of bits for value. - // TODO: consider adding a min_repeated_run_length so the caller can control - // when values should be encoded as repeated runs. Currently this is derived - // based on the bit_width, which can determine a storage optimal choice. - explicit RleEncoder(faststring *buffer, int bit_width) - : bit_width_(bit_width), - bit_writer_(buffer) { - DCHECK_GE(bit_width_, 1); - DCHECK_LE(bit_width_, 64); - Clear(); - } - - // Reserve 'num_bytes' bytes for a plain encoded header, set each - // byte with 'val': this is used for the RLE-encoded data blocks in - // order to be able to able to store the initial ordinal position - // and number of elements. This is a part of RleEncoder in order to - // maintain the correct offset in 'buffer'. - void Reserve(int num_bytes, uint8_t val); - - // Encode value. This value must be representable with bit_width_ bits. - void Put(T value, size_t run_length = 1); - - // Flushes any pending values to the underlying buffer. - // Returns the total number of bytes written - int Flush(); - - // Resets all the state in the encoder. - void Clear(); - - int32_t len() const { return bit_writer_.bytes_written(); } - - private: - // Flushes any buffered values. If this is part of a repeated run, this is largely - // a no-op. - // If it is part of a literal run, this will call FlushLiteralRun, which writes - // out the buffered literal values. - // If 'done' is true, the current run would be written even if it would normally - // have been buffered more. This should only be called at the end, when the - // encoder has received all values even if it would normally continue to be - // buffered. - void FlushBufferedValues(bool done); - - // Flushes literal values to the underlying buffer. If update_indicator_byte, - // then the current literal run is complete and the indicator byte is updated. - void FlushLiteralRun(bool update_indicator_byte); - - // Flushes a repeated run to the underlying buffer. - void FlushRepeatedRun(); - - // Number of bits needed to encode the value. - const int bit_width_; - - // Underlying buffer. - BitWriter bit_writer_; - - // We need to buffer at most 8 values for literals. This happens when the - // bit_width is 1 (so 8 values fit in one byte). - // TODO: generalize this to other bit widths - uint64_t buffered_values_[8]; - - // Number of values in buffered_values_ - int num_buffered_values_; - - // The current (also last) value that was written and the count of how - // many times in a row that value has been seen. This is maintained even - // if we are in a literal run. If the repeat_count_ get high enough, we switch - // to encoding repeated runs. - uint64_t current_value_; - int repeat_count_; - - // Number of literals in the current run. This does not include the literals - // that might be in buffered_values_. Only after we've got a group big enough - // can we decide if they should part of the literal_count_ or repeat_count_ - int literal_count_; - - // Index of a byte in the underlying buffer that stores the indicator byte. - // This is reserved as soon as we need a literal run but the value is written - // when the literal run is complete. We maintain an index rather than a pointer - // into the underlying buffer because the pointer value may become invalid if - // the underlying buffer is resized. - int literal_indicator_byte_idx_; -}; - -template -inline bool RleDecoder::ReadHeader() { - DCHECK(bit_reader_.is_initialized()); - if (PREDICT_FALSE(literal_count_ == 0 && repeat_count_ == 0)) { - // Read the next run's indicator int, it could be a literal or repeated run - // The int is encoded as a vlq-encoded value. - int32_t indicator_value = 0; - bool result = bit_reader_.GetVlqInt(&indicator_value); - if (PREDICT_FALSE(!result)) { - return false; - } - - // lsb indicates if it is a literal run or repeated run - bool is_literal = indicator_value & 1; - if (is_literal) { - literal_count_ = (indicator_value >> 1) * 8; - DCHECK_GT(literal_count_, 0); - } else { - repeat_count_ = indicator_value >> 1; - DCHECK_GT(repeat_count_, 0); - bool result = bit_reader_.GetAligned( - BitUtil::Ceil(bit_width_, 8), reinterpret_cast(¤t_value_)); - DCHECK(result); - } - } - return true; -} - -template -inline bool RleDecoder::Get(T* val) { - DCHECK(bit_reader_.is_initialized()); - if (PREDICT_FALSE(!ReadHeader())) { - return false; - } - - if (PREDICT_TRUE(repeat_count_ > 0)) { - *val = current_value_; - --repeat_count_; - rewind_state_ = REWIND_RUN; - } else { - DCHECK(literal_count_ > 0); - bool result = bit_reader_.GetValue(bit_width_, val); - DCHECK(result); - --literal_count_; - rewind_state_ = REWIND_LITERAL; - } - - return true; -} - -template -inline void RleDecoder::RewindOne() { - DCHECK(bit_reader_.is_initialized()); - - switch (rewind_state_) { - case CANT_REWIND: - LOG(FATAL) << "Can't rewind more than once after each read!"; - break; - case REWIND_RUN: - ++repeat_count_; - break; - case REWIND_LITERAL: - { - bit_reader_.Rewind(bit_width_); - ++literal_count_; - break; - } - } - - rewind_state_ = CANT_REWIND; -} - -template -inline size_t RleDecoder::GetNextRun(T* val, size_t max_run) { - DCHECK(bit_reader_.is_initialized()); - DCHECK_GT(max_run, 0); - size_t ret = 0; - size_t rem = max_run; - while (ReadHeader()) { - if (PREDICT_TRUE(repeat_count_ > 0)) { - if (PREDICT_FALSE(ret > 0 && *val != current_value_)) { - return ret; - } - *val = current_value_; - if (repeat_count_ >= rem) { - // The next run is longer than the amount of remaining data - // that the caller wants to read. Only consume it partially. - repeat_count_ -= rem; - ret += rem; - return ret; - } - ret += repeat_count_; - rem -= repeat_count_; - repeat_count_ = 0; - } else { - DCHECK(literal_count_ > 0); - if (ret == 0) { - bool has_more = bit_reader_.GetValue(bit_width_, val); - DCHECK(has_more); - literal_count_--; - ret++; - rem--; - } - - while (literal_count_ > 0) { - bool result = bit_reader_.GetValue(bit_width_, ¤t_value_); - DCHECK(result); - if (current_value_ != *val || rem == 0) { - bit_reader_.Rewind(bit_width_); - return ret; - } - ret++; - rem--; - literal_count_--; - } - } - } - return ret; - } - -template -inline size_t RleDecoder::Skip(size_t to_skip) { - DCHECK(bit_reader_.is_initialized()); - - size_t set_count = 0; - while (to_skip > 0) { - bool result = ReadHeader(); - DCHECK(result); - - if (PREDICT_TRUE(repeat_count_ > 0)) { - size_t nskip = (repeat_count_ < to_skip) ? repeat_count_ : to_skip; - repeat_count_ -= nskip; - to_skip -= nskip; - if (current_value_ != 0) { - set_count += nskip; - } - } else { - DCHECK(literal_count_ > 0); - size_t nskip = (literal_count_ < to_skip) ? literal_count_ : to_skip; - literal_count_ -= nskip; - to_skip -= nskip; - for (; nskip > 0; nskip--) { - T value = 0; - bool result = bit_reader_.GetValue(bit_width_, &value); - DCHECK(result); - if (value != 0) { - set_count++; - } - } - } - } - return set_count; -} - -// This function buffers input values 8 at a time. After seeing all 8 values, -// it decides whether they should be encoded as a literal or repeated run. -template -inline void RleEncoder::Put(T value, size_t run_length) { - DCHECK(bit_width_ == 64 || value < (1LL << bit_width_)); - - // TODO(perf): remove the loop and use the repeat_count_ - for (; run_length > 0; run_length--) { - if (PREDICT_TRUE(current_value_ == value)) { - ++repeat_count_; - if (repeat_count_ > 8) { - // This is just a continuation of the current run, no need to buffer the - // values. - // Note that this is the fast path for long repeated runs. - continue; - } - } else { - if (repeat_count_ >= 8) { - // We had a run that was long enough but it has ended. Flush the - // current repeated run. - DCHECK_EQ(literal_count_, 0); - FlushRepeatedRun(); - } - repeat_count_ = 1; - current_value_ = value; - } - - buffered_values_[num_buffered_values_] = value; - if (++num_buffered_values_ == 8) { - DCHECK_EQ(literal_count_ % 8, 0); - FlushBufferedValues(false); - } - } -} - -template -inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { - if (literal_indicator_byte_idx_ < 0) { - // The literal indicator byte has not been reserved yet, get one now. - literal_indicator_byte_idx_ = bit_writer_.GetByteIndexAndAdvance(1); - DCHECK_GE(literal_indicator_byte_idx_, 0); - } - - // Write all the buffered values as bit packed literals - for (int i = 0; i < num_buffered_values_; ++i) { - bit_writer_.PutValue(buffered_values_[i], bit_width_); - } - num_buffered_values_ = 0; - - if (update_indicator_byte) { - // At this point we need to write the indicator byte for the literal run. - // We only reserve one byte, to allow for streaming writes of literal values. - // The logic makes sure we flush literal runs often enough to not overrun - // the 1 byte. - int num_groups = BitUtil::Ceil(literal_count_, 8); - int32_t indicator_value = (num_groups << 1) | 1; - DCHECK_EQ(indicator_value & 0xFFFFFF00, 0); - bit_writer_.buffer()->data()[literal_indicator_byte_idx_] = indicator_value; - literal_indicator_byte_idx_ = -1; - literal_count_ = 0; - } -} - -template -inline void RleEncoder::FlushRepeatedRun() { - DCHECK_GT(repeat_count_, 0); - // The lsb of 0 indicates this is a repeated run - int32_t indicator_value = repeat_count_ << 1 | 0; - bit_writer_.PutVlqInt(indicator_value); - bit_writer_.PutAligned(current_value_, BitUtil::Ceil(bit_width_, 8)); - num_buffered_values_ = 0; - repeat_count_ = 0; -} - -// Flush the values that have been buffered. At this point we decide whether -// we need to switch between the run types or continue the current one. -template -inline void RleEncoder::FlushBufferedValues(bool done) { - if (repeat_count_ >= 8) { - // Clear the buffered values. They are part of the repeated run now and we - // don't want to flush them out as literals. - num_buffered_values_ = 0; - if (literal_count_ != 0) { - // There was a current literal run. All the values in it have been flushed - // but we still need to update the indicator byte. - DCHECK_EQ(literal_count_ % 8, 0); - DCHECK_EQ(repeat_count_, 8); - FlushLiteralRun(true); - } - DCHECK_EQ(literal_count_, 0); - return; - } - - literal_count_ += num_buffered_values_; - int num_groups = BitUtil::Ceil(literal_count_, 8); - if (num_groups + 1 >= (1 << 6)) { - // We need to start a new literal run because the indicator byte we've reserved - // cannot store more values. - DCHECK_GE(literal_indicator_byte_idx_, 0); - FlushLiteralRun(true); - } else { - FlushLiteralRun(done); - } - repeat_count_ = 0; -} - -template -inline void RleEncoder::Reserve(int num_bytes, uint8_t val) { - for (int i = 0; i < num_bytes; ++i) { - bit_writer_.PutValue(val, 8); - } -} - -template -inline int RleEncoder::Flush() { - if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { - bool all_repeat = literal_count_ == 0 && - (repeat_count_ == num_buffered_values_ || num_buffered_values_ == 0); - // There is something pending, figure out if it's a repeated or literal run - if (repeat_count_ > 0 && all_repeat) { - FlushRepeatedRun(); - } else { - literal_count_ += num_buffered_values_; - FlushLiteralRun(true); - repeat_count_ = 0; - } - } - bit_writer_.Flush(); - DCHECK_EQ(num_buffered_values_, 0); - DCHECK_EQ(literal_count_, 0); - DCHECK_EQ(repeat_count_, 0); - return bit_writer_.bytes_written(); -} - -template -inline void RleEncoder::Clear() { - current_value_ = 0; - repeat_count_ = 0; - num_buffered_values_ = 0; - literal_count_ = 0; - literal_indicator_byte_idx_ = -1; - bit_writer_.Clear(); -} - -} // namespace doris +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include + +#include "gutil/port.h" +#include "util/bit_stream_utils.inline.h" +#include "util/bit_util.h" + +namespace doris { + +// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs +// are sufficiently long, RLE is used, otherwise, the values are just bit-packed +// (literal encoding). +// For both types of runs, there is a byte-aligned indicator which encodes the length +// of the run and the type of the run. +// This encoding has the benefit that when there aren't any long enough runs, values +// are always decoded at fixed (can be precomputed) bit offsets OR both the value and +// the run length are byte aligned. This allows for very efficient decoding +// implementations. +// The encoding is: +// encoded-block := run* +// run := literal-run | repeated-run +// literal-run := literal-indicator < literal bytes > +// repeated-run := repeated-indicator < repeated value. padded to byte boundary > +// literal-indicator := varint_encode( number_of_groups << 1 | 1) +// repeated-indicator := varint_encode( number_of_repetitions << 1 ) +// +// Each run is preceded by a varint. The varint's least significant bit is +// used to indicate whether the run is a literal run or a repeated run. The rest +// of the varint is used to determine the length of the run (eg how many times the +// value repeats). +// +// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode +// in groups of 8), so that no matter the bit-width of the value, the sequence will end +// on a byte boundary without padding. +// Given that we know it is a multiple of 8, we store the number of 8-groups rather than +// the actual number of encoded ints. (This means that the total number of encoded values +// can not be determined from the encoded data, since the number of values in the last +// group may not be a multiple of 8). +// There is a break-even point when it is more storage efficient to do run length +// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes +// for both the repeated encoding or the literal encoding. This value can always +// be computed based on the bit-width. +// TODO: think about how to use this for strings. The bit packing isn't quite the same. +// +// Examples with bit-width 1 (eg encoding booleans): +// ---------------------------------------- +// 100 1s followed by 100 0s: +// <1, padded to 1 byte> <0, padded to 1 byte> +// - (total 4 bytes) +// +// alternating 1s and 0s (200 total): +// 200 ints = 25 groups of 8 +// <25 bytes of values, bitpacked> +// (total 26 bytes, 1 byte overhead) +// + +// Decoder class for RLE encoded data. +// +// NOTE: the encoded format does not have any length prefix or any other way of +// indicating that the encoded sequence ends at a certain point, so the Decoder +// methods may return some extra bits at the end before the read methods start +// to return 0/false. +template +class RleDecoder { + public: + // Create a decoder object. buffer/buffer_len is the decoded data. + // bit_width is the width of each value (before encoding). + RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) + : bit_reader_(buffer, buffer_len), + bit_width_(bit_width), + current_value_(0), + repeat_count_(0), + literal_count_(0), + rewind_state_(CANT_REWIND) { + DCHECK_GE(bit_width_, 1); + DCHECK_LE(bit_width_, 64); + } + + RleDecoder() {} + + // Skip n values, and returns the number of non-zero entries skipped. + size_t Skip(size_t to_skip); + + // Gets the next value. Returns false if there are no more. + bool Get(T* val); + + // Seek to the previous value. + void RewindOne(); + + // Gets the next run of the same 'val'. Returns 0 if there is no + // more data to be decoded. Will return a run of at most 'max_run' + // values. If there are more values than this, the next call to + // GetNextRun will return more from the same run. + size_t GetNextRun(T* val, size_t max_run); + + private: + bool ReadHeader(); + + enum RewindState { + REWIND_LITERAL, + REWIND_RUN, + CANT_REWIND + }; + + BitReader bit_reader_; + int bit_width_; + uint64_t current_value_; + uint32_t repeat_count_; + uint32_t literal_count_; + RewindState rewind_state_; +}; + +// Class to incrementally build the rle data. +// The encoding has two modes: encoding repeated runs and literal runs. +// If the run is sufficiently short, it is more efficient to encode as a literal run. +// This class does so by buffering 8 values at a time. If they are not all the same +// they are added to the literal run. If they are the same, they are added to the +// repeated run. When we switch modes, the previous run is flushed out. +template +class RleEncoder { + public: + // buffer: buffer to write bits to. + // bit_width: max number of bits for value. + // TODO: consider adding a min_repeated_run_length so the caller can control + // when values should be encoded as repeated runs. Currently this is derived + // based on the bit_width, which can determine a storage optimal choice. + explicit RleEncoder(faststring *buffer, int bit_width) + : bit_width_(bit_width), + bit_writer_(buffer) { + DCHECK_GE(bit_width_, 1); + DCHECK_LE(bit_width_, 64); + Clear(); + } + + // Reserve 'num_bytes' bytes for a plain encoded header, set each + // byte with 'val': this is used for the RLE-encoded data blocks in + // order to be able to able to store the initial ordinal position + // and number of elements. This is a part of RleEncoder in order to + // maintain the correct offset in 'buffer'. + void Reserve(int num_bytes, uint8_t val); + + // Encode value. This value must be representable with bit_width_ bits. + void Put(T value, size_t run_length = 1); + + // Flushes any pending values to the underlying buffer. + // Returns the total number of bytes written + int Flush(); + + // Resets all the state in the encoder. + void Clear(); + + int32_t len() const { return bit_writer_.bytes_written(); } + + private: + // Flushes any buffered values. If this is part of a repeated run, this is largely + // a no-op. + // If it is part of a literal run, this will call FlushLiteralRun, which writes + // out the buffered literal values. + // If 'done' is true, the current run would be written even if it would normally + // have been buffered more. This should only be called at the end, when the + // encoder has received all values even if it would normally continue to be + // buffered. + void FlushBufferedValues(bool done); + + // Flushes literal values to the underlying buffer. If update_indicator_byte, + // then the current literal run is complete and the indicator byte is updated. + void FlushLiteralRun(bool update_indicator_byte); + + // Flushes a repeated run to the underlying buffer. + void FlushRepeatedRun(); + + // Number of bits needed to encode the value. + const int bit_width_; + + // Underlying buffer. + BitWriter bit_writer_; + + // We need to buffer at most 8 values for literals. This happens when the + // bit_width is 1 (so 8 values fit in one byte). + // TODO: generalize this to other bit widths + uint64_t buffered_values_[8]; + + // Number of values in buffered_values_ + int num_buffered_values_; + + // The current (also last) value that was written and the count of how + // many times in a row that value has been seen. This is maintained even + // if we are in a literal run. If the repeat_count_ get high enough, we switch + // to encoding repeated runs. + uint64_t current_value_; + int repeat_count_; + + // Number of literals in the current run. This does not include the literals + // that might be in buffered_values_. Only after we've got a group big enough + // can we decide if they should part of the literal_count_ or repeat_count_ + int literal_count_; + + // Index of a byte in the underlying buffer that stores the indicator byte. + // This is reserved as soon as we need a literal run but the value is written + // when the literal run is complete. We maintain an index rather than a pointer + // into the underlying buffer because the pointer value may become invalid if + // the underlying buffer is resized. + int literal_indicator_byte_idx_; +}; + +template +inline bool RleDecoder::ReadHeader() { + DCHECK(bit_reader_.is_initialized()); + if (PREDICT_FALSE(literal_count_ == 0 && repeat_count_ == 0)) { + // Read the next run's indicator int, it could be a literal or repeated run + // The int is encoded as a vlq-encoded value. + int32_t indicator_value = 0; + bool result = bit_reader_.GetVlqInt(&indicator_value); + if (PREDICT_FALSE(!result)) { + return false; + } + + // lsb indicates if it is a literal run or repeated run + bool is_literal = indicator_value & 1; + if (is_literal) { + literal_count_ = (indicator_value >> 1) * 8; + DCHECK_GT(literal_count_, 0); + } else { + repeat_count_ = indicator_value >> 1; + DCHECK_GT(repeat_count_, 0); + bool result = bit_reader_.GetAligned( + BitUtil::Ceil(bit_width_, 8), reinterpret_cast(¤t_value_)); + DCHECK(result); + } + } + return true; +} + +template +inline bool RleDecoder::Get(T* val) { + DCHECK(bit_reader_.is_initialized()); + if (PREDICT_FALSE(!ReadHeader())) { + return false; + } + + if (PREDICT_TRUE(repeat_count_ > 0)) { + *val = current_value_; + --repeat_count_; + rewind_state_ = REWIND_RUN; + } else { + DCHECK(literal_count_ > 0); + bool result = bit_reader_.GetValue(bit_width_, val); + DCHECK(result); + --literal_count_; + rewind_state_ = REWIND_LITERAL; + } + + return true; +} + +template +inline void RleDecoder::RewindOne() { + DCHECK(bit_reader_.is_initialized()); + + switch (rewind_state_) { + case CANT_REWIND: + LOG(FATAL) << "Can't rewind more than once after each read!"; + break; + case REWIND_RUN: + ++repeat_count_; + break; + case REWIND_LITERAL: + { + bit_reader_.Rewind(bit_width_); + ++literal_count_; + break; + } + } + + rewind_state_ = CANT_REWIND; +} + +template +inline size_t RleDecoder::GetNextRun(T* val, size_t max_run) { + DCHECK(bit_reader_.is_initialized()); + DCHECK_GT(max_run, 0); + size_t ret = 0; + size_t rem = max_run; + while (ReadHeader()) { + if (PREDICT_TRUE(repeat_count_ > 0)) { + if (PREDICT_FALSE(ret > 0 && *val != current_value_)) { + return ret; + } + *val = current_value_; + if (repeat_count_ >= rem) { + // The next run is longer than the amount of remaining data + // that the caller wants to read. Only consume it partially. + repeat_count_ -= rem; + ret += rem; + return ret; + } + ret += repeat_count_; + rem -= repeat_count_; + repeat_count_ = 0; + } else { + DCHECK(literal_count_ > 0); + if (ret == 0) { + bool has_more = bit_reader_.GetValue(bit_width_, val); + DCHECK(has_more); + literal_count_--; + ret++; + rem--; + } + + while (literal_count_ > 0) { + bool result = bit_reader_.GetValue(bit_width_, ¤t_value_); + DCHECK(result); + if (current_value_ != *val || rem == 0) { + bit_reader_.Rewind(bit_width_); + return ret; + } + ret++; + rem--; + literal_count_--; + } + } + } + return ret; + } + +template +inline size_t RleDecoder::Skip(size_t to_skip) { + DCHECK(bit_reader_.is_initialized()); + + size_t set_count = 0; + while (to_skip > 0) { + bool result = ReadHeader(); + DCHECK(result); + + if (PREDICT_TRUE(repeat_count_ > 0)) { + size_t nskip = (repeat_count_ < to_skip) ? repeat_count_ : to_skip; + repeat_count_ -= nskip; + to_skip -= nskip; + if (current_value_ != 0) { + set_count += nskip; + } + } else { + DCHECK(literal_count_ > 0); + size_t nskip = (literal_count_ < to_skip) ? literal_count_ : to_skip; + literal_count_ -= nskip; + to_skip -= nskip; + for (; nskip > 0; nskip--) { + T value = 0; + bool result = bit_reader_.GetValue(bit_width_, &value); + DCHECK(result); + if (value != 0) { + set_count++; + } + } + } + } + return set_count; +} + +// This function buffers input values 8 at a time. After seeing all 8 values, +// it decides whether they should be encoded as a literal or repeated run. +template +inline void RleEncoder::Put(T value, size_t run_length) { + DCHECK(bit_width_ == 64 || value < (1LL << bit_width_)); + + // TODO(perf): remove the loop and use the repeat_count_ + for (; run_length > 0; run_length--) { + if (PREDICT_TRUE(current_value_ == value)) { + ++repeat_count_; + if (repeat_count_ > 8) { + // This is just a continuation of the current run, no need to buffer the + // values. + // Note that this is the fast path for long repeated runs. + continue; + } + } else { + if (repeat_count_ >= 8) { + // We had a run that was long enough but it has ended. Flush the + // current repeated run. + DCHECK_EQ(literal_count_, 0); + FlushRepeatedRun(); + } + repeat_count_ = 1; + current_value_ = value; + } + + buffered_values_[num_buffered_values_] = value; + if (++num_buffered_values_ == 8) { + DCHECK_EQ(literal_count_ % 8, 0); + FlushBufferedValues(false); + } + } +} + +template +inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { + if (literal_indicator_byte_idx_ < 0) { + // The literal indicator byte has not been reserved yet, get one now. + literal_indicator_byte_idx_ = bit_writer_.GetByteIndexAndAdvance(1); + DCHECK_GE(literal_indicator_byte_idx_, 0); + } + + // Write all the buffered values as bit packed literals + for (int i = 0; i < num_buffered_values_; ++i) { + bit_writer_.PutValue(buffered_values_[i], bit_width_); + } + num_buffered_values_ = 0; + + if (update_indicator_byte) { + // At this point we need to write the indicator byte for the literal run. + // We only reserve one byte, to allow for streaming writes of literal values. + // The logic makes sure we flush literal runs often enough to not overrun + // the 1 byte. + int num_groups = BitUtil::Ceil(literal_count_, 8); + int32_t indicator_value = (num_groups << 1) | 1; + DCHECK_EQ(indicator_value & 0xFFFFFF00, 0); + bit_writer_.buffer()->data()[literal_indicator_byte_idx_] = indicator_value; + literal_indicator_byte_idx_ = -1; + literal_count_ = 0; + } +} + +template +inline void RleEncoder::FlushRepeatedRun() { + DCHECK_GT(repeat_count_, 0); + // The lsb of 0 indicates this is a repeated run + int32_t indicator_value = repeat_count_ << 1 | 0; + bit_writer_.PutVlqInt(indicator_value); + bit_writer_.PutAligned(current_value_, BitUtil::Ceil(bit_width_, 8)); + num_buffered_values_ = 0; + repeat_count_ = 0; +} + +// Flush the values that have been buffered. At this point we decide whether +// we need to switch between the run types or continue the current one. +template +inline void RleEncoder::FlushBufferedValues(bool done) { + if (repeat_count_ >= 8) { + // Clear the buffered values. They are part of the repeated run now and we + // don't want to flush them out as literals. + num_buffered_values_ = 0; + if (literal_count_ != 0) { + // There was a current literal run. All the values in it have been flushed + // but we still need to update the indicator byte. + DCHECK_EQ(literal_count_ % 8, 0); + DCHECK_EQ(repeat_count_, 8); + FlushLiteralRun(true); + } + DCHECK_EQ(literal_count_, 0); + return; + } + + literal_count_ += num_buffered_values_; + int num_groups = BitUtil::Ceil(literal_count_, 8); + if (num_groups + 1 >= (1 << 6)) { + // We need to start a new literal run because the indicator byte we've reserved + // cannot store more values. + DCHECK_GE(literal_indicator_byte_idx_, 0); + FlushLiteralRun(true); + } else { + FlushLiteralRun(done); + } + repeat_count_ = 0; +} + +template +inline void RleEncoder::Reserve(int num_bytes, uint8_t val) { + for (int i = 0; i < num_bytes; ++i) { + bit_writer_.PutValue(val, 8); + } +} + +template +inline int RleEncoder::Flush() { + if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { + bool all_repeat = literal_count_ == 0 && + (repeat_count_ == num_buffered_values_ || num_buffered_values_ == 0); + // There is something pending, figure out if it's a repeated or literal run + if (repeat_count_ > 0 && all_repeat) { + FlushRepeatedRun(); + } else { + literal_count_ += num_buffered_values_; + FlushLiteralRun(true); + repeat_count_ = 0; + } + } + bit_writer_.Flush(); + DCHECK_EQ(num_buffered_values_, 0); + DCHECK_EQ(literal_count_, 0); + DCHECK_EQ(repeat_count_, 0); + return bit_writer_.bytes_written(); +} + +template +inline void RleEncoder::Clear() { + current_value_ = 0; + repeat_count_ = 0; + num_buffered_values_ = 0; + literal_count_ = 0; + literal_indicator_byte_idx_ = -1; + bit_writer_.Clear(); +} + +} // namespace doris diff --git a/be/test/exec/es_scan_node_test.cpp b/be/test/exec/es_scan_node_test.cpp index 77f2cb7cf6..0f6eab51ab 100644 --- a/be/test/exec/es_scan_node_test.cpp +++ b/be/test/exec/es_scan_node_test.cpp @@ -1,154 +1,154 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include - -#include "common/object_pool.h" -#include "exec/es_scan_node.h" -#include "gen_cpp/PlanNodes_types.h" -#include "runtime/mem_pool.h" -#include "runtime/descriptors.h" -#include "runtime/runtime_state.h" -#include "runtime/row_batch.h" -#include "runtime/string_value.h" -#include "runtime/tuple_row.h" -#include "util/runtime_profile.h" -#include "util/debug_util.h" - -using std::vector; - -namespace doris { - -// mock -class EsScanNodeTest : public testing::Test { -public: - EsScanNodeTest() : _runtime_state(TQueryGlobals()) { - _runtime_state._instance_mem_tracker.reset(new MemTracker()); - TDescriptorTable t_desc_table; - - // table descriptors - TTableDescriptor t_table_desc; - - t_table_desc.id = 0; - t_table_desc.tableType = TTableType::ES_TABLE; - t_table_desc.numCols = 0; - t_table_desc.numClusteringCols = 0; - t_table_desc.__isset.esTable = true; - t_desc_table.tableDescriptors.push_back(t_table_desc); - t_desc_table.__isset.tableDescriptors = true; - // TSlotDescriptor - int offset = 1; - int i = 0; - // id - { - TSlotDescriptor t_slot_desc; - t_slot_desc.__set_slotType(TypeDescriptor(TYPE_INT).to_thrift()); - t_slot_desc.__set_columnPos(i); - t_slot_desc.__set_byteOffset(offset); - t_slot_desc.__set_nullIndicatorByte(0); - t_slot_desc.__set_nullIndicatorBit(-1); - t_slot_desc.__set_slotIdx(i); - t_slot_desc.__set_isMaterialized(true); - t_desc_table.slotDescriptors.push_back(t_slot_desc); - offset += sizeof(int); - } - - TTupleDescriptor t_tuple_desc; - t_tuple_desc.id = 0; - t_tuple_desc.byteSize = offset; - t_tuple_desc.numNullBytes = 1; - t_tuple_desc.tableId = 0; - t_tuple_desc.__isset.tableId = true; - t_desc_table.__isset.slotDescriptors = true; - t_desc_table.tupleDescriptors.push_back(t_tuple_desc); - - DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); - _runtime_state.set_desc_tbl(_desc_tbl); - - // Node Id - _tnode.node_id = 0; - _tnode.node_type = TPlanNodeType::SCHEMA_SCAN_NODE; - _tnode.num_children = 0; - _tnode.limit = -1; - _tnode.row_tuples.push_back(0); - _tnode.nullable_tuples.push_back(false); - _tnode.es_scan_node.tuple_id = 0; - std::map properties; - _tnode.es_scan_node.__set_properties(properties); - _tnode.__isset.es_scan_node = true; - } - -protected: - virtual void SetUp() { - } - virtual void TearDown() { - } - TPlanNode _tnode; - ObjectPool _obj_pool; - DescriptorTbl* _desc_tbl; - RuntimeState _runtime_state; -}; - - -TEST_F(EsScanNodeTest, normal_use) { - EsScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); - Status status = scan_node.prepare(&_runtime_state); - ASSERT_TRUE(status.ok()); - TEsScanRange es_scan_range; - es_scan_range.__set_index("index1"); - es_scan_range.__set_type("docs"); - es_scan_range.__set_shard_id(0); - TNetworkAddress es_host; - es_host.__set_hostname("host"); - es_host.__set_port(8200); - std::vector es_hosts; - es_hosts.push_back(es_host); - es_scan_range.__set_es_hosts(es_hosts); - TScanRange scan_range; - scan_range.__set_es_scan_range(es_scan_range); - TScanRangeParams scan_range_params; - scan_range_params.__set_scan_range(scan_range); - std::vector scan_ranges; - scan_ranges.push_back(scan_range_params); - - status = scan_node.set_scan_ranges(scan_ranges); - ASSERT_TRUE(status.ok()); - std::stringstream out; - scan_node.debug_string(1, &out); - LOG(WARNING) << out.str(); - - status = scan_node.open(&_runtime_state); - ASSERT_TRUE(status.ok()); - RowBatch row_batch(scan_node._row_descriptor, _runtime_state.batch_size(), new MemTracker(-1)); - bool eos = false; - status = scan_node.get_next(&_runtime_state, &row_batch, &eos); - ASSERT_TRUE(status.ok()); - ASSERT_EQ(2, row_batch.num_rows()); - ASSERT_TRUE(eos); - - status = scan_node.close(&_runtime_state); - ASSERT_TRUE(status.ok()); -} - -} - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "common/object_pool.h" +#include "exec/es_scan_node.h" +#include "gen_cpp/PlanNodes_types.h" +#include "runtime/mem_pool.h" +#include "runtime/descriptors.h" +#include "runtime/runtime_state.h" +#include "runtime/row_batch.h" +#include "runtime/string_value.h" +#include "runtime/tuple_row.h" +#include "util/runtime_profile.h" +#include "util/debug_util.h" + +using std::vector; + +namespace doris { + +// mock +class EsScanNodeTest : public testing::Test { +public: + EsScanNodeTest() : _runtime_state(TQueryGlobals()) { + _runtime_state._instance_mem_tracker.reset(new MemTracker()); + TDescriptorTable t_desc_table; + + // table descriptors + TTableDescriptor t_table_desc; + + t_table_desc.id = 0; + t_table_desc.tableType = TTableType::ES_TABLE; + t_table_desc.numCols = 0; + t_table_desc.numClusteringCols = 0; + t_table_desc.__isset.esTable = true; + t_desc_table.tableDescriptors.push_back(t_table_desc); + t_desc_table.__isset.tableDescriptors = true; + // TSlotDescriptor + int offset = 1; + int i = 0; + // id + { + TSlotDescriptor t_slot_desc; + t_slot_desc.__set_slotType(TypeDescriptor(TYPE_INT).to_thrift()); + t_slot_desc.__set_columnPos(i); + t_slot_desc.__set_byteOffset(offset); + t_slot_desc.__set_nullIndicatorByte(0); + t_slot_desc.__set_nullIndicatorBit(-1); + t_slot_desc.__set_slotIdx(i); + t_slot_desc.__set_isMaterialized(true); + t_desc_table.slotDescriptors.push_back(t_slot_desc); + offset += sizeof(int); + } + + TTupleDescriptor t_tuple_desc; + t_tuple_desc.id = 0; + t_tuple_desc.byteSize = offset; + t_tuple_desc.numNullBytes = 1; + t_tuple_desc.tableId = 0; + t_tuple_desc.__isset.tableId = true; + t_desc_table.__isset.slotDescriptors = true; + t_desc_table.tupleDescriptors.push_back(t_tuple_desc); + + DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); + _runtime_state.set_desc_tbl(_desc_tbl); + + // Node Id + _tnode.node_id = 0; + _tnode.node_type = TPlanNodeType::SCHEMA_SCAN_NODE; + _tnode.num_children = 0; + _tnode.limit = -1; + _tnode.row_tuples.push_back(0); + _tnode.nullable_tuples.push_back(false); + _tnode.es_scan_node.tuple_id = 0; + std::map properties; + _tnode.es_scan_node.__set_properties(properties); + _tnode.__isset.es_scan_node = true; + } + +protected: + virtual void SetUp() { + } + virtual void TearDown() { + } + TPlanNode _tnode; + ObjectPool _obj_pool; + DescriptorTbl* _desc_tbl; + RuntimeState _runtime_state; +}; + + +TEST_F(EsScanNodeTest, normal_use) { + EsScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); + Status status = scan_node.prepare(&_runtime_state); + ASSERT_TRUE(status.ok()); + TEsScanRange es_scan_range; + es_scan_range.__set_index("index1"); + es_scan_range.__set_type("docs"); + es_scan_range.__set_shard_id(0); + TNetworkAddress es_host; + es_host.__set_hostname("host"); + es_host.__set_port(8200); + std::vector es_hosts; + es_hosts.push_back(es_host); + es_scan_range.__set_es_hosts(es_hosts); + TScanRange scan_range; + scan_range.__set_es_scan_range(es_scan_range); + TScanRangeParams scan_range_params; + scan_range_params.__set_scan_range(scan_range); + std::vector scan_ranges; + scan_ranges.push_back(scan_range_params); + + status = scan_node.set_scan_ranges(scan_ranges); + ASSERT_TRUE(status.ok()); + std::stringstream out; + scan_node.debug_string(1, &out); + LOG(WARNING) << out.str(); + + status = scan_node.open(&_runtime_state); + ASSERT_TRUE(status.ok()); + RowBatch row_batch(scan_node._row_descriptor, _runtime_state.batch_size(), new MemTracker(-1)); + bool eos = false; + status = scan_node.get_next(&_runtime_state, &row_batch, &eos); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(2, row_batch.num_rows()); + ASSERT_TRUE(eos); + + status = scan_node.close(&_runtime_state); + ASSERT_TRUE(status.ok()); +} + +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp b/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp index f2f4d9383e..c99347119f 100644 --- a/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp @@ -1,229 +1,229 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include - -#include "olap/rowset/segment_v2/options.h" -#include "olap/rowset/segment_v2/page_builder.h" -#include "olap/rowset/segment_v2/page_decoder.h" -#include "olap/rowset/segment_v2/bitshuffle_page.h" -#include "util/arena.h" -#include "util/logging.h" - -using doris::segment_v2::PageBuilderOptions; - -namespace doris { - -class BitShufflePageTest : public testing::Test { -public: - virtual ~BitShufflePageTest() {} - - template - void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) { - Arena arena; - uint8_t null_bitmap = 0; - ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, &arena); - ColumnBlockView column_block_view(&block); - - size_t n = 1; - decoder->_copy_next_values(n, column_block_view.data()); - ASSERT_EQ(1, n); - } - - template - void test_encode_decode_page_template(typename TypeTraits::CppType* src, - size_t size) { - typedef typename TypeTraits::CppType CppType; - PageBuilderOptions options; - options.data_page_size = 256 * 1024; - PageBuilderType page_builder(options); - - page_builder.add(reinterpret_cast(src), &size); - Slice s = page_builder.finish(); - LOG(INFO) << "RLE Encoded size for 10k values: " << s.size - << ", original size:" << size * sizeof(CppType); - - segment_v2::PageDecoderOptions decoder_options; - PageDecoderType page_decoder(s, decoder_options); - Status status = page_decoder.init(); - ASSERT_TRUE(status.ok()); - ASSERT_EQ(0, page_decoder.current_index()); - - Arena arena; - - CppType* values = reinterpret_cast(arena.Allocate(size * sizeof(CppType))); - uint8_t* null_bitmap = reinterpret_cast(arena.Allocate(BitmapSize(size))); - ColumnBlock block(get_type_info(Type), (uint8_t*)values, null_bitmap, &arena); - ColumnBlockView column_block_view(&block); - - status = page_decoder.next_batch(&size, &column_block_view); - ASSERT_TRUE(status.ok()); - - CppType* decoded = (CppType*)values; - for (uint i = 0; i < size; i++) { - if (src[i] != decoded[i]) { - FAIL() << "Fail at index " << i << - " inserted=" << src[i] << " got=" << decoded[i]; - } - } - - // Test Seek within block by ordinal - for (int i = 0; i < 100; i++) { - int seek_off = random() % size; - page_decoder.seek_to_position_in_page(seek_off); - EXPECT_EQ((int32_t )(seek_off), page_decoder.current_index()); - CppType ret; - copy_one(&page_decoder, &ret); - EXPECT_EQ(decoded[seek_off], ret); - } - } -}; - -// Test for bitshuffle block, for INT32, INT64, FLOAT, DOUBLE -TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = random(); - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleInt64BlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int64_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = random(); - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleFloatBlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr floats(new float[size]); - for (int i = 0; i < size; i++) { - floats.get()[i] = random() + static_cast(random())/INT_MAX; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(floats.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr doubles(new double[size]); - for (int i = 0; i < size; i++) { - doubles.get()[i] = random() + static_cast(random())/INT_MAX; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(doubles.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderEqual) { - const uint32_t size = 10000; - - std::unique_ptr doubles(new double[size]); - for (int i = 0; i < size; i++) { - doubles.get()[i] = 19880217.19890323; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(doubles.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderSequence) { - const uint32_t size = 10000; - - double base = 19880217.19890323; - double delta = 13.14; - std::unique_ptr doubles(new double[size]); - for (int i = 0; i < size; i++) { - base = base + delta; - doubles.get()[i] = base; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(doubles.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderEqual) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = 12345; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderMaxNumberEqual) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = 1234567890; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderSequence) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - int32_t number = 0; - for (int i = 0; i < size; i++) { - ints.get()[i] = ++number; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderMaxNumberSequence) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - int32_t number = 0; - for (int i = 0; i < size; i++) { - ints.get()[i] = 1234567890 + number; - ++number; - } - - test_encode_decode_page_template, - segment_v2::BitShufflePageDecoder >(ints.get(), size); -} - -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/page_builder.h" +#include "olap/rowset/segment_v2/page_decoder.h" +#include "olap/rowset/segment_v2/bitshuffle_page.h" +#include "util/arena.h" +#include "util/logging.h" + +using doris::segment_v2::PageBuilderOptions; + +namespace doris { + +class BitShufflePageTest : public testing::Test { +public: + virtual ~BitShufflePageTest() {} + + template + void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) { + Arena arena; + uint8_t null_bitmap = 0; + ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, &arena); + ColumnBlockView column_block_view(&block); + + size_t n = 1; + decoder->_copy_next_values(n, column_block_view.data()); + ASSERT_EQ(1, n); + } + + template + void test_encode_decode_page_template(typename TypeTraits::CppType* src, + size_t size) { + typedef typename TypeTraits::CppType CppType; + PageBuilderOptions options; + options.data_page_size = 256 * 1024; + PageBuilderType page_builder(options); + + page_builder.add(reinterpret_cast(src), &size); + Slice s = page_builder.finish(); + LOG(INFO) << "RLE Encoded size for 10k values: " << s.size + << ", original size:" << size * sizeof(CppType); + + segment_v2::PageDecoderOptions decoder_options; + PageDecoderType page_decoder(s, decoder_options); + Status status = page_decoder.init(); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(0, page_decoder.current_index()); + + Arena arena; + + CppType* values = reinterpret_cast(arena.Allocate(size * sizeof(CppType))); + uint8_t* null_bitmap = reinterpret_cast(arena.Allocate(BitmapSize(size))); + ColumnBlock block(get_type_info(Type), (uint8_t*)values, null_bitmap, &arena); + ColumnBlockView column_block_view(&block); + + status = page_decoder.next_batch(&size, &column_block_view); + ASSERT_TRUE(status.ok()); + + CppType* decoded = (CppType*)values; + for (uint i = 0; i < size; i++) { + if (src[i] != decoded[i]) { + FAIL() << "Fail at index " << i << + " inserted=" << src[i] << " got=" << decoded[i]; + } + } + + // Test Seek within block by ordinal + for (int i = 0; i < 100; i++) { + int seek_off = random() % size; + page_decoder.seek_to_position_in_page(seek_off); + EXPECT_EQ((int32_t )(seek_off), page_decoder.current_index()); + CppType ret; + copy_one(&page_decoder, &ret); + EXPECT_EQ(decoded[seek_off], ret); + } + } +}; + +// Test for bitshuffle block, for INT32, INT64, FLOAT, DOUBLE +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = random(); + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt64BlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int64_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = random(); + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleFloatBlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr floats(new float[size]); + for (int i = 0; i < size; i++) { + floats.get()[i] = random() + static_cast(random())/INT_MAX; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(floats.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr doubles(new double[size]); + for (int i = 0; i < size; i++) { + doubles.get()[i] = random() + static_cast(random())/INT_MAX; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(doubles.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderEqual) { + const uint32_t size = 10000; + + std::unique_ptr doubles(new double[size]); + for (int i = 0; i < size; i++) { + doubles.get()[i] = 19880217.19890323; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(doubles.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleDoubleBlockEncoderSequence) { + const uint32_t size = 10000; + + double base = 19880217.19890323; + double delta = 13.14; + std::unique_ptr doubles(new double[size]); + for (int i = 0; i < size; i++) { + base = base + delta; + doubles.get()[i] = base; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(doubles.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderEqual) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 12345; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderMaxNumberEqual) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 1234567890; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderSequence) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + int32_t number = 0; + for (int i = 0; i < size; i++) { + ints.get()[i] = ++number; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +TEST_F(BitShufflePageTest, TestBitShuffleInt32BlockEncoderMaxNumberSequence) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + int32_t number = 0; + for (int i = 0; i < size; i++) { + ints.get()[i] = 1234567890 + number; + ++number; + } + + test_encode_decode_page_template, + segment_v2::BitShufflePageDecoder >(ints.get(), size); +} + +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/olap/rowset/segment_v2/rle_page_test.cpp b/be/test/olap/rowset/segment_v2/rle_page_test.cpp index e30e45df5e..97015950ab 100644 --- a/be/test/olap/rowset/segment_v2/rle_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/rle_page_test.cpp @@ -1,193 +1,193 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include - -#include "olap/rowset/segment_v2/options.h" -#include "olap/rowset/segment_v2/page_builder.h" -#include "olap/rowset/segment_v2/page_decoder.h" -#include "olap/rowset/segment_v2/rle_page.h" -#include "util/arena.h" -#include "util/logging.h" - -using doris::segment_v2::PageBuilderOptions; -using doris::segment_v2::PageDecoderOptions; - -namespace doris { - -class RlePageTest : public testing::Test { -public: - virtual ~RlePageTest() { } - - template - void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) { - Arena arena; - uint8_t null_bitmap = 0; - ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, &arena); - ColumnBlockView column_block_view(&block); - - size_t n = 1; - decoder->next_batch(&n, &column_block_view); - ASSERT_EQ(1, n); - } - - template - void test_encode_decode_page_template(typename TypeTraits::CppType* src, - size_t size) { - typedef typename TypeTraits::CppType CppType; - PageBuilderOptions builder_options; - builder_options.data_page_size = 256 * 1024; - PageBuilderType rle_page_builder(builder_options); - rle_page_builder.add(reinterpret_cast(src), &size); - Slice s = rle_page_builder.finish(); - ASSERT_EQ(size, rle_page_builder.count()); - LOG(INFO) << "RLE Encoded size for 10k values: " << s.size - << ", original size:" << size * sizeof(CppType); - - PageDecoderOptions decodeder_options; - PageDecoderType rle_page_decoder(s, decodeder_options); - Status status = rle_page_decoder.init(); - ASSERT_TRUE(status.ok()); - ASSERT_EQ(0, rle_page_decoder.current_index()); - ASSERT_EQ(size, rle_page_decoder.count()); - - Arena arena; - - CppType* values = reinterpret_cast(arena.Allocate(size * sizeof(CppType))); - uint8_t* null_bitmap = reinterpret_cast(arena.Allocate(BitmapSize(size))); - ColumnBlock block(get_type_info(Type), (uint8_t*)values, null_bitmap, &arena); - ColumnBlockView column_block_view(&block); - size_t size_to_fetch = size; - status = rle_page_decoder.next_batch(&size_to_fetch, &column_block_view); - ASSERT_TRUE(status.ok()); - ASSERT_EQ(size, size_to_fetch); - - for (uint i = 0; i < size; i++) { - if (src[i] != values[i]) { - FAIL() << "Fail at index " << i << - " inserted=" << src[i] << " got=" << values[i]; - } - } - - // Test Seek within block by ordinal - for (int i = 0; i < 100; i++) { - int seek_off = random() % size; - rle_page_decoder.seek_to_position_in_page(seek_off); - EXPECT_EQ((int32_t )(seek_off), rle_page_decoder.current_index()); - CppType ret; - copy_one(&rle_page_decoder, &ret); - EXPECT_EQ(values[seek_off], ret); - } - } -}; - -// Test for rle block, for INT32, BOOL -TEST_F(RlePageTest, TestRleInt32BlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = random(); - } - - test_encode_decode_page_template, - segment_v2::RlePageDecoder >(ints.get(), size); -} - -TEST_F(RlePageTest, TestRleInt32BlockEncoderEqual) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = 12345; - } - - test_encode_decode_page_template, - segment_v2::RlePageDecoder >(ints.get(), size); -} - -TEST_F(RlePageTest, TestRleInt32BlockEncoderSequence) { - const uint32_t size = 10000; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = 12345 + i; - } - - test_encode_decode_page_template, - segment_v2::RlePageDecoder >(ints.get(), size); -} - -TEST_F(RlePageTest, TestRleInt32BlockEncoderSize) { - size_t size = 100; - - std::unique_ptr ints(new int32_t[size]); - for (int i = 0; i < size; i++) { - ints.get()[i] = 0; - } - PageBuilderOptions builder_options; - builder_options.data_page_size = 256 * 1024; - segment_v2::RlePageBuilder rle_page_builder(builder_options); - rle_page_builder.add(reinterpret_cast(ints.get()), &size); - Slice s = rle_page_builder.finish(); - // 4 bytes header - // 2 bytes indicate_value(): 0x64 << 1 | 1 = 201 - // 4 bytes values - ASSERT_EQ(10, s.size); -} - -TEST_F(RlePageTest, TestRleBoolBlockEncoderRandom) { - const uint32_t size = 10000; - - std::unique_ptr bools(new bool[size]); - for (int i = 0; i < size; i++) { - if (random() % 2 == 0) { - bools.get()[i] = true; - } else { - bools.get()[i] = false; - } - } - - test_encode_decode_page_template, - segment_v2::RlePageDecoder >(bools.get(), size); -} - -TEST_F(RlePageTest, TestRleBoolBlockEncoderSize) { - size_t size = 100; - - std::unique_ptr bools(new bool[size]); - for (int i = 0; i < size; i++) { - bools.get()[i] = true; - } - PageBuilderOptions builder_options; - builder_options.data_page_size = 256 * 1024; - segment_v2::RlePageBuilder rle_page_builder(builder_options); - rle_page_builder.add(reinterpret_cast(bools.get()), &size); - Slice s = rle_page_builder.finish(); - // 4 bytes header - // 2 bytes indicate_value(): 0x64 << 1 | 1 = 201 - // 1 bytes values - ASSERT_EQ(7, s.size); -} - -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/page_builder.h" +#include "olap/rowset/segment_v2/page_decoder.h" +#include "olap/rowset/segment_v2/rle_page.h" +#include "util/arena.h" +#include "util/logging.h" + +using doris::segment_v2::PageBuilderOptions; +using doris::segment_v2::PageDecoderOptions; + +namespace doris { + +class RlePageTest : public testing::Test { +public: + virtual ~RlePageTest() { } + + template + void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) { + Arena arena; + uint8_t null_bitmap = 0; + ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, &arena); + ColumnBlockView column_block_view(&block); + + size_t n = 1; + decoder->next_batch(&n, &column_block_view); + ASSERT_EQ(1, n); + } + + template + void test_encode_decode_page_template(typename TypeTraits::CppType* src, + size_t size) { + typedef typename TypeTraits::CppType CppType; + PageBuilderOptions builder_options; + builder_options.data_page_size = 256 * 1024; + PageBuilderType rle_page_builder(builder_options); + rle_page_builder.add(reinterpret_cast(src), &size); + Slice s = rle_page_builder.finish(); + ASSERT_EQ(size, rle_page_builder.count()); + LOG(INFO) << "RLE Encoded size for 10k values: " << s.size + << ", original size:" << size * sizeof(CppType); + + PageDecoderOptions decodeder_options; + PageDecoderType rle_page_decoder(s, decodeder_options); + Status status = rle_page_decoder.init(); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(0, rle_page_decoder.current_index()); + ASSERT_EQ(size, rle_page_decoder.count()); + + Arena arena; + + CppType* values = reinterpret_cast(arena.Allocate(size * sizeof(CppType))); + uint8_t* null_bitmap = reinterpret_cast(arena.Allocate(BitmapSize(size))); + ColumnBlock block(get_type_info(Type), (uint8_t*)values, null_bitmap, &arena); + ColumnBlockView column_block_view(&block); + size_t size_to_fetch = size; + status = rle_page_decoder.next_batch(&size_to_fetch, &column_block_view); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(size, size_to_fetch); + + for (uint i = 0; i < size; i++) { + if (src[i] != values[i]) { + FAIL() << "Fail at index " << i << + " inserted=" << src[i] << " got=" << values[i]; + } + } + + // Test Seek within block by ordinal + for (int i = 0; i < 100; i++) { + int seek_off = random() % size; + rle_page_decoder.seek_to_position_in_page(seek_off); + EXPECT_EQ((int32_t )(seek_off), rle_page_decoder.current_index()); + CppType ret; + copy_one(&rle_page_decoder, &ret); + EXPECT_EQ(values[seek_off], ret); + } + } +}; + +// Test for rle block, for INT32, BOOL +TEST_F(RlePageTest, TestRleInt32BlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = random(); + } + + test_encode_decode_page_template, + segment_v2::RlePageDecoder >(ints.get(), size); +} + +TEST_F(RlePageTest, TestRleInt32BlockEncoderEqual) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 12345; + } + + test_encode_decode_page_template, + segment_v2::RlePageDecoder >(ints.get(), size); +} + +TEST_F(RlePageTest, TestRleInt32BlockEncoderSequence) { + const uint32_t size = 10000; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 12345 + i; + } + + test_encode_decode_page_template, + segment_v2::RlePageDecoder >(ints.get(), size); +} + +TEST_F(RlePageTest, TestRleInt32BlockEncoderSize) { + size_t size = 100; + + std::unique_ptr ints(new int32_t[size]); + for (int i = 0; i < size; i++) { + ints.get()[i] = 0; + } + PageBuilderOptions builder_options; + builder_options.data_page_size = 256 * 1024; + segment_v2::RlePageBuilder rle_page_builder(builder_options); + rle_page_builder.add(reinterpret_cast(ints.get()), &size); + Slice s = rle_page_builder.finish(); + // 4 bytes header + // 2 bytes indicate_value(): 0x64 << 1 | 1 = 201 + // 4 bytes values + ASSERT_EQ(10, s.size); +} + +TEST_F(RlePageTest, TestRleBoolBlockEncoderRandom) { + const uint32_t size = 10000; + + std::unique_ptr bools(new bool[size]); + for (int i = 0; i < size; i++) { + if (random() % 2 == 0) { + bools.get()[i] = true; + } else { + bools.get()[i] = false; + } + } + + test_encode_decode_page_template, + segment_v2::RlePageDecoder >(bools.get(), size); +} + +TEST_F(RlePageTest, TestRleBoolBlockEncoderSize) { + size_t size = 100; + + std::unique_ptr bools(new bool[size]); + for (int i = 0; i < size; i++) { + bools.get()[i] = true; + } + PageBuilderOptions builder_options; + builder_options.data_page_size = 256 * 1024; + segment_v2::RlePageBuilder rle_page_builder(builder_options); + rle_page_builder.add(reinterpret_cast(bools.get()), &size); + Slice s = rle_page_builder.finish(); + // 4 bytes header + // 2 bytes indicate_value(): 0x64 << 1 | 1 = 201 + // 1 bytes values + ASSERT_EQ(7, s.size); +} + +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/util/faststring_test.cpp b/be/test/util/faststring_test.cpp index 2a6120f3fd..68231c0dc9 100644 --- a/be/test/util/faststring_test.cpp +++ b/be/test/util/faststring_test.cpp @@ -1,83 +1,83 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include - -#include -#include - -#include "util/faststring.h" -#include "util/random.h" - -namespace doris { -class FaststringTest : public ::testing::Test {}; - -void RandomString(void* dest, size_t n, doris::Random* rng) { - size_t i = 0; - uint32_t random = rng->Next(); - char* cdest = static_cast(dest); - static const size_t sz = sizeof(random); - if (n >= sz) { - for (i = 0; i <= n - sz; i += sz) { - memcpy(&cdest[i], &random, sizeof(random)); - random = rng->Next(); - } - } - memcpy(cdest + i, &random, n - i); -} - -TEST_F(FaststringTest, TestShrinkToFit_Empty) { - faststring s; - s.shrink_to_fit(); - ASSERT_EQ(faststring::kInitialCapacity, s.capacity()); -} - -// Test that, if the string contents is shorter than the initial capacity -// of the faststring, shrink_to_fit() leaves the string in the built-in -// array. -TEST_F(FaststringTest, TestShrinkToFit_SmallerThanInitialCapacity) { - faststring s; - s.append("hello"); - s.shrink_to_fit(); - ASSERT_EQ(faststring::kInitialCapacity, s.capacity()); -} - -TEST_F(FaststringTest, TestShrinkToFit_Random) { - doris::Random r(time(nullptr)); - int kMaxSize = faststring::kInitialCapacity * 2; - std::unique_ptr random_bytes(new char[kMaxSize]); - RandomString(random_bytes.get(), kMaxSize, &r); - - faststring s; - for (int i = 0; i < 100; i++) { - int new_size = r.Uniform(kMaxSize); - s.resize(new_size); - memcpy(s.data(), random_bytes.get(), new_size); - s.shrink_to_fit(); - ASSERT_EQ(0, memcmp(s.data(), random_bytes.get(), new_size)); - ASSERT_EQ(std::max(faststring::kInitialCapacity, new_size), s.capacity()); - } -} - -} // namespace doris - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include +#include + +#include "util/faststring.h" +#include "util/random.h" + +namespace doris { +class FaststringTest : public ::testing::Test {}; + +void RandomString(void* dest, size_t n, doris::Random* rng) { + size_t i = 0; + uint32_t random = rng->Next(); + char* cdest = static_cast(dest); + static const size_t sz = sizeof(random); + if (n >= sz) { + for (i = 0; i <= n - sz; i += sz) { + memcpy(&cdest[i], &random, sizeof(random)); + random = rng->Next(); + } + } + memcpy(cdest + i, &random, n - i); +} + +TEST_F(FaststringTest, TestShrinkToFit_Empty) { + faststring s; + s.shrink_to_fit(); + ASSERT_EQ(faststring::kInitialCapacity, s.capacity()); +} + +// Test that, if the string contents is shorter than the initial capacity +// of the faststring, shrink_to_fit() leaves the string in the built-in +// array. +TEST_F(FaststringTest, TestShrinkToFit_SmallerThanInitialCapacity) { + faststring s; + s.append("hello"); + s.shrink_to_fit(); + ASSERT_EQ(faststring::kInitialCapacity, s.capacity()); +} + +TEST_F(FaststringTest, TestShrinkToFit_Random) { + doris::Random r(time(nullptr)); + int kMaxSize = faststring::kInitialCapacity * 2; + std::unique_ptr random_bytes(new char[kMaxSize]); + RandomString(random_bytes.get(), kMaxSize, &r); + + faststring s; + for (int i = 0; i < 100; i++) { + int new_size = r.Uniform(kMaxSize); + s.resize(new_size); + memcpy(s.data(), random_bytes.get(), new_size); + s.shrink_to_fit(); + ASSERT_EQ(0, memcmp(s.data(), random_bytes.get(), new_size)); + ASSERT_EQ(std::max(faststring::kInitialCapacity, new_size), s.capacity()); + } +} + +} // namespace doris + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/util/rle_encoding_test.cpp b/be/test/util/rle_encoding_test.cpp index 50c92c707d..8c8491ca44 100644 --- a/be/test/util/rle_encoding_test.cpp +++ b/be/test/util/rle_encoding_test.cpp @@ -1,426 +1,426 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include -#include -#include -#include -#include - -// Must come before gtest.h. -#include -#include -#include - -#include "util/bit_stream_utils.h" -#include "util/bit_stream_utils.inline.h" -#include "util/bit_util.h" -#include "util/faststring.h" -#include "util/rle_encoding.h" -#include "util/debug_util.h" - -using std::string; -using std::vector; - -namespace doris { - -const int kMaxWidth = 64; - -class TestRle : public testing::Test {}; -// Validates encoding of values by encoding and decoding them. If -// expected_encoding != NULL, also validates that the encoded buffer is -// exactly 'expected_encoding'. -// if expected_len is not -1, it will validate the encoded size is correct. -template -void ValidateRle(const vector& values, int bit_width, - uint8_t* expected_encoding, int expected_len) { - faststring buffer; - RleEncoder encoder(&buffer, bit_width); - - for (const auto& value : values) { - encoder.Put(value); - } - int encoded_len = encoder.Flush(); - - if (expected_len != -1) { - EXPECT_EQ(encoded_len, expected_len); - } - if (expected_encoding != nullptr) { - EXPECT_EQ(memcmp(buffer.data(), expected_encoding, expected_len), 0) - << "\n" - << "Expected: " << hexdump((const char*)expected_encoding, expected_len) << "\n" - << "Got: " << hexdump((const char*)buffer.data(), buffer.size()); - } - - // Verify read - RleDecoder decoder(buffer.data(), encoded_len, bit_width); - for (const auto& value : values) { - T val = 0; - bool result = decoder.Get(&val); - EXPECT_TRUE(result); - EXPECT_EQ(value, val); - } -} - -TEST(Rle, SpecificSequences) { - const int kTestLen = 1024; - uint8_t expected_buffer[kTestLen]; - vector values; - - // Test 50 0' followed by 50 1's - values.resize(100); - for (int i = 0; i < 50; ++i) { - values[i] = 0; - } - for (int i = 50; i < 100; ++i) { - values[i] = 1; - } - - // expected_buffer valid for bit width <= 1 byte - expected_buffer[0] = (50 << 1); - expected_buffer[1] = 0; - expected_buffer[2] = (50 << 1); - expected_buffer[3] = 1; - for (int width = 1; width <= 8; ++width) { - ValidateRle(values, width, expected_buffer, 4); - } - - for (int width = 9; width <= kMaxWidth; ++width) { - ValidateRle(values, width, nullptr, 2 * (1 + BitUtil::Ceil(width, 8))); - } - - // Test 100 0's and 1's alternating - for (int i = 0; i < 100; ++i) { - values[i] = i % 2; - } - int num_groups = BitUtil::Ceil(100, 8); - expected_buffer[0] = (num_groups << 1) | 1; - for (int i = 0; i < 100/8; ++i) { - expected_buffer[i + 1] = BOOST_BINARY(1 0 1 0 1 0 1 0); // 0xaa - } - // Values for the last 4 0 and 1's - expected_buffer[1 + 100/8] = BOOST_BINARY(0 0 0 0 1 0 1 0); // 0x0a - - // num_groups and expected_buffer only valid for bit width = 1 - ValidateRle(values, 1, expected_buffer, 1 + num_groups); - for (int width = 2; width <= kMaxWidth; ++width) { - ValidateRle(values, width, nullptr, 1 + BitUtil::Ceil(width * 100, 8)); - } -} - -// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value -// is used, otherwise alternating values are used. -void TestRleValues(int bit_width, int num_vals, int value = -1) { - const uint64_t mod = bit_width == 64 ? 1ULL : 1ULL << bit_width; - vector values; - for (uint64_t v = 0; v < num_vals; ++v) { - values.push_back((value != -1) ? value : (bit_width == 64 ? v : (v % mod))); - } - ValidateRle(values, bit_width, nullptr, -1); -} - -TEST(Rle, TestValues) { - for (int width = 1; width <= kMaxWidth; ++width) { - TestRleValues(width, 1); - TestRleValues(width, 1024); - TestRleValues(width, 1024, 0); - TestRleValues(width, 1024, 1); - } -} - -class BitRle : public testing::Test { -public: - BitRle() { - } - - virtual ~BitRle() { - } -}; - -// Tests all true/false values -TEST_F(BitRle, AllSame) { - const int kTestLen = 1024; - vector values; - - for (int v = 0; v < 2; ++v) { - values.clear(); - for (int i = 0; i < kTestLen; ++i) { - values.push_back(v ? true : false); - } - - ValidateRle(values, 1, nullptr, 3); - } -} - -// Test that writes out a repeated group and then a literal -// group but flush before finishing. -TEST_F(BitRle, Flush) { - vector values; - for (int i = 0; i < 16; ++i) values.push_back(1); - values.push_back(false); - ValidateRle(values, 1, nullptr, -1); - values.push_back(true); - ValidateRle(values, 1, nullptr, -1); - values.push_back(true); - ValidateRle(values, 1, nullptr, -1); - values.push_back(true); - ValidateRle(values, 1, nullptr, -1); -} - -// Test some random bool sequences. -TEST_F(BitRle, RandomBools) { - int iters = 0; - const int n_iters = 20; - while (iters < n_iters) { - srand(iters++); - if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters; - vector values; - bool parity = 0; - for (int i = 0; i < 1000; ++i) { - int group_size = rand() % 20 + 1; // NOLINT(*) - if (group_size > 16) { - group_size = 1; - } - for (int i = 0; i < group_size; ++i) { - values.push_back(parity); - } - parity = !parity; - } - ValidateRle(values, (iters % kMaxWidth) + 1, nullptr, -1); - } -} - -// Test some random 64-bit sequences. -TEST_F(BitRle, Random64Bit) { - int iters = 0; - const int n_iters = 20; - while (iters < n_iters) { - srand(iters++); - if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters; - vector values; - for (int i = 0; i < 1000; ++i) { - int group_size = rand() % 20 + 1; // NOLINT(*) - uint64_t cur_value = (static_cast(rand()) << 32) + static_cast(rand()); - if (group_size > 16) { - group_size = 1; - } - for (int i = 0; i < group_size; ++i) { - values.push_back(cur_value); - } - - } - ValidateRle(values, 64, nullptr, -1); - } -} - -// Test a sequence of 1 0's, 2 1's, 3 0's. etc -// e.g. 011000111100000 -TEST_F(BitRle, RepeatedPattern) { - vector values; - const int min_run = 1; - const int max_run = 32; - - for (int i = min_run; i <= max_run; ++i) { - int v = i % 2; - for (int j = 0; j < i; ++j) { - values.push_back(v); - } - } - - // And go back down again - for (int i = max_run; i >= min_run; --i) { - int v = i % 2; - for (int j = 0; j < i; ++j) { - values.push_back(v); - } - } - - ValidateRle(values, 1, nullptr, -1); -} - -TEST_F(TestRle, TestBulkPut) { - size_t run_length; - bool val = false; - - faststring buffer(1); - RleEncoder encoder(&buffer, 1); - encoder.Put(true, 10); - encoder.Put(false, 7); - encoder.Put(true, 5); - encoder.Put(true, 15); - encoder.Flush(); - - RleDecoder decoder(buffer.data(), encoder.len(), 1); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_TRUE(val); - ASSERT_EQ(10, run_length); - - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_FALSE(val); - ASSERT_EQ(7, run_length); - - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_TRUE(val); - ASSERT_EQ(20, run_length); - - ASSERT_EQ(0, decoder.GetNextRun(&val, std::numeric_limits::max())); -} - -TEST_F(TestRle, TestGetNextRun) { - // Repeat the test with different number of items - for (int num_items = 7; num_items < 200; num_items += 13) { - // Test different block patterns - // 1: 01010101 01010101 - // 2: 00110011 00110011 - // 3: 00011100 01110001 - // ... - for (int block = 1; block <= 20; ++block) { - faststring buffer(1); - RleEncoder encoder(&buffer, 1); - for (int j = 0; j < num_items; ++j) { - encoder.Put(!!(j & 1), block); - } - encoder.Flush(); - - RleDecoder decoder(buffer.data(), encoder.len(), 1); - size_t count = num_items * block; - for (int j = 0; j < num_items; ++j) { - size_t run_length; - bool val = false; - DCHECK_GT(count, 0); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - run_length = std::min(run_length, count); - - ASSERT_EQ(!!(j & 1), val); - ASSERT_EQ(block, run_length); - count -= run_length; - } - DCHECK_EQ(count, 0); - } - } -} - -// Generate a random bit string which consists of 'num_runs' runs, -// each with a random length between 1 and 100. Returns the number -// of values encoded (i.e the sum run length). -static size_t GenerateRandomBitString(int num_runs, faststring* enc_buf, string* string_rep) { - RleEncoder enc(enc_buf, 1); - int num_bits = 0; - for (int i = 0; i < num_runs; i++) { - int run_length = random() % 100; - bool value = static_cast(i & 1); - enc.Put(value, run_length); - string_rep->append(run_length, value ? '1' : '0'); - num_bits += run_length; - } - enc.Flush(); - return num_bits; -} - -TEST_F(TestRle, TestRoundTripRandomSequencesWithRuns) { - srand(time(nullptr)); - - // Test the limiting function of GetNextRun. - const int kMaxToReadAtOnce = (random() % 20) + 1; - - // Generate a bunch of random bit sequences, and "round-trip" them - // through the encode/decode sequence. - for (int rep = 0; rep < 100; rep++) { - faststring buf; - string string_rep; - int num_bits = GenerateRandomBitString(10, &buf, &string_rep); - RleDecoder decoder(buf.data(), buf.size(), 1); - string roundtrip_str; - int rem_to_read = num_bits; - size_t run_len; - bool val; - while (rem_to_read > 0 && - (run_len = decoder.GetNextRun(&val, std::min(kMaxToReadAtOnce, rem_to_read))) != 0) { - ASSERT_LE(run_len, kMaxToReadAtOnce); - roundtrip_str.append(run_len, val ? '1' : '0'); - rem_to_read -= run_len; - } - - ASSERT_EQ(string_rep, roundtrip_str); - } -} -TEST_F(TestRle, TestSkip) { - faststring buffer(1); - RleEncoder encoder(&buffer, 1); - - // 0101010[1] 01010101 01 - // "A" - for (int j = 0; j < 18; ++j) { - encoder.Put(!!(j & 1)); - } - - // 0011[00] 11001100 11001100 11001100 11001100 - // "B" - for (int j = 0; j < 19; ++j) { - encoder.Put(!!(j & 1), 2); - } - - // 000000000000 11[1111111111] 000000000000 111111111111 - // "C" - // 000000000000 111111111111 0[00000000000] 111111111111 - // "D" - // 000000000000 111111111111 000000000000 111111111111 - for (int j = 0; j < 12; ++j) { - encoder.Put(!!(j & 1), 12); - } - encoder.Flush(); - - bool val = false; - size_t run_length; - RleDecoder decoder(buffer.data(), encoder.len(), 1); - - // position before "A" - ASSERT_EQ(3, decoder.Skip(7)); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_TRUE(val); - ASSERT_EQ(1, run_length); - - // position before "B" - ASSERT_EQ(7, decoder.Skip(14)); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_FALSE(val); - ASSERT_EQ(2, run_length); - - // position before "C" - ASSERT_EQ(18, decoder.Skip(46)); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_TRUE(val); - ASSERT_EQ(10, run_length); - - // position before "D" - ASSERT_EQ(24, decoder.Skip(49)); - run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); - ASSERT_FALSE(val); - ASSERT_EQ(11, run_length); - - encoder.Flush(); -} - -} // namespace doris - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +// Must come before gtest.h. +#include +#include +#include + +#include "util/bit_stream_utils.h" +#include "util/bit_stream_utils.inline.h" +#include "util/bit_util.h" +#include "util/faststring.h" +#include "util/rle_encoding.h" +#include "util/debug_util.h" + +using std::string; +using std::vector; + +namespace doris { + +const int kMaxWidth = 64; + +class TestRle : public testing::Test {}; +// Validates encoding of values by encoding and decoding them. If +// expected_encoding != NULL, also validates that the encoded buffer is +// exactly 'expected_encoding'. +// if expected_len is not -1, it will validate the encoded size is correct. +template +void ValidateRle(const vector& values, int bit_width, + uint8_t* expected_encoding, int expected_len) { + faststring buffer; + RleEncoder encoder(&buffer, bit_width); + + for (const auto& value : values) { + encoder.Put(value); + } + int encoded_len = encoder.Flush(); + + if (expected_len != -1) { + EXPECT_EQ(encoded_len, expected_len); + } + if (expected_encoding != nullptr) { + EXPECT_EQ(memcmp(buffer.data(), expected_encoding, expected_len), 0) + << "\n" + << "Expected: " << hexdump((const char*)expected_encoding, expected_len) << "\n" + << "Got: " << hexdump((const char*)buffer.data(), buffer.size()); + } + + // Verify read + RleDecoder decoder(buffer.data(), encoded_len, bit_width); + for (const auto& value : values) { + T val = 0; + bool result = decoder.Get(&val); + EXPECT_TRUE(result); + EXPECT_EQ(value, val); + } +} + +TEST(Rle, SpecificSequences) { + const int kTestLen = 1024; + uint8_t expected_buffer[kTestLen]; + vector values; + + // Test 50 0' followed by 50 1's + values.resize(100); + for (int i = 0; i < 50; ++i) { + values[i] = 0; + } + for (int i = 50; i < 100; ++i) { + values[i] = 1; + } + + // expected_buffer valid for bit width <= 1 byte + expected_buffer[0] = (50 << 1); + expected_buffer[1] = 0; + expected_buffer[2] = (50 << 1); + expected_buffer[3] = 1; + for (int width = 1; width <= 8; ++width) { + ValidateRle(values, width, expected_buffer, 4); + } + + for (int width = 9; width <= kMaxWidth; ++width) { + ValidateRle(values, width, nullptr, 2 * (1 + BitUtil::Ceil(width, 8))); + } + + // Test 100 0's and 1's alternating + for (int i = 0; i < 100; ++i) { + values[i] = i % 2; + } + int num_groups = BitUtil::Ceil(100, 8); + expected_buffer[0] = (num_groups << 1) | 1; + for (int i = 0; i < 100/8; ++i) { + expected_buffer[i + 1] = BOOST_BINARY(1 0 1 0 1 0 1 0); // 0xaa + } + // Values for the last 4 0 and 1's + expected_buffer[1 + 100/8] = BOOST_BINARY(0 0 0 0 1 0 1 0); // 0x0a + + // num_groups and expected_buffer only valid for bit width = 1 + ValidateRle(values, 1, expected_buffer, 1 + num_groups); + for (int width = 2; width <= kMaxWidth; ++width) { + ValidateRle(values, width, nullptr, 1 + BitUtil::Ceil(width * 100, 8)); + } +} + +// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value +// is used, otherwise alternating values are used. +void TestRleValues(int bit_width, int num_vals, int value = -1) { + const uint64_t mod = bit_width == 64 ? 1ULL : 1ULL << bit_width; + vector values; + for (uint64_t v = 0; v < num_vals; ++v) { + values.push_back((value != -1) ? value : (bit_width == 64 ? v : (v % mod))); + } + ValidateRle(values, bit_width, nullptr, -1); +} + +TEST(Rle, TestValues) { + for (int width = 1; width <= kMaxWidth; ++width) { + TestRleValues(width, 1); + TestRleValues(width, 1024); + TestRleValues(width, 1024, 0); + TestRleValues(width, 1024, 1); + } +} + +class BitRle : public testing::Test { +public: + BitRle() { + } + + virtual ~BitRle() { + } +}; + +// Tests all true/false values +TEST_F(BitRle, AllSame) { + const int kTestLen = 1024; + vector values; + + for (int v = 0; v < 2; ++v) { + values.clear(); + for (int i = 0; i < kTestLen; ++i) { + values.push_back(v ? true : false); + } + + ValidateRle(values, 1, nullptr, 3); + } +} + +// Test that writes out a repeated group and then a literal +// group but flush before finishing. +TEST_F(BitRle, Flush) { + vector values; + for (int i = 0; i < 16; ++i) values.push_back(1); + values.push_back(false); + ValidateRle(values, 1, nullptr, -1); + values.push_back(true); + ValidateRle(values, 1, nullptr, -1); + values.push_back(true); + ValidateRle(values, 1, nullptr, -1); + values.push_back(true); + ValidateRle(values, 1, nullptr, -1); +} + +// Test some random bool sequences. +TEST_F(BitRle, RandomBools) { + int iters = 0; + const int n_iters = 20; + while (iters < n_iters) { + srand(iters++); + if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters; + vector values; + bool parity = 0; + for (int i = 0; i < 1000; ++i) { + int group_size = rand() % 20 + 1; // NOLINT(*) + if (group_size > 16) { + group_size = 1; + } + for (int i = 0; i < group_size; ++i) { + values.push_back(parity); + } + parity = !parity; + } + ValidateRle(values, (iters % kMaxWidth) + 1, nullptr, -1); + } +} + +// Test some random 64-bit sequences. +TEST_F(BitRle, Random64Bit) { + int iters = 0; + const int n_iters = 20; + while (iters < n_iters) { + srand(iters++); + if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters; + vector values; + for (int i = 0; i < 1000; ++i) { + int group_size = rand() % 20 + 1; // NOLINT(*) + uint64_t cur_value = (static_cast(rand()) << 32) + static_cast(rand()); + if (group_size > 16) { + group_size = 1; + } + for (int i = 0; i < group_size; ++i) { + values.push_back(cur_value); + } + + } + ValidateRle(values, 64, nullptr, -1); + } +} + +// Test a sequence of 1 0's, 2 1's, 3 0's. etc +// e.g. 011000111100000 +TEST_F(BitRle, RepeatedPattern) { + vector values; + const int min_run = 1; + const int max_run = 32; + + for (int i = min_run; i <= max_run; ++i) { + int v = i % 2; + for (int j = 0; j < i; ++j) { + values.push_back(v); + } + } + + // And go back down again + for (int i = max_run; i >= min_run; --i) { + int v = i % 2; + for (int j = 0; j < i; ++j) { + values.push_back(v); + } + } + + ValidateRle(values, 1, nullptr, -1); +} + +TEST_F(TestRle, TestBulkPut) { + size_t run_length; + bool val = false; + + faststring buffer(1); + RleEncoder encoder(&buffer, 1); + encoder.Put(true, 10); + encoder.Put(false, 7); + encoder.Put(true, 5); + encoder.Put(true, 15); + encoder.Flush(); + + RleDecoder decoder(buffer.data(), encoder.len(), 1); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_TRUE(val); + ASSERT_EQ(10, run_length); + + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_FALSE(val); + ASSERT_EQ(7, run_length); + + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_TRUE(val); + ASSERT_EQ(20, run_length); + + ASSERT_EQ(0, decoder.GetNextRun(&val, std::numeric_limits::max())); +} + +TEST_F(TestRle, TestGetNextRun) { + // Repeat the test with different number of items + for (int num_items = 7; num_items < 200; num_items += 13) { + // Test different block patterns + // 1: 01010101 01010101 + // 2: 00110011 00110011 + // 3: 00011100 01110001 + // ... + for (int block = 1; block <= 20; ++block) { + faststring buffer(1); + RleEncoder encoder(&buffer, 1); + for (int j = 0; j < num_items; ++j) { + encoder.Put(!!(j & 1), block); + } + encoder.Flush(); + + RleDecoder decoder(buffer.data(), encoder.len(), 1); + size_t count = num_items * block; + for (int j = 0; j < num_items; ++j) { + size_t run_length; + bool val = false; + DCHECK_GT(count, 0); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + run_length = std::min(run_length, count); + + ASSERT_EQ(!!(j & 1), val); + ASSERT_EQ(block, run_length); + count -= run_length; + } + DCHECK_EQ(count, 0); + } + } +} + +// Generate a random bit string which consists of 'num_runs' runs, +// each with a random length between 1 and 100. Returns the number +// of values encoded (i.e the sum run length). +static size_t GenerateRandomBitString(int num_runs, faststring* enc_buf, string* string_rep) { + RleEncoder enc(enc_buf, 1); + int num_bits = 0; + for (int i = 0; i < num_runs; i++) { + int run_length = random() % 100; + bool value = static_cast(i & 1); + enc.Put(value, run_length); + string_rep->append(run_length, value ? '1' : '0'); + num_bits += run_length; + } + enc.Flush(); + return num_bits; +} + +TEST_F(TestRle, TestRoundTripRandomSequencesWithRuns) { + srand(time(nullptr)); + + // Test the limiting function of GetNextRun. + const int kMaxToReadAtOnce = (random() % 20) + 1; + + // Generate a bunch of random bit sequences, and "round-trip" them + // through the encode/decode sequence. + for (int rep = 0; rep < 100; rep++) { + faststring buf; + string string_rep; + int num_bits = GenerateRandomBitString(10, &buf, &string_rep); + RleDecoder decoder(buf.data(), buf.size(), 1); + string roundtrip_str; + int rem_to_read = num_bits; + size_t run_len; + bool val; + while (rem_to_read > 0 && + (run_len = decoder.GetNextRun(&val, std::min(kMaxToReadAtOnce, rem_to_read))) != 0) { + ASSERT_LE(run_len, kMaxToReadAtOnce); + roundtrip_str.append(run_len, val ? '1' : '0'); + rem_to_read -= run_len; + } + + ASSERT_EQ(string_rep, roundtrip_str); + } +} +TEST_F(TestRle, TestSkip) { + faststring buffer(1); + RleEncoder encoder(&buffer, 1); + + // 0101010[1] 01010101 01 + // "A" + for (int j = 0; j < 18; ++j) { + encoder.Put(!!(j & 1)); + } + + // 0011[00] 11001100 11001100 11001100 11001100 + // "B" + for (int j = 0; j < 19; ++j) { + encoder.Put(!!(j & 1), 2); + } + + // 000000000000 11[1111111111] 000000000000 111111111111 + // "C" + // 000000000000 111111111111 0[00000000000] 111111111111 + // "D" + // 000000000000 111111111111 000000000000 111111111111 + for (int j = 0; j < 12; ++j) { + encoder.Put(!!(j & 1), 12); + } + encoder.Flush(); + + bool val = false; + size_t run_length; + RleDecoder decoder(buffer.data(), encoder.len(), 1); + + // position before "A" + ASSERT_EQ(3, decoder.Skip(7)); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_TRUE(val); + ASSERT_EQ(1, run_length); + + // position before "B" + ASSERT_EQ(7, decoder.Skip(14)); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_FALSE(val); + ASSERT_EQ(2, run_length); + + // position before "C" + ASSERT_EQ(18, decoder.Skip(46)); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_TRUE(val); + ASSERT_EQ(10, run_length); + + // position before "D" + ASSERT_EQ(24, decoder.Skip(49)); + run_length = decoder.GetNextRun(&val, std::numeric_limits::max()); + ASSERT_FALSE(val); + ASSERT_EQ(11, run_length); + + encoder.Flush(); +} + +} // namespace doris + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/docs/documentation/cn/community/subscribe-mail-list.md b/docs/documentation/cn/community/subscribe-mail-list.md index dbd23c074f..9f56cd0611 100644 --- a/docs/documentation/cn/community/subscribe-mail-list.md +++ b/docs/documentation/cn/community/subscribe-mail-list.md @@ -1,43 +1,43 @@ -# 订阅邮件列表 - +# 订阅邮件列表 + 邮件列表(Mail List)是 Apache 社区最被认可的交流方式。一般来说,开源社区的提问与解答、技术讨论、事务决策等都通过邮件列表来承载。邮件列表异步、广播的特性,也非常适合开源社区的沟通交流。那么,如何订阅 Apache Doris (incubating) 的邮件列表呢?主要包括以下五个步骤。 - -## 1. 发送订阅邮件 - -打开自己的邮箱,新建邮件,向`dev-subscribe@doris.apache.org`发送一封邮件(邮件主题和内容任意) - -![step1](../../../resources/images/subscribe-mail-list-step1.png) - -## 2. 接收来自 dev-help@doris.apache.org 的确认邮件 - -执行完第一步之后,您将收到一封来自`dev-help@doris.apache.org`的确认邮件,邮件内容如下图所示。(**如果长时间未能收到,请确认该邮件是否已被拦截,或已经被自动归入“订阅邮件”、“垃圾邮件”、“推广邮件”等文件夹**) + +## 1. 发送订阅邮件 + +打开自己的邮箱,新建邮件,向`dev-subscribe@doris.apache.org`发送一封邮件(邮件主题和内容任意) + +![step1](../../../resources/images/subscribe-mail-list-step1.png) + +## 2. 接收来自 dev-help@doris.apache.org 的确认邮件 + +执行完第一步之后,您将收到一封来自`dev-help@doris.apache.org`的确认邮件,邮件内容如下图所示。(**如果长时间未能收到,请确认该邮件是否已被拦截,或已经被自动归入“订阅邮件”、“垃圾邮件”、“推广邮件”等文件夹**) ![step2](../../../resources/images/subscribe-mail-list-step2.png) - -## 3. 回复确认邮件 - -​针对上一步接收到的邮件, - -​**a.直接回复该邮件** - -​***或*** - -**b. 新建一封`收件人`为上一步中的`回复地址`的邮件** - -​均可,内容主题不限 - -![step3](../../../resources/images/subscribe-mail-list-step3.png) - - -## 4. 接收欢迎邮件 - -​完成第三步之后,将会受到一封标题为**WELCOME to dev@doris.apache.org**的欢迎邮件。至此,订阅邮件列表的工作已经完成了,社区的动态都会通过邮件的方式通知您。 - -![step4](../../../resources/images/subscribe-mail-list-step4.png) - - -## 5. 发起邮件讨论(可选) - + +## 3. 回复确认邮件 + +​针对上一步接收到的邮件, + +​**a.直接回复该邮件** + +​***或*** + +**b. 新建一封`收件人`为上一步中的`回复地址`的邮件** + +​均可,内容主题不限 + +![step3](../../../resources/images/subscribe-mail-list-step3.png) + + +## 4. 接收欢迎邮件 + +​完成第三步之后,将会受到一封标题为**WELCOME to dev@doris.apache.org**的欢迎邮件。至此,订阅邮件列表的工作已经完成了,社区的动态都会通过邮件的方式通知您。 + +![step4](../../../resources/images/subscribe-mail-list-step4.png) + + +## 5. 发起邮件讨论(可选) + ​成功订阅邮件列表后,若想发起讨论,直接往`dev@doris.apache.org`发送邮件即可。所有订阅了邮件列表的人都会收到邮件。 ​ ​ \ No newline at end of file diff --git a/docs/documentation/cn/internal/doris_storage_optimization.md b/docs/documentation/cn/internal/doris_storage_optimization.md index 51661cefd4..ab34b206aa 100644 --- a/docs/documentation/cn/internal/doris_storage_optimization.md +++ b/docs/documentation/cn/internal/doris_storage_optimization.md @@ -1,206 +1,206 @@ -# Doris存储文件格式优化 # - -## 文件格式 ## - -![](../../../resources/images/segment_v2.png) -
图1. doris segment文件格式
- -文件包括: -- 文件开始是8个字节的magic code,用于识别文件格式和版本 -- Data Region:用于存储各个列的数据信息,这里的数据是按需分page加载的 -- Index Region: doris中将各个列的index数据统一存储在Index Region,这里的数据会按照列粒度进行加载,所以跟列的数据信息分开存储 -- Footer信息 - - FileFooterPB:定义文件的元数据信息 - - 4个字节的footer pb内容的checksum - - 4个字节的FileFooterPB消息长度,用于读取FileFooterPB - - 8个字节的MAGIC CODE,之所以在末位存储,是方便不同的场景进行文件类型的识别 - -文件中的数据按照page的方式进行组织,page是编码和压缩的基本单位。现在的page类型包括以下几种: - -### DataPage ### - -DataPage分为两种:nullable和non-nullable的data page。 - -nullable的data page内容包括: -``` - - +----------------+ - | value count | - |----------------| - | first row id | - |----------------| - | bitmap length | - |----------------| - | null bitmap | - |----------------| - | data | - |----------------| - | checksum | - +----------------+ -``` - -non-nullable data page结构如下: - -``` - |----------------| - | value count | - |----------------| - | first row id | - |----------------| - | data | - |----------------| - | checksum | - +----------------+ -``` - -其中各个字段含义如下: - -- value count - - 表示page中的行数 -- first row id - - page中第一行的行号 -- bitmap length - - 表示接下来bitmap的字节数 -- null bitmap - - 表示null信息的bitmap -- data - - 存储经过encoding和compress之后的数据 - - 需要在数据的头部信息中写入:is_compressed - - 各种不同编码的data需要在头部信息写入一些字段信息,以实现数据的解析 - - TODO:添加各种encoding的header信息 -- checksum - - 存储page粒度的校验和,包括page的header和之后的实际数据 - - -### Bloom Filter Pages ### - -针对每个bloom filter列,会在page的粒度相应的生成一个bloom filter的page,保存在bloom filter pages区域 - -### Ordinal Index Page ### - -针对每个列,都会按照page粒度,建立行号的稀疏索引。内容为这个page的起始行的行号到这个block的指针(包括offset和length) - -### Short Key Index page ### - -我们会每隔N行(可配置)生成一个short key的稀疏索引,索引的内容为:short key->行号(ordinal) - -### Column的其他索引 ### - -该格式设计支持后续扩展其他的索引信息,比如bitmap索引,spatial索引等等,只需要将需要的数据写到现有的列数据后面,并且添加对应的元数据字段到FileFooterPB中 - -### 元数据定义 ### -FileFooterPB的定义为: - -``` -message ColumnPB { - optional uint32 column_id = 1; // 这里使用column id,不使用column name是因为计划支持修改列名 - optional string type = 2; // 列类型 - optional string aggregation = 3; // 是否聚合 - optional uint32 length = 4; // 长度 - optional bool is_key = 5; // 是否是主键列 - optional string default_value = 6; // 默认值 - optional uint32 precision = 9 [default = 27]; // 精度 - optional uint32 frac = 10 [default = 9]; - optional bool is_nullable = 11 [default=false]; // 是否有null - optional bool is_bf_column = 15 [default=false]; // 是否有bf词典 - optional bool is_bitmap_column = 16 [default=false]; // 是否有bitmap索引 -} - -// page偏移 -message PagePointerPB { - required uint64 offset; // page在文件中的偏移 - required uint32 length; // page的大小 -} - -message MetadataPairPB { - optional string key = 1; - optional bytes value = 2; -} - -message ColumnMetaPB { - optional ColumnMessage encoding; // 编码方式 - - optional PagePointerPB dict_page // 词典page - repeated PagePointerPB bloom_filter_pages; // bloom filter词典信息 - optional PagePointerPB ordinal_index_page; // 行号索引数据 - optional PagePointerPB page_zone_map_page; // page级别统计信息索引数据 - - optional PagePointerPB bitmap_index_page; // bitmap索引数据 - - optional uint64 data_footprint; // 列中索引的大小 - optional uint64 index_footprint; // 列中数据的大小 - optional uint64 raw_data_footprint; // 原始列数据大小 - - optional CompressKind compress_kind; // 列的压缩方式 - - optional ZoneMapPB column_zone_map; //文件级别的过滤条件 - repeated MetadataPairPB column_meta_datas; -} - -message FileFooterPB { - optional uint32 version = 2 [default = 1]; // 用于版本兼容和升级使用 - repeated ColumnPB schema = 5; // 列Schema - optional uint64 num_values = 4; // 文件中保存的行数 - optional uint64 index_footprint = 7; // 索引大小 - optional uint64 data_footprint = 8; // 数据大小 - optional uint64 raw_data_footprint = 8; // 原始数据大小 - - optional CompressKind compress_kind = 9 [default = COMPRESS_LZO]; // 压缩方式 - repeated ColumnMetaPB column_metas = 10; // 列元数据 - optional PagePointerPB key_index_page; // short key索引page -} - -``` - -## 读写逻辑 ## - -### 写入 ### - -大体的写入流程如下: -1. 写入magic -2. 根据schema信息,生成对应的ColumnWriter,每个ColumnWriter按照不同的类型,获取对应的encoding信息(可配置),根据encoding,生成对应的encoder -3. 调用encoder->add(value)进行数据写入,每个K行,生成一个short key index entry,并且,如果当前的page满足一定条件(大小超过1M或者行数为K),就生成一个新的page,缓存在内存中。 -4. 不断的循环步骤3,直到数据写入完成。将各个列的数据依序刷入文件中 -5. 生成FileFooterPB信息,写入文件中。 - -相关的问题: - -- short key的索引如何生成? - - 现在还是按照每隔多少行生成一个short key的稀疏索引,保持每隔1024行生成一个short的稀疏索引,具体的内容是:short key -> ordinal - -- ordinal索引里面应该存什么? - - 存储page的第一个ordinal到page pointer的映射信息 -- 不同encoding类型的page里存什么? - - 词典压缩 - - plain - - rle - - bshuf - -### 读取 ### - -1. 读取文件的magic,判断文件类型和版本 -2. 读取FileFooterPB,进行checksum校验 -3. 按照需要的列,读取short key索引和对应列的数据ordinal索引信息 -4. 使用start key和end key,通过short key索引定位到要读取的行号,然后通过ordinal索引确定需要读取的row ranges, 同时需要通过统计信息、bitmap索引等过滤需要读取的row ranges -5. 然后按照row ranges通过ordinal索引读取行的数据 - -相关的问题: -1. 如何实现在page内部快速的定位到某一行? - - page内部是的数据是经过encoding的,无法快速进行行级数据的定位。不同的encoding方式,在内部进行快速的行号定位的方案不一样,需要具体分析: - - 如果是rle编码的,需要通过解析rle的header进行skip,直到到达包含该行的那个rle块之后,再进行反解。 - - binary plain encoding:会在page的中存储offset信息,并且会在page header中指定offset信息的offset,读取的时候会先解析offset信息到数组中,这样子就可以通过各个行的offset数据信息快速的定位block某一行的数据 -2. 如何实现块的高效读取?可以考虑将相邻的块在读取的时候进行merge,一次性读取? - 这个需要在读取的时候,判断block是否连续,如果连续,就一次性的读取 - -## 编码 ## - -现有的doris存储中,针对string类型的编码,采用plain encoding的方式,效率比较低。经过对比,发现在百度统计的场景下,数据会因为string类型的编码膨胀超过一倍。所以,计划引入基于词典的编码压缩。 - -## 压缩 ## - -实现可扩展的压缩框架,支持多种压缩算法,方便后续添加新的压缩算法,计划引入zstd压缩。 - -## TODO ## -1. 如何实现嵌套类型?如何在嵌套类型中进行行号定位? -2. 如何优化现在的ScanRange拆分导致的下游bitmap、column statistic统计等进行多次? +# Doris存储文件格式优化 # + +## 文件格式 ## + +![](../../../resources/images/segment_v2.png) +
图1. doris segment文件格式
+ +文件包括: +- 文件开始是8个字节的magic code,用于识别文件格式和版本 +- Data Region:用于存储各个列的数据信息,这里的数据是按需分page加载的 +- Index Region: doris中将各个列的index数据统一存储在Index Region,这里的数据会按照列粒度进行加载,所以跟列的数据信息分开存储 +- Footer信息 + - FileFooterPB:定义文件的元数据信息 + - 4个字节的footer pb内容的checksum + - 4个字节的FileFooterPB消息长度,用于读取FileFooterPB + - 8个字节的MAGIC CODE,之所以在末位存储,是方便不同的场景进行文件类型的识别 + +文件中的数据按照page的方式进行组织,page是编码和压缩的基本单位。现在的page类型包括以下几种: + +### DataPage ### + +DataPage分为两种:nullable和non-nullable的data page。 + +nullable的data page内容包括: +``` + + +----------------+ + | value count | + |----------------| + | first row id | + |----------------| + | bitmap length | + |----------------| + | null bitmap | + |----------------| + | data | + |----------------| + | checksum | + +----------------+ +``` + +non-nullable data page结构如下: + +``` + |----------------| + | value count | + |----------------| + | first row id | + |----------------| + | data | + |----------------| + | checksum | + +----------------+ +``` + +其中各个字段含义如下: + +- value count + - 表示page中的行数 +- first row id + - page中第一行的行号 +- bitmap length + - 表示接下来bitmap的字节数 +- null bitmap + - 表示null信息的bitmap +- data + - 存储经过encoding和compress之后的数据 + - 需要在数据的头部信息中写入:is_compressed + - 各种不同编码的data需要在头部信息写入一些字段信息,以实现数据的解析 + - TODO:添加各种encoding的header信息 +- checksum + - 存储page粒度的校验和,包括page的header和之后的实际数据 + + +### Bloom Filter Pages ### + +针对每个bloom filter列,会在page的粒度相应的生成一个bloom filter的page,保存在bloom filter pages区域 + +### Ordinal Index Page ### + +针对每个列,都会按照page粒度,建立行号的稀疏索引。内容为这个page的起始行的行号到这个block的指针(包括offset和length) + +### Short Key Index page ### + +我们会每隔N行(可配置)生成一个short key的稀疏索引,索引的内容为:short key->行号(ordinal) + +### Column的其他索引 ### + +该格式设计支持后续扩展其他的索引信息,比如bitmap索引,spatial索引等等,只需要将需要的数据写到现有的列数据后面,并且添加对应的元数据字段到FileFooterPB中 + +### 元数据定义 ### +FileFooterPB的定义为: + +``` +message ColumnPB { + optional uint32 column_id = 1; // 这里使用column id,不使用column name是因为计划支持修改列名 + optional string type = 2; // 列类型 + optional string aggregation = 3; // 是否聚合 + optional uint32 length = 4; // 长度 + optional bool is_key = 5; // 是否是主键列 + optional string default_value = 6; // 默认值 + optional uint32 precision = 9 [default = 27]; // 精度 + optional uint32 frac = 10 [default = 9]; + optional bool is_nullable = 11 [default=false]; // 是否有null + optional bool is_bf_column = 15 [default=false]; // 是否有bf词典 + optional bool is_bitmap_column = 16 [default=false]; // 是否有bitmap索引 +} + +// page偏移 +message PagePointerPB { + required uint64 offset; // page在文件中的偏移 + required uint32 length; // page的大小 +} + +message MetadataPairPB { + optional string key = 1; + optional bytes value = 2; +} + +message ColumnMetaPB { + optional ColumnMessage encoding; // 编码方式 + + optional PagePointerPB dict_page // 词典page + repeated PagePointerPB bloom_filter_pages; // bloom filter词典信息 + optional PagePointerPB ordinal_index_page; // 行号索引数据 + optional PagePointerPB page_zone_map_page; // page级别统计信息索引数据 + + optional PagePointerPB bitmap_index_page; // bitmap索引数据 + + optional uint64 data_footprint; // 列中索引的大小 + optional uint64 index_footprint; // 列中数据的大小 + optional uint64 raw_data_footprint; // 原始列数据大小 + + optional CompressKind compress_kind; // 列的压缩方式 + + optional ZoneMapPB column_zone_map; //文件级别的过滤条件 + repeated MetadataPairPB column_meta_datas; +} + +message FileFooterPB { + optional uint32 version = 2 [default = 1]; // 用于版本兼容和升级使用 + repeated ColumnPB schema = 5; // 列Schema + optional uint64 num_values = 4; // 文件中保存的行数 + optional uint64 index_footprint = 7; // 索引大小 + optional uint64 data_footprint = 8; // 数据大小 + optional uint64 raw_data_footprint = 8; // 原始数据大小 + + optional CompressKind compress_kind = 9 [default = COMPRESS_LZO]; // 压缩方式 + repeated ColumnMetaPB column_metas = 10; // 列元数据 + optional PagePointerPB key_index_page; // short key索引page +} + +``` + +## 读写逻辑 ## + +### 写入 ### + +大体的写入流程如下: +1. 写入magic +2. 根据schema信息,生成对应的ColumnWriter,每个ColumnWriter按照不同的类型,获取对应的encoding信息(可配置),根据encoding,生成对应的encoder +3. 调用encoder->add(value)进行数据写入,每个K行,生成一个short key index entry,并且,如果当前的page满足一定条件(大小超过1M或者行数为K),就生成一个新的page,缓存在内存中。 +4. 不断的循环步骤3,直到数据写入完成。将各个列的数据依序刷入文件中 +5. 生成FileFooterPB信息,写入文件中。 + +相关的问题: + +- short key的索引如何生成? + - 现在还是按照每隔多少行生成一个short key的稀疏索引,保持每隔1024行生成一个short的稀疏索引,具体的内容是:short key -> ordinal + +- ordinal索引里面应该存什么? + - 存储page的第一个ordinal到page pointer的映射信息 +- 不同encoding类型的page里存什么? + - 词典压缩 + - plain + - rle + - bshuf + +### 读取 ### + +1. 读取文件的magic,判断文件类型和版本 +2. 读取FileFooterPB,进行checksum校验 +3. 按照需要的列,读取short key索引和对应列的数据ordinal索引信息 +4. 使用start key和end key,通过short key索引定位到要读取的行号,然后通过ordinal索引确定需要读取的row ranges, 同时需要通过统计信息、bitmap索引等过滤需要读取的row ranges +5. 然后按照row ranges通过ordinal索引读取行的数据 + +相关的问题: +1. 如何实现在page内部快速的定位到某一行? + + page内部是的数据是经过encoding的,无法快速进行行级数据的定位。不同的encoding方式,在内部进行快速的行号定位的方案不一样,需要具体分析: + - 如果是rle编码的,需要通过解析rle的header进行skip,直到到达包含该行的那个rle块之后,再进行反解。 + - binary plain encoding:会在page的中存储offset信息,并且会在page header中指定offset信息的offset,读取的时候会先解析offset信息到数组中,这样子就可以通过各个行的offset数据信息快速的定位block某一行的数据 +2. 如何实现块的高效读取?可以考虑将相邻的块在读取的时候进行merge,一次性读取? + 这个需要在读取的时候,判断block是否连续,如果连续,就一次性的读取 + +## 编码 ## + +现有的doris存储中,针对string类型的编码,采用plain encoding的方式,效率比较低。经过对比,发现在百度统计的场景下,数据会因为string类型的编码膨胀超过一倍。所以,计划引入基于词典的编码压缩。 + +## 压缩 ## + +实现可扩展的压缩框架,支持多种压缩算法,方便后续添加新的压缩算法,计划引入zstd压缩。 + +## TODO ## +1. 如何实现嵌套类型?如何在嵌套类型中进行行号定位? +2. 如何优化现在的ScanRange拆分导致的下游bitmap、column statistic统计等进行多次? diff --git a/docs/documentation/en/internal/doris_storage_optimization_EN.md b/docs/documentation/en/internal/doris_storage_optimization_EN.md index ef7721e8dc..0376aa0631 100644 --- a/docs/documentation/en/internal/doris_storage_optimization_EN.md +++ b/docs/documentation/en/internal/doris_storage_optimization_EN.md @@ -22,35 +22,35 @@ The data in the file is organized in the form of page, which is the basic unit o Data Page is divided into two types: nullable and non-nullable data pages. Nullable's data page includes: -``` - - +----------------+ - | value count | - |----------------| - | first row id | - |----------------| - | bitmap length | - |----------------| - | null bitmap | - |----------------| - | data | - |----------------| - | checksum | - +----------------+ +``` + + +----------------+ + | value count | + |----------------| + | first row id | + |----------------| + | bitmap length | + |----------------| + | null bitmap | + |----------------| + | data | + |----------------| + | checksum | + +----------------+ ``` non -zero data page32467;- 26500;- 229140;- -``` - |----------------| - | value count | - |----------------| - | first row id | - |----------------| - | data | - |----------------| - | checksum | - +----------------+ +``` + |----------------| + | value count | + |----------------| + | first row id | + |----------------| + | data | + |----------------| + | checksum | + +----------------+ ``` The meanings of each field are as follows: @@ -91,65 +91,65 @@ The format design supports the subsequent expansion of other index information, ### Metadata Definition### FileFooterPB is defined as: -``` -message ColumnPB { - optional uint32 column_id = 1; // 这里使用column id,不使用column name是因为计划支持修改列名 - optional string type = 2; // 列类型 - optional string aggregation = 3; // 是否聚合 - optional uint32 length = 4; // 长度 - optional bool is_key = 5; // 是否是主键列 - optional string default_value = 6; // 默认值 - optional uint32 precision = 9 [default = 27]; // 精度 - optional uint32 frac = 10 [default = 9]; - optional bool is_nullable = 11 [default=false]; // 是否有null - optional bool is_bf_column = 15 [default=false]; // 是否有bf词典 - optional bool is_bitmap_column = 16 [default=false]; // 是否有bitmap索引 -} - -// page偏移 -message PagePointerPB { - required uint64 offset; // page在文件中的偏移 - required uint32 length; // page的大小 -} - -message MetadataPairPB { - optional string key = 1; - optional bytes value = 2; -} - -message ColumnMetaPB { - optional ColumnMessage encoding; // 编码方式 - - optional PagePointerPB dict_page // 词典page - repeated PagePointerPB bloom_filter_pages; // bloom filter词典信息 - optional PagePointerPB ordinal_index_page; // 行号索引数据 - optional PagePointerPB page_zone_map_page; // page级别统计信息索引数据 - - optional PagePointerPB bitmap_index_page; // bitmap索引数据 - - optional uint64 data_footprint; // 列中索引的大小 - optional uint64 index_footprint; // 列中数据的大小 - optional uint64 raw_data_footprint; // 原始列数据大小 - - optional CompressKind compress_kind; // 列的压缩方式 - - optional ZoneMapPB column_zone_map; //文件级别的过滤条件 - repeated MetadataPairPB column_meta_datas; -} - -message FileFooterPB { - optional uint32 version = 2 [default = 1]; // 用于版本兼容和升级使用 - repeated ColumnPB schema = 5; // 列Schema - optional uint64 num_values = 4; // 文件中保存的行数 - optional uint64 index_footprint = 7; // 索引大小 - optional uint64 data_footprint = 8; // 数据大小 - optional uint64 raw_data_footprint = 8; // 原始数据大小 - - optional CompressKind compress_kind = 9 [default = COMPRESS_LZO]; // 压缩方式 - repeated ColumnMetaPB column_metas = 10; // 列元数据 - optional PagePointerPB key_index_page; // short key索引page -} - +``` +message ColumnPB { + optional uint32 column_id = 1; // 这里使用column id,不使用column name是因为计划支持修改列名 + optional string type = 2; // 列类型 + optional string aggregation = 3; // 是否聚合 + optional uint32 length = 4; // 长度 + optional bool is_key = 5; // 是否是主键列 + optional string default_value = 6; // 默认值 + optional uint32 precision = 9 [default = 27]; // 精度 + optional uint32 frac = 10 [default = 9]; + optional bool is_nullable = 11 [default=false]; // 是否有null + optional bool is_bf_column = 15 [default=false]; // 是否有bf词典 + optional bool is_bitmap_column = 16 [default=false]; // 是否有bitmap索引 +} + +// page偏移 +message PagePointerPB { + required uint64 offset; // page在文件中的偏移 + required uint32 length; // page的大小 +} + +message MetadataPairPB { + optional string key = 1; + optional bytes value = 2; +} + +message ColumnMetaPB { + optional ColumnMessage encoding; // 编码方式 + + optional PagePointerPB dict_page // 词典page + repeated PagePointerPB bloom_filter_pages; // bloom filter词典信息 + optional PagePointerPB ordinal_index_page; // 行号索引数据 + optional PagePointerPB page_zone_map_page; // page级别统计信息索引数据 + + optional PagePointerPB bitmap_index_page; // bitmap索引数据 + + optional uint64 data_footprint; // 列中索引的大小 + optional uint64 index_footprint; // 列中数据的大小 + optional uint64 raw_data_footprint; // 原始列数据大小 + + optional CompressKind compress_kind; // 列的压缩方式 + + optional ZoneMapPB column_zone_map; //文件级别的过滤条件 + repeated MetadataPairPB column_meta_datas; +} + +message FileFooterPB { + optional uint32 version = 2 [default = 1]; // 用于版本兼容和升级使用 + repeated ColumnPB schema = 5; // 列Schema + optional uint64 num_values = 4; // 文件中保存的行数 + optional uint64 index_footprint = 7; // 索引大小 + optional uint64 data_footprint = 8; // 数据大小 + optional uint64 raw_data_footprint = 8; // 原始数据大小 + + optional CompressKind compress_kind = 9 [default = COMPRESS_LZO]; // 压缩方式 + repeated ColumnMetaPB column_metas = 10; // 列元数据 + optional PagePointerPB key_index_page; // short key索引page +} + ``` ## Read-write logic##