From adb758dcaca9826be3bbbe533c16a53c2a024716 Mon Sep 17 00:00:00 2001 From: yiguolei <676222867@qq.com> Date: Thu, 26 Jan 2023 16:21:12 +0800 Subject: [PATCH] [refactor](remove non vec code) remove json functions string functions match functions and some code (#16141) remove json functions code remove string functions code remove math functions code move MatchPredicate to olap since it is only used in storage predicate process remove some code in tuple, Tuple structure should be removed in the future. remove many code in collection value structure, they are useless --- be/src/common/daemon.cpp | 6 - be/src/exec/arrow/arrow_reader.h | 6 +- be/src/exec/arrow/parquet_reader.cpp | 308 ------ be/src/exec/arrow/parquet_reader.h | 6 - be/src/exec/base_scanner.h | 6 - be/src/exec/es/es_scroll_parser.cpp | 268 ----- be/src/exec/es/es_scroll_parser.h | 2 - be/src/exec/text_converter.h | 14 - be/src/exec/text_converter.hpp | 129 --- be/src/exprs/CMakeLists.txt | 1 - be/src/exprs/json_functions.cpp | 230 ----- be/src/exprs/json_functions.h | 34 - be/src/exprs/math_functions.cpp | 557 ---------- be/src/exprs/math_functions.h | 157 --- be/src/exprs/string_functions.cpp | 964 ------------------ be/src/exprs/string_functions.h | 154 --- be/src/olap/CMakeLists.txt | 1 + be/src/olap/delta_writer.h | 1 - be/src/{exprs => olap}/match_predicate.cpp | 2 +- be/src/{exprs => olap}/match_predicate.h | 3 - be/src/olap/memtable.h | 1 - be/src/olap/predicate_creator.h | 2 +- be/src/runtime/collection_value.cpp | 558 ---------- be/src/runtime/collection_value.h | 146 --- be/src/runtime/primitive_type.cpp | 50 - be/src/runtime/primitive_type.h | 2 - be/src/runtime/raw_value.cpp | 516 ---------- be/src/runtime/raw_value.h | 30 - be/src/runtime/result_buffer_mgr.cpp | 6 - be/src/runtime/tuple.cpp | 128 --- be/src/runtime/tuple.h | 35 - be/src/runtime/types.h | 3 - be/src/util/CMakeLists.txt | 3 - be/src/util/array_parser.cpp | 43 - be/src/util/array_parser.h | 247 ----- be/src/util/symbols_util.cpp | 310 ------ be/src/util/symbols_util.h | 72 -- be/src/util/topn_counter.cpp | 148 --- be/src/util/topn_counter.h | 172 ---- be/src/vec/core/block.cpp | 107 -- be/src/vec/core/block.h | 4 - .../exec/data_gen_functions/vnumbers_tvf.h | 1 - be/src/vec/exec/vdata_gen_scan_node.h | 1 - be/src/vec/exec/vmysql_scan_node.cpp | 13 - be/src/vec/exec/vmysql_scan_node.h | 6 - be/src/vec/functions/function_string.h | 53 +- be/test/CMakeLists.txt | 4 - be/test/exprs/json_function_test.cpp | 211 ---- be/test/exprs/math_functions_test.cpp | 288 ------ be/test/exprs/string_functions_test.cpp | 819 --------------- be/test/runtime/array_test.cpp | 873 ---------------- be/test/runtime/collection_value_test.cpp | 96 -- be/test/testutil/array_utils.cpp | 14 - be/test/testutil/array_utils.h | 2 - be/test/util/array_parser_test.cpp | 189 ---- 55 files changed, 50 insertions(+), 7952 deletions(-) rename be/src/{exprs => olap}/match_predicate.cpp (99%) rename be/src/{exprs => olap}/match_predicate.h (98%) delete mode 100644 be/src/util/array_parser.cpp delete mode 100644 be/src/util/array_parser.h delete mode 100644 be/src/util/symbols_util.cpp delete mode 100644 be/src/util/symbols_util.h delete mode 100644 be/src/util/topn_counter.cpp delete mode 100644 be/src/util/topn_counter.h delete mode 100644 be/test/exprs/math_functions_test.cpp delete mode 100644 be/test/exprs/string_functions_test.cpp delete mode 100644 be/test/runtime/array_test.cpp delete mode 100644 be/test/runtime/collection_value_test.cpp delete mode 100644 be/test/util/array_parser_test.cpp diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index 1498f41488..6b150cfefc 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -23,9 +23,7 @@ #include "common/config.h" #include "common/logging.h" -#include "exprs/json_functions.h" #include "exprs/like_predicate.h" -#include "exprs/match_predicate.h" #include "exprs/math_functions.h" #include "exprs/string_functions.h" #include "geo/geo_functions.h" @@ -356,11 +354,7 @@ void Daemon::init(int argc, char** argv, const std::vector& paths) { MemInfo::init(); UserFunctionCache::instance()->init(config::user_function_dir); LikePredicate::init(); - StringFunctions::init(); - MathFunctions::init(); - JsonFunctions::init(); GeoFunctions::init(); - MatchPredicate::init(); LOG(INFO) << CpuInfo::debug_string(); LOG(INFO) << DiskInfo::debug_string(); diff --git a/be/src/exec/arrow/arrow_reader.h b/be/src/exec/arrow/arrow_reader.h index 7fce8f0925..b5e5aa22d6 100644 --- a/be/src/exec/arrow/arrow_reader.h +++ b/be/src/exec/arrow/arrow_reader.h @@ -44,7 +44,6 @@ class ExecEnv; class TBrokerRangeDesc; class TNetworkAddress; class RuntimeState; -class Tuple; class SlotDescriptor; class MemPool; class FileReader; @@ -84,10 +83,7 @@ public: virtual ~ArrowReaderWrap(); virtual Status init_reader(const TupleDescriptor* tuple_desc, const std::string& timezone) = 0; - // for row - virtual Status read(Tuple* tuple, MemPool* mem_pool, bool* eof) { - return Status::NotSupported("Not Implemented read"); - } + // for vec Status get_next_block(vectorized::Block* block, size_t* read_row, bool* eof) override; // This method should be deprecated once the old scanner is removed. diff --git a/be/src/exec/arrow/parquet_reader.cpp b/be/src/exec/arrow/parquet_reader.cpp index ba2fe9098f..0a376f2deb 100644 --- a/be/src/exec/arrow/parquet_reader.cpp +++ b/be/src/exec/arrow/parquet_reader.cpp @@ -118,32 +118,6 @@ Status ParquetReaderWrap::size(int64_t* size) { } } -// TODO: NEED TO REWRITE COMPLETELY. the way writing now is WRONG. -// StringRef shouldn't managing exclusive memory cause it will break RAII. -// besides, accessing object which is essentially const by non-const object -// is UB! -inline void ParquetReaderWrap::fill_slot(Tuple* tuple, SlotDescriptor* slot_desc, MemPool* mem_pool, - const uint8_t* value, int32_t len) { - tuple->set_not_null(slot_desc->null_indicator_offset()); - void* slot = tuple->get_slot(slot_desc->tuple_offset()); - StringRef* str_slot = reinterpret_cast(slot); - str_slot->data = reinterpret_cast(mem_pool->allocate(len)); - memcpy(const_cast(str_slot->data), value, len); // ! - str_slot->size = len; -} - -inline Status ParquetReaderWrap::set_field_null(Tuple* tuple, const SlotDescriptor* slot_desc) { - if (!slot_desc->is_nullable()) { - std::stringstream str_error; - str_error << "The field name(" << slot_desc->col_name() - << ") is not allowed null, but Parquet field is null."; - LOG(WARNING) << str_error.str(); - return Status::RuntimeError(str_error.str()); - } - tuple->set_null(slot_desc->null_indicator_offset()); - return Status::OK(); -} - Status ParquetReaderWrap::read_record_batch(bool* eof) { if (_current_line_of_group >= _rows_of_group) { // read next row group VLOG_DEBUG << "read_record_batch, current group id:" << _current_group @@ -233,288 +207,6 @@ Status ParquetReaderWrap::init_parquet_type() { return Status::OK(); } -Status ParquetReaderWrap::read(Tuple* tuple, MemPool* mem_pool, bool* eof) { - if (_batch == nullptr) { - _current_line_of_group += _rows_of_group; - return read_record_batch(eof); - } - uint8_t tmp_buf[128] = {0}; - int32_t wbytes = 0; - const uint8_t* value = nullptr; - int column_index = 0; - try { - size_t slots = _include_column_ids.size(); - for (size_t i = 0; i < slots; ++i) { - auto slot_desc = _file_slot_descs[i]; - column_index = i; // column index in batch record - switch (_parquet_column_type[i]) { - case arrow::Type::type::STRING: { - auto str_array = - std::static_pointer_cast(_batch->column(column_index)); - if (str_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - value = str_array->GetValue(_current_line_of_batch, &wbytes); - fill_slot(tuple, slot_desc, mem_pool, value, wbytes); - } - break; - } - case arrow::Type::type::INT32: { - auto int32_array = - std::static_pointer_cast(_batch->column(column_index)); - if (int32_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - int32_t value = int32_array->Value(_current_line_of_batch); - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%d", value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::INT64: { - auto int64_array = - std::static_pointer_cast(_batch->column(column_index)); - if (int64_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - int64_t value = int64_array->Value(_current_line_of_batch); - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%" PRId64, value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::UINT32: { - auto uint32_array = - std::static_pointer_cast(_batch->column(column_index)); - if (uint32_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - uint32_t value = uint32_array->Value(_current_line_of_batch); - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%u", value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::UINT64: { - auto uint64_array = - std::static_pointer_cast(_batch->column(column_index)); - if (uint64_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - uint64_t value = uint64_array->Value(_current_line_of_batch); - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%" PRIu64, value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::BINARY: { - auto str_array = - std::static_pointer_cast(_batch->column(column_index)); - if (str_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - value = str_array->GetValue(_current_line_of_batch, &wbytes); - fill_slot(tuple, slot_desc, mem_pool, value, wbytes); - } - break; - } - case arrow::Type::type::FIXED_SIZE_BINARY: { - auto fixed_array = std::static_pointer_cast( - _batch->column(column_index)); - if (fixed_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - std::string value = fixed_array->GetString(_current_line_of_batch); - fill_slot(tuple, slot_desc, mem_pool, (uint8_t*)value.c_str(), value.length()); - } - break; - } - case arrow::Type::type::BOOL: { - auto boolean_array = - std::static_pointer_cast(_batch->column(column_index)); - if (boolean_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - bool value = boolean_array->Value(_current_line_of_batch); - if (value) { - fill_slot(tuple, slot_desc, mem_pool, (uint8_t*)"true", 4); - } else { - fill_slot(tuple, slot_desc, mem_pool, (uint8_t*)"false", 5); - } - } - break; - } - case arrow::Type::type::UINT8: { - auto uint8_array = - std::static_pointer_cast(_batch->column(column_index)); - if (uint8_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - uint8_t value = uint8_array->Value(_current_line_of_batch); - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%d", value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::INT8: { - auto int8_array = - std::static_pointer_cast(_batch->column(column_index)); - if (int8_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - int8_t value = int8_array->Value(_current_line_of_batch); - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%d", value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::UINT16: { - auto uint16_array = - std::static_pointer_cast(_batch->column(column_index)); - if (uint16_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - uint16_t value = uint16_array->Value(_current_line_of_batch); - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%d", value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::INT16: { - auto int16_array = - std::static_pointer_cast(_batch->column(column_index)); - if (int16_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - int16_t value = int16_array->Value(_current_line_of_batch); - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%d", value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::HALF_FLOAT: { - auto half_float_array = std::static_pointer_cast( - _batch->column(column_index)); - if (half_float_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - float value = half_float_array->Value(_current_line_of_batch); - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%f", value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::FLOAT: { - auto float_array = - std::static_pointer_cast(_batch->column(column_index)); - if (float_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - float value = float_array->Value(_current_line_of_batch); - // Because the decimal type currently only supports (27, 9). - // Therefore, we use %.9f to give priority to the progress of the decimal type. - // Cannot use %f directly, this will cause 4000.9 to be converted to 4000.8999 - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%.9f", value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::DOUBLE: { - auto double_array = - std::static_pointer_cast(_batch->column(column_index)); - if (double_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - double value = double_array->Value(_current_line_of_batch); - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%.9f", value); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::TIMESTAMP: { - auto ts_array = std::static_pointer_cast( - _batch->column(column_index)); - if (ts_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - RETURN_IF_ERROR(handle_timestamp(ts_array, tmp_buf, - &wbytes)); // convert timestamp to string time - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::DECIMAL: { - auto decimal_array = - std::static_pointer_cast(_batch->column(column_index)); - if (decimal_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - std::string value = decimal_array->FormatValue(_current_line_of_batch); - fill_slot(tuple, slot_desc, mem_pool, (const uint8_t*)value.c_str(), - value.length()); - } - break; - } - case arrow::Type::type::DATE32: { - auto ts_array = - std::static_pointer_cast(_batch->column(column_index)); - if (ts_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - time_t timestamp = (time_t)((int64_t)ts_array->Value(_current_line_of_batch) * - 24 * 60 * 60); - struct tm local; - localtime_r(×tamp, &local); - char* to = reinterpret_cast(&tmp_buf); - wbytes = (uint32_t)strftime(to, 64, "%Y-%m-%d", &local); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - case arrow::Type::type::DATE64: { - auto ts_array = - std::static_pointer_cast(_batch->column(column_index)); - if (ts_array->IsNull(_current_line_of_batch)) { - RETURN_IF_ERROR(set_field_null(tuple, slot_desc)); - } else { - // convert milliseconds to seconds - time_t timestamp = - (time_t)((int64_t)ts_array->Value(_current_line_of_batch) / 1000); - struct tm local; - localtime_r(×tamp, &local); - char* to = reinterpret_cast(&tmp_buf); - wbytes = (uint32_t)strftime(to, 64, "%Y-%m-%d %H:%M:%S", &local); - fill_slot(tuple, slot_desc, mem_pool, tmp_buf, wbytes); - } - break; - } - default: { - // other type not support. - std::stringstream str_error; - str_error << "The field name(" << slot_desc->col_name() << "), type(" - << _parquet_column_type[i] - << ") not support. RowGroup: " << _current_group - << ", Row: " << _current_line_of_group - << ", ColumnIndex:" << column_index; - LOG(WARNING) << str_error.str(); - return Status::InternalError(str_error.str()); - } - } - } - } catch (parquet::ParquetException& e) { - std::stringstream str_error; - str_error << e.what() << " RowGroup:" << _current_group - << ", Row:" << _current_line_of_group << ", ColumnIndex " << column_index; - LOG(WARNING) << str_error.str(); - return Status::InternalError(str_error.str()); - } - - // update data value - ++_current_line_of_group; - ++_current_line_of_batch; - return read_record_batch(eof); -} - Status ParquetReaderWrap::read_next_batch() { std::unique_lock lock(_mtx); while (!_closed && _queue.empty()) { diff --git a/be/src/exec/arrow/parquet_reader.h b/be/src/exec/arrow/parquet_reader.h index 3d8bb0d36a..4de5b5167c 100644 --- a/be/src/exec/arrow/parquet_reader.h +++ b/be/src/exec/arrow/parquet_reader.h @@ -51,7 +51,6 @@ class ExecEnv; class TBrokerRangeDesc; class TNetworkAddress; class RuntimeState; -class Tuple; class SlotDescriptor; class MemPool; class FileReader; @@ -66,16 +65,11 @@ public: int64_t range_start_offset, int64_t range_size, bool case_sensitive = true); ~ParquetReaderWrap() override = default; - // Read - Status read(Tuple* tuple, MemPool* mem_pool, bool* eof) override; Status size(int64_t* size) override; Status init_reader(const TupleDescriptor* tuple_desc, const std::string& timezone) override; Status init_parquet_type(); private: - void fill_slot(Tuple* tuple, SlotDescriptor* slot_desc, MemPool* mem_pool, const uint8_t* value, - int32_t len); - Status set_field_null(Tuple* tuple, const SlotDescriptor* slot_desc); Status read_record_batch(bool* eof); Status handle_timestamp(const std::shared_ptr& ts_array, uint8_t* buf, int32_t* wbtyes); diff --git a/be/src/exec/base_scanner.h b/be/src/exec/base_scanner.h index 0be92f9437..8706c1b9c9 100644 --- a/be/src/exec/base_scanner.h +++ b/be/src/exec/base_scanner.h @@ -25,7 +25,6 @@ namespace doris { -class Tuple; class TupleDescriptor; class RowDescriptor; class RuntimeState; @@ -59,11 +58,6 @@ public: // Open this scanner, will initialize information need to virtual Status open(); - // Get next tuple - virtual Status get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool* fill_tuple) { - return Status::NotSupported("Not Implemented get block"); - } - // Get next block virtual Status get_next(vectorized::Block* block, bool* eof) { return Status::NotSupported("Not Implemented get block"); diff --git a/be/src/exec/es/es_scroll_parser.cpp b/be/src/exec/es/es_scroll_parser.cpp index 46eced5d08..6c6a5240f5 100644 --- a/be/src/exec/es/es_scroll_parser.cpp +++ b/be/src/exec/es/es_scroll_parser.cpp @@ -302,274 +302,6 @@ const std::string& ScrollParser::get_scroll_id() { return _scroll_id; } -Status ScrollParser::fill_tuple(const TupleDescriptor* tuple_desc, Tuple* tuple, - MemPool* tuple_pool, bool* line_eof, - const std::map& docvalue_context) { - *line_eof = true; - - if (_size <= 0 || _line_index >= _size) { - return Status::OK(); - } - - const rapidjson::Value& obj = _inner_hits_node[_line_index++]; - bool pure_doc_value = false; - if (obj.HasMember("fields")) { - pure_doc_value = true; - } - const rapidjson::Value& line = obj.HasMember(FIELD_SOURCE) ? obj[FIELD_SOURCE] : obj["fields"]; - - tuple->init(tuple_desc->byte_size()); - for (int i = 0; i < tuple_desc->slots().size(); ++i) { - const SlotDescriptor* slot_desc = tuple_desc->slots()[i]; - - if (!slot_desc->is_materialized()) { - continue; - } - // _id field must exist in every document, this is guaranteed by ES - // if _id was found in tuple, we would get `_id` value from inner-hit node - // json-format response would like below: - // "hits": { - // "hits": [ - // { - // "_id": "UhHNc3IB8XwmcbhBk1ES", - // "_source": { - // "k": 201, - // } - // } - // ] - // } - if (slot_desc->col_name() == FIELD_ID) { - // actually this branch will not be reached, this is guaranteed by Doris FE. - if (pure_doc_value) { - return Status::RuntimeError("obtain `_id` is not supported in doc_values mode"); - } - tuple->set_not_null(slot_desc->null_indicator_offset()); - void* slot = tuple->get_slot(slot_desc->tuple_offset()); - // obj[FIELD_ID] must not be nullptr - std::string _id = obj[FIELD_ID].GetString(); - size_t len = _id.length(); - char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(len)); - if (UNLIKELY(buffer == nullptr)) { - std::string details = strings::Substitute(ERROR_MEM_LIMIT_EXCEEDED, - "MaterializeNextRow", len, "string slot"); - RETURN_LIMIT_EXCEEDED(nullptr, details, len); - } - memcpy(buffer, _id.data(), len); - reinterpret_cast(slot)->data = buffer; - reinterpret_cast(slot)->size = len; - continue; - } - - // if pure_doc_value enabled, docvalue_context must contains the key - // todo: need move all `pure_docvalue` for every tuple outside fill_tuple - // should check pure_docvalue for one table scan not every tuple - const char* col_name = pure_doc_value ? docvalue_context.at(slot_desc->col_name()).c_str() - : slot_desc->col_name().c_str(); - - rapidjson::Value::ConstMemberIterator itr = line.FindMember(col_name); - if (itr == line.MemberEnd()) { - tuple->set_null(slot_desc->null_indicator_offset()); - continue; - } - - tuple->set_not_null(slot_desc->null_indicator_offset()); - const rapidjson::Value& col = line[col_name]; - - void* slot = tuple->get_slot(slot_desc->tuple_offset()); - PrimitiveType type = slot_desc->type().type; - - // when the column value is null, the subsequent type casting will report an error - if (col.IsNull()) { - slot = nullptr; - continue; - } - switch (type) { - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_STRING: { - // sometimes elasticsearch user post some not-string value to Elasticsearch Index. - // because of reading value from _source, we can not process all json type and then just transfer the value to original string representation - // this may be a tricky, but we can work around this issue - std::string val; - if (pure_doc_value) { - if (!col[0].IsString()) { - val = json_value_to_string(col[0]); - } else { - val = col[0].GetString(); - } - } else { - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); - if (!col.IsString()) { - val = json_value_to_string(col); - } else { - val = col.GetString(); - } - } - size_t val_size = val.length(); - char* buffer = reinterpret_cast(tuple_pool->try_allocate_unaligned(val_size)); - if (UNLIKELY(buffer == nullptr)) { - std::string details = strings::Substitute( - ERROR_MEM_LIMIT_EXCEEDED, "MaterializeNextRow", val_size, "string slot"); - RETURN_LIMIT_EXCEEDED(nullptr, details, val_size); - } - memcpy(buffer, val.data(), val_size); - reinterpret_cast(slot)->data = buffer; - reinterpret_cast(slot)->size = val_size; - break; - } - - case TYPE_TINYINT: { - Status status = get_int_value(col, type, slot, pure_doc_value); - if (!status.ok()) { - return status; - } - break; - } - - case TYPE_SMALLINT: { - Status status = get_int_value(col, type, slot, pure_doc_value); - if (!status.ok()) { - return status; - } - break; - } - - case TYPE_INT: { - Status status = get_int_value(col, type, slot, pure_doc_value); - if (!status.ok()) { - return status; - } - break; - } - - case TYPE_BIGINT: { - Status status = get_int_value(col, type, slot, pure_doc_value); - if (!status.ok()) { - return status; - } - break; - } - - case TYPE_LARGEINT: { - Status status = get_int_value<__int128>(col, type, slot, pure_doc_value); - if (!status.ok()) { - return status; - } - break; - } - - case TYPE_DOUBLE: { - Status status = get_float_value(col, type, slot, pure_doc_value); - if (!status.ok()) { - return status; - } - break; - } - - case TYPE_FLOAT: { - Status status = get_float_value(col, type, slot, pure_doc_value); - if (!status.ok()) { - return status; - } - break; - } - - case TYPE_BOOLEAN: { - if (col.IsBool()) { - *reinterpret_cast(slot) = col.GetBool(); - break; - } - - if (col.IsNumber()) { - *reinterpret_cast(slot) = col.GetInt(); - break; - } - - bool is_nested_str = false; - if (pure_doc_value && col.IsArray() && col[0].IsBool()) { - *reinterpret_cast(slot) = col[0].GetBool(); - break; - } else if (pure_doc_value && col.IsArray() && col[0].IsString()) { - is_nested_str = true; - } else if (pure_doc_value && col.IsArray()) { - return Status::InternalError(ERROR_INVALID_COL_DATA, "BOOLEAN"); - } - - const rapidjson::Value& str_col = is_nested_str ? col[0] : col; - const std::string& val = str_col.GetString(); - size_t val_size = str_col.GetStringLength(); - StringParser::ParseResult result; - bool b = StringParser::string_to_bool(val.c_str(), val_size, &result); - RETURN_ERROR_IF_PARSING_FAILED(result, str_col, type); - *reinterpret_cast(slot) = b; - break; - } - case TYPE_DECIMALV2: { - DecimalV2Value data; - - if (col.IsDouble()) { - data.assign_from_double(col.GetDouble()); - } else { - std::string val; - if (pure_doc_value) { - if (!col[0].IsString()) { - val = json_value_to_string(col[0]); - } else { - val = col[0].GetString(); - } - } else { - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); - if (!col.IsString()) { - val = json_value_to_string(col); - } else { - val = col.GetString(); - } - } - data.parse_from_str(val.data(), val.length()); - } - reinterpret_cast(slot)->set_value(data.value()); - break; - } - - case TYPE_DATE: - case TYPE_DATETIME: { - // this would happend just only when `enable_docvalue_scan = false`, and field has timestamp format date from _source - if (col.IsNumber()) { - // ES process date/datetime field would use millisecond timestamp for index or docvalue - // processing date type field, if a number is encountered, Doris On ES will force it to be processed according to ms - // Doris On ES needs to be consistent with ES, so just divided by 1000 because the unit for from_unixtime is seconds - RETURN_IF_ERROR(fill_date_slot_with_timestamp(slot, col, type)); - } else if (col.IsArray() && pure_doc_value) { - // this would happened just only when `enable_docvalue_scan = true` - // ES add default format for all field after ES 6.4, if we not provided format for `date` field ES would impose - // a standard date-format for date field as `2020-06-16T00:00:00.000Z` - // At present, we just process this string format date. After some PR were merged into Doris, we would impose `epoch_mills` for - // date field's docvalue - if (col[0].IsString()) { - RETURN_IF_ERROR(fill_date_slot_with_strval(slot, col[0], type)); - break; - } - // ES would return millisecond timestamp for date field, divided by 1000 because the unit for from_unixtime is seconds - RETURN_IF_ERROR(fill_date_slot_with_timestamp(slot, col[0], type)); - } else { - // this would happened just only when `enable_docvalue_scan = false`, and field has string format date from _source - RETURN_ERROR_IF_COL_IS_ARRAY(col, type); - RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type); - RETURN_IF_ERROR(fill_date_slot_with_strval(slot, col, type)); - } - break; - } - default: { - DCHECK(false); - break; - } - } - } - - *line_eof = false; - return Status::OK(); -} - Status ScrollParser::fill_columns(const TupleDescriptor* tuple_desc, std::vector& columns, MemPool* tuple_pool, bool* line_eof, diff --git a/be/src/exec/es/es_scroll_parser.h b/be/src/exec/es/es_scroll_parser.h index f3c7af70fc..40d421a924 100644 --- a/be/src/exec/es/es_scroll_parser.h +++ b/be/src/exec/es/es_scroll_parser.h @@ -34,8 +34,6 @@ public: ~ScrollParser(); Status parse(const std::string& scroll_result, bool exactly_once = false); - Status fill_tuple(const TupleDescriptor* _tuple_desc, Tuple* tuple, MemPool* mem_pool, - bool* line_eof, const std::map& docvalue_context); Status fill_columns(const TupleDescriptor* _tuple_desc, std::vector& columns, MemPool* mem_pool, bool* line_eof, const std::map& docvalue_context); diff --git a/be/src/exec/text_converter.h b/be/src/exec/text_converter.h index 69c27e2c29..b8aa8fb5ba 100644 --- a/be/src/exec/text_converter.h +++ b/be/src/exec/text_converter.h @@ -24,7 +24,6 @@ class MemPool; class SlotDescriptor; class Status; struct StringRef; -class Tuple; class TupleDescriptor; // Helper class for dealing with text data, e.g., converting text data to @@ -33,19 +32,6 @@ class TextConverter { public: TextConverter(char escape_char); - // Converts slot data, of length 'len', into type of slot_desc, - // and writes the result into the tuples's slot. - // copy_string indicates whether we need to make a separate copy of the string data: - // For regular unescaped strings, we point to the original data in the _file_buf. - // For regular escaped strings, we copy an its unescaped string into a separate buffer - // and point to it. - // If the string needs to be copied, the memory is allocated from 'pool', otherwise - // 'pool' is unused. - // Unsuccessful conversions are turned into NULLs. - // Returns true if the value was written successfully. - bool write_slot(const SlotDescriptor* slot_desc, Tuple* tuple, const char* data, int len, - bool copy_string, bool need_escape, MemPool* pool); - void write_string_column(const SlotDescriptor* slot_desc, vectorized::MutableColumnPtr* column_ptr, const char* data, size_t len); diff --git a/be/src/exec/text_converter.hpp b/be/src/exec/text_converter.hpp index b6add9183f..d25b19340f 100644 --- a/be/src/exec/text_converter.hpp +++ b/be/src/exec/text_converter.hpp @@ -36,135 +36,6 @@ namespace doris { -// Note: this function has a codegen'd version. Changing this function requires -// corresponding changes to CodegenWriteSlot. -inline bool TextConverter::write_slot(const SlotDescriptor* slot_desc, Tuple* tuple, - const char* data, int len, bool copy_string, bool need_escape, - MemPool* pool) { - //Small batch import only \N is considered to be NULL, there is no replace_value function for batch import - if (slot_desc->is_nullable()) { - if (len == 2 && data[0] == '\\' && data[1] == 'N') { - tuple->set_null(slot_desc->null_indicator_offset()); - return true; - } else { - tuple->set_not_null(slot_desc->null_indicator_offset()); - } - } - - StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; - void* slot = tuple->get_slot(slot_desc->tuple_offset()); - - // Parse the raw-text data. Translate the text string to internal format. - switch (slot_desc->type().type) { - case TYPE_HLL: - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_STRING: { - StringRef* str_slot = reinterpret_cast(slot); - str_slot->data = const_cast(data); - str_slot->size = len; - if (len != 0 && (copy_string || need_escape)) { - DCHECK(pool != nullptr); - char* slot_data = reinterpret_cast(pool->allocate(len)); - - if (need_escape) { - unescape_string(data, slot_data, &str_slot->size); - } else { - memcpy(slot_data, data, str_slot->size); - } - - str_slot->data = slot_data; - } - - break; - } - - case TYPE_BOOLEAN: - *reinterpret_cast(slot) = StringParser::string_to_bool(data, len, &parse_result); - break; - - case TYPE_TINYINT: - *reinterpret_cast(slot) = - StringParser::string_to_int(data, len, &parse_result); - break; - - case TYPE_SMALLINT: - *reinterpret_cast(slot) = - StringParser::string_to_int(data, len, &parse_result); - break; - - case TYPE_INT: - *reinterpret_cast(slot) = - StringParser::string_to_int(data, len, &parse_result); - break; - - case TYPE_BIGINT: - *reinterpret_cast(slot) = - StringParser::string_to_int(data, len, &parse_result); - break; - - case TYPE_LARGEINT: { - __int128 tmp = StringParser::string_to_int<__int128>(data, len, &parse_result); - memcpy(slot, &tmp, sizeof(tmp)); - break; - } - - case TYPE_FLOAT: - *reinterpret_cast(slot) = - StringParser::string_to_float(data, len, &parse_result); - break; - - case TYPE_DOUBLE: - *reinterpret_cast(slot) = - StringParser::string_to_float(data, len, &parse_result); - break; - - case TYPE_DATE: { - DateTimeValue* ts_slot = reinterpret_cast(slot); - if (!ts_slot->from_date_str(data, len)) { - parse_result = StringParser::PARSE_FAILURE; - break; - } - - ts_slot->cast_to_date(); - break; - } - - case TYPE_DATETIME: { - DateTimeValue* ts_slot = reinterpret_cast(slot); - if (!ts_slot->from_date_str(data, len)) { - parse_result = StringParser::PARSE_FAILURE; - } - - ts_slot->to_datetime(); - break; - } - - case TYPE_DECIMALV2: { - DecimalV2Value decimal_slot; - - if (decimal_slot.parse_from_str(data, len)) { - parse_result = StringParser::PARSE_FAILURE; - } - - *reinterpret_cast(slot) = decimal_slot.value(); - break; - } - - default: - DCHECK(false) << "bad slot type: " << slot_desc->type(); - break; - } - - // TODO: add warning for overflow case - if (parse_result != StringParser::PARSE_SUCCESS) { - tuple->set_null(slot_desc->null_indicator_offset()); - return false; - } - - return true; -} - inline void TextConverter::write_string_column(const SlotDescriptor* slot_desc, vectorized::MutableColumnPtr* column_ptr, const char* data, size_t len) { diff --git a/be/src/exprs/CMakeLists.txt b/be/src/exprs/CMakeLists.txt index 7f18934f58..fb16f804e9 100644 --- a/be/src/exprs/CMakeLists.txt +++ b/be/src/exprs/CMakeLists.txt @@ -28,7 +28,6 @@ add_library(Exprs runtime_filter.cpp runtime_filter_rpc.cpp like_predicate.cpp - match_predicate.cpp math_functions.cpp rpc_fn_comm.cpp string_functions.cpp diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp index 98756a4d8b..8dea4f1775 100644 --- a/be/src/exprs/json_functions.cpp +++ b/be/src/exprs/json_functions.cpp @@ -44,114 +44,6 @@ namespace doris { // json path cannot contains: ", [, ] static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?"); -void JsonFunctions::init() {} - -IntVal JsonFunctions::get_json_int(FunctionContext* context, const StringVal& json_str, - const StringVal& path) { - if (json_str.is_null || path.is_null) { - return IntVal::null(); - } - std::string_view json_string((char*)json_str.ptr, json_str.len); - std::string_view path_string((char*)path.ptr, path.len); - rapidjson::Document document; - rapidjson::Value* root = - get_json_object(context, json_string, path_string, JSON_FUN_INT, &document); - if (root != nullptr && root->IsInt()) { - return IntVal(root->GetInt()); - } else { - return IntVal::null(); - } -} - -StringVal JsonFunctions::get_json_string(FunctionContext* context, const StringVal& json_str, - const StringVal& path) { - if (json_str.is_null || path.is_null) { - return StringVal::null(); - } - - std::string_view json_string((char*)json_str.ptr, json_str.len); - std::string_view path_string((char*)path.ptr, path.len); - rapidjson::Document document; - rapidjson::Value* root = - get_json_object(context, json_string, path_string, JSON_FUN_STRING, &document); - if (root == nullptr || root->IsNull()) { - return StringVal::null(); - } else if (root->IsString()) { - return AnyValUtil::from_string_temp(context, root->GetString()); - } else { - rapidjson::StringBuffer buf; - rapidjson::Writer writer(buf); - root->Accept(writer); - return AnyValUtil::from_string_temp(context, std::string(buf.GetString())); - } -} - -DoubleVal JsonFunctions::get_json_double(FunctionContext* context, const StringVal& json_str, - const StringVal& path) { - if (json_str.is_null || path.is_null) { - return DoubleVal::null(); - } - std::string_view json_string((char*)json_str.ptr, json_str.len); - std::string_view path_string((char*)path.ptr, path.len); - rapidjson::Document document; - rapidjson::Value* root = - get_json_object(context, json_string, path_string, JSON_FUN_DOUBLE, &document); - if (root == nullptr || root->IsNull()) { - return DoubleVal::null(); - } else if (root->IsInt()) { - return DoubleVal(static_cast(root->GetInt())); - } else if (root->IsDouble()) { - return DoubleVal(root->GetDouble()); - } else { - return DoubleVal::null(); - } -} - -StringVal JsonFunctions::json_array(FunctionContext* context, int num_args, - const StringVal* json_str) { - if (json_str->is_null) { - return StringVal::null(); - } - rapidjson::Value array_obj(rapidjson::kArrayType); - rapidjson::Document document; - rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); - //flag: The number it contains represents the type of previous parameters - const StringVal& flag = json_str[num_args - 1]; - DCHECK_EQ(num_args - 1, flag.len); - for (int i = 0; i < num_args - 1; ++i) { - const StringVal& arg = json_str[i]; - rapidjson::Value val = parse_str_with_flag(arg, flag, i, allocator); - array_obj.PushBack(val, allocator); - } - rapidjson::StringBuffer buf; - rapidjson::Writer writer(buf); - array_obj.Accept(writer); - return AnyValUtil::from_string_temp(context, std::string(buf.GetString())); -} - -StringVal JsonFunctions::json_object(FunctionContext* context, int num_args, - const StringVal* json_str) { - if (json_str->is_null) { - return StringVal::null(); - } - rapidjson::Document document(rapidjson::kObjectType); - rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); - const StringVal& flag = json_str[num_args - 1]; - document.SetObject(); - DCHECK_EQ(num_args - 1, flag.len); - for (int i = 1; i < num_args - 1; i = i + 2) { - const StringVal& arg = json_str[i]; - rapidjson::Value key(rapidjson::kStringType); - key.SetString((char*)json_str[i - 1].ptr, json_str[i - 1].len, allocator); - rapidjson::Value val = parse_str_with_flag(arg, flag, i, allocator); - document.AddMember(key, val, allocator); - } - rapidjson::StringBuffer buf; - rapidjson::Writer writer(buf); - document.Accept(writer); - return AnyValUtil::from_string_temp(context, std::string(buf.GetString())); -} - rapidjson::Value JsonFunctions::parse_str_with_flag(const StringVal& arg, const StringVal& flag, const int num, rapidjson::Document::AllocatorType& allocator) { @@ -185,17 +77,6 @@ rapidjson::Value JsonFunctions::parse_str_with_flag(const StringVal& arg, const } return val; } -StringVal JsonFunctions::json_quote(FunctionContext* context, const StringVal& json_str) { - if (json_str.is_null) { - return StringVal::null(); - } - rapidjson::Value array_obj(rapidjson::kObjectType); - array_obj.SetString(rapidjson::StringRef((char*)json_str.ptr, json_str.len)); - rapidjson::StringBuffer buf; - rapidjson::Writer writer(buf); - array_obj.Accept(writer); - return AnyValUtil::from_string_temp(context, std::string(buf.GetString())); -} rapidjson::Value* JsonFunctions::match_value(const std::vector& parsed_paths, rapidjson::Value* document, @@ -298,74 +179,6 @@ rapidjson::Value* JsonFunctions::match_value(const std::vector& parsed return root; } -rapidjson::Value* JsonFunctions::get_json_object(FunctionContext* context, - std::string_view json_string, - std::string_view path_string, - const JsonFunctionType& fntype, - rapidjson::Document* document) { - // split path by ".", and escape quota by "\" - // eg: - // '$.text#abc.xyz' -> [$, text#abc, xyz] - // '$."text.abc".xyz' -> [$, text.abc, xyz] - // '$."text.abc"[1].xyz' -> [$, text.abc[1], xyz] - JsonState* json_state = nullptr; - JsonState tmp_json_state; - -#ifndef BE_TEST - json_state = reinterpret_cast( - context->get_function_state(FunctionContext::FRAGMENT_LOCAL)); - if (json_state == nullptr) { - json_state = &tmp_json_state; - } - - if (json_state->json_paths.size() == 0) { -#ifdef USE_LIBCPP - std::string s(path_string); - auto tok = get_json_token(s); -#else - auto tok = get_json_token(path_string); -#endif - std::vector paths(tok.begin(), tok.end()); - get_parsed_paths(paths, &json_state->json_paths); - } -#else - json_state = &tmp_json_state; - std::string s(path_string); - auto tok = get_json_token(s); - std::vector paths(tok.begin(), tok.end()); - get_parsed_paths(paths, &json_state->json_paths); -#endif - - VLOG_TRACE << "first parsed path: " << json_state->json_paths[0].debug_string(); - - if (!json_state->json_paths[0].is_valid) { - return document; - } - - if (UNLIKELY(json_state->json_paths.size() == 1)) { - if (fntype == JSON_FUN_STRING) { - document->SetString(json_string.data(), json_string.length(), document->GetAllocator()); - } else { - return document; - } - } - - if (!json_state->document.IsNull()) { - document = &json_state->document; - } else { - document->Parse(json_string.data(), json_string.length()); - //rapidjson::Document document; - if (UNLIKELY(document->HasParseError())) { - VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": " - << GetParseError_En(document->GetParseError()); - document->SetNull(); - return document; - } - } - - return match_value(json_state->json_paths, document, document->GetAllocator()); -} - rapidjson::Value* JsonFunctions::get_json_array_from_parsed_json( const std::string& json_path, rapidjson::Value* document, rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly) { @@ -426,49 +239,6 @@ rapidjson::Value* JsonFunctions::get_json_object_from_parsed_json( return root; } -void JsonFunctions::json_path_prepare(doris_udf::FunctionContext* context, - doris_udf::FunctionContext::FunctionStateScope scope) { - if (scope != FunctionContext::FRAGMENT_LOCAL) { - return; - } - - if (!context->is_arg_constant(0) && !context->is_arg_constant(1)) { - return; - } - - JsonState* json_state = new JsonState; - - StringVal* json_str = reinterpret_cast(context->get_constant_arg(0)); - if (json_str != nullptr && !json_str->is_null) { - std::string json_string((char*)json_str->ptr, json_str->len); - json_state->document.Parse(json_string.c_str()); - } - StringVal* path = reinterpret_cast(context->get_constant_arg(1)); - if (path != nullptr && !path->is_null) { - std::string path_str(reinterpret_cast(path->ptr), path->len); - boost::tokenizer> tok( - path_str, boost::escaped_list_separator("\\", ".", "\"")); - std::vector path_exprs(tok.begin(), tok.end()); - get_parsed_paths(path_exprs, &json_state->json_paths); - } - - context->set_function_state(scope, json_state); - VLOG_TRACE << "prepare json path. size: " << json_state->json_paths.size(); -} - -void JsonFunctions::json_path_close(doris_udf::FunctionContext* context, - doris_udf::FunctionContext::FunctionStateScope scope) { - if (scope != FunctionContext::FRAGMENT_LOCAL) { - return; - } - - JsonState* json_state = reinterpret_cast(context->get_function_state(scope)); - if (json_state != nullptr) { - delete json_state; - VLOG_TRACE << "close json state"; - } -} - void JsonFunctions::parse_json_paths(const std::string& path_string, std::vector* parsed_paths) { // split path by ".", and escape quota by "\" diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h index 61b3e8d0db..71dcf8ec55 100644 --- a/be/src/exprs/json_functions.h +++ b/be/src/exprs/json_functions.h @@ -88,36 +88,8 @@ struct JsonPath { } }; -struct JsonState { - std::vector json_paths; - rapidjson::Document document; -}; - class JsonFunctions { public: - static void init(); - static doris_udf::IntVal get_json_int(doris_udf::FunctionContext* context, - const doris_udf::StringVal& json_str, - const doris_udf::StringVal& path); - static doris_udf::StringVal get_json_string(doris_udf::FunctionContext* context, - const doris_udf::StringVal& json_str, - const doris_udf::StringVal& path); - static doris_udf::DoubleVal get_json_double(doris_udf::FunctionContext* context, - const doris_udf::StringVal& json_str, - const doris_udf::StringVal& path); - - static rapidjson::Value* get_json_object(FunctionContext* context, std::string_view json_string, - std::string_view path_string, - const JsonFunctionType& fntype, - rapidjson::Document* document); - - static doris_udf::StringVal json_array(doris_udf::FunctionContext* context, int num_args, - const doris_udf::StringVal* json_str); - static doris_udf::StringVal json_object(doris_udf::FunctionContext* context, int num_args, - const doris_udf::StringVal* json_str); - static doris_udf::StringVal json_quote(doris_udf::FunctionContext* context, - const doris_udf::StringVal& json_str); - /** * The `document` parameter must be has parsed. * return Value Is Array object @@ -137,12 +109,6 @@ public: const std::vector& parsed_paths, rapidjson::Value* document, rapidjson::Document::AllocatorType& mem_allocator); - static void json_path_prepare(doris_udf::FunctionContext*, - doris_udf::FunctionContext::FunctionStateScope); - - static void json_path_close(doris_udf::FunctionContext*, - doris_udf::FunctionContext::FunctionStateScope); - static void parse_json_paths(const std::string& path_strings, std::vector* parsed_paths); diff --git a/be/src/exprs/math_functions.cpp b/be/src/exprs/math_functions.cpp index 6eacf5fe6e..94de1353e6 100644 --- a/be/src/exprs/math_functions.cpp +++ b/be/src/exprs/math_functions.cpp @@ -100,414 +100,6 @@ double MathFunctions::my_double_round(double value, int64_t dec, bool dec_unsign return tmp2; } -void MathFunctions::init() {} - -DoubleVal MathFunctions::pi(FunctionContext* ctx) { - return DoubleVal(M_PI); -} - -DoubleVal MathFunctions::e(FunctionContext* ctx) { - return DoubleVal(M_E); -} - -// libc++ did not have std::abs for int128 -__int128_t largeint_abs(__int128_t x) { - return x > 0 ? x : -x; -} - -DecimalV2Val MathFunctions::abs(FunctionContext* ctx, const doris_udf::DecimalV2Val& val) { - if (val.is_null) { - return DecimalV2Val::null(); - } - if (UNLIKELY(val.val == MIN_INT128)) { - return DecimalV2Val::null(); - } else { - return DecimalV2Val(largeint_abs(val.val)); - } -} - -LargeIntVal MathFunctions::abs(FunctionContext* ctx, const doris_udf::LargeIntVal& val) { - if (val.is_null) { - return LargeIntVal::null(); - } - if (UNLIKELY(val.val == MIN_INT128)) { - return LargeIntVal::null(); - } else { - return LargeIntVal(largeint_abs(val.val)); - } -} - -LargeIntVal MathFunctions::abs(FunctionContext* ctx, const doris_udf::BigIntVal& val) { - if (val.is_null) { - return LargeIntVal::null(); - } - return LargeIntVal(largeint_abs(__int128(val.val))); -} - -BigIntVal MathFunctions::abs(FunctionContext* ctx, const doris_udf::IntVal& val) { - if (val.is_null) { - return BigIntVal::null(); - } - return BigIntVal(std::abs(int64_t(val.val))); -} - -IntVal MathFunctions::abs(FunctionContext* ctx, const doris_udf::SmallIntVal& val) { - if (val.is_null) { - return IntVal::null(); - } - return IntVal(std::abs(int32_t(val.val))); -} - -SmallIntVal MathFunctions::abs(FunctionContext* ctx, const doris_udf::TinyIntVal& val) { - if (val.is_null) { - return SmallIntVal::null(); - } - return SmallIntVal(std::abs(int16_t(val.val))); -} - -#define LOG_MATH_FN(NAME, RET_TYPE, INPUT_TYPE, FN) \ - RET_TYPE MathFunctions::NAME(FunctionContext* ctx, const INPUT_TYPE& v) { \ - if (v.is_null || v.val <= 0) return RET_TYPE::null(); \ - return RET_TYPE(FN(v.val)); \ - } - -// Generates a UDF that always calls FN() on the input val and returns it. -#define ONE_ARG_MATH_FN(NAME, RET_TYPE, INPUT_TYPE, FN) \ - RET_TYPE MathFunctions::NAME(FunctionContext* ctx, const INPUT_TYPE& v) { \ - if (v.is_null) return RET_TYPE::null(); \ - return RET_TYPE(FN(v.val)); \ - } - -ONE_ARG_MATH_FN(abs, DoubleVal, DoubleVal, std::fabs); -ONE_ARG_MATH_FN(abs, FloatVal, FloatVal, std::fabs); -ONE_ARG_MATH_FN(sin, DoubleVal, DoubleVal, std::sin); -ONE_ARG_MATH_FN(asin, DoubleVal, DoubleVal, std::asin); -ONE_ARG_MATH_FN(cos, DoubleVal, DoubleVal, std::cos); -ONE_ARG_MATH_FN(acos, DoubleVal, DoubleVal, std::acos); -ONE_ARG_MATH_FN(tan, DoubleVal, DoubleVal, std::tan); -ONE_ARG_MATH_FN(atan, DoubleVal, DoubleVal, std::atan); -ONE_ARG_MATH_FN(sqrt, DoubleVal, DoubleVal, std::sqrt); -ONE_ARG_MATH_FN(cbrt, DoubleVal, DoubleVal, std::cbrt); -ONE_ARG_MATH_FN(ceil, BigIntVal, DoubleVal, std::ceil); -ONE_ARG_MATH_FN(floor, BigIntVal, DoubleVal, std::floor); -ONE_ARG_MATH_FN(exp, DoubleVal, DoubleVal, std::exp); -LOG_MATH_FN(ln, DoubleVal, DoubleVal, std::log); -LOG_MATH_FN(log10, DoubleVal, DoubleVal, std::log10); - -TinyIntVal MathFunctions::sign(FunctionContext* ctx, const DoubleVal& v) { - if (v.is_null) { - return TinyIntVal::null(); - } - return TinyIntVal((v.val > 0) ? 1 : ((v.val < 0) ? -1 : 0)); -} - -DoubleVal MathFunctions::radians(FunctionContext* ctx, const DoubleVal& v) { - if (v.is_null) { - return v; - } - return DoubleVal(v.val * M_PI / 180.0); -} - -DoubleVal MathFunctions::degrees(FunctionContext* ctx, const DoubleVal& v) { - if (v.is_null) { - return v; - } - return DoubleVal(v.val * 180.0 / M_PI); -} - -BigIntVal MathFunctions::round(FunctionContext* ctx, const DoubleVal& v) { - if (v.is_null) { - return BigIntVal::null(); - } - return BigIntVal(static_cast(v.val + ((v.val < 0) ? -0.5 : 0.5))); -} - -BigIntVal MathFunctions::round_bankers(FunctionContext* ctx, const DoubleVal& v) { - return BigIntVal(static_cast(round_bankers(ctx, v, IntVal(0)).val)); -} - -DoubleVal MathFunctions::round_bankers(doris_udf::FunctionContext* ctx, const DoubleVal& v, - const IntVal& d) { - const double TOLERANCE = 1e-10; - double shift = std::pow(10, d.val); - double t = v.val * shift; - double rounded = std::round(t); - if (int64_t(rounded) % 2 == 1) { - if (::abs(rounded - t) - 0.5 < TOLERANCE) { - rounded -= 1; - } else { - rounded += 1; - } - } - return DoubleVal(rounded / shift); -} - -DoubleVal MathFunctions::round_up_to(FunctionContext* ctx, const DoubleVal& v, - const IntVal& scale) { - if (v.is_null || scale.is_null) { - return DoubleVal::null(); - } - return DoubleVal(my_double_round(v.val, scale.val, false, false)); -} - -DoubleVal MathFunctions::truncate(FunctionContext* ctx, const DoubleVal& v, const IntVal& scale) { - if (v.is_null || scale.is_null) { - return DoubleVal::null(); - } - return DoubleVal(my_double_round(v.val, scale.val, false, true)); -} - -DoubleVal MathFunctions::log2(FunctionContext* ctx, const DoubleVal& v) { - if (v.is_null || v.val <= 0.0) { - return DoubleVal::null(); - } - return DoubleVal(std::log(v.val) / std::log(2.0)); -} - -const double EPSILON = 1e-9; -DoubleVal MathFunctions::log(FunctionContext* ctx, const DoubleVal& base, const DoubleVal& v) { - if (base.is_null || v.is_null) { - return DoubleVal::null(); - } - if (base.val <= 0 || std::fabs(base.val - 1.0) < EPSILON || v.val <= 0.0) { - return DoubleVal::null(); - } - - return DoubleVal(std::log(v.val) / std::log(base.val)); -} - -DoubleVal MathFunctions::pow(FunctionContext* ctx, const DoubleVal& base, const DoubleVal& exp) { - if (base.is_null || exp.is_null) { - return DoubleVal::null(); - } - return DoubleVal(std::pow(base.val, exp.val)); -} - -void MathFunctions::rand_prepare(FunctionContext* ctx, FunctionContext::FunctionStateScope scope) { - std::mt19937* generator = reinterpret_cast(ctx->allocate(sizeof(std::mt19937))); - if (UNLIKELY(generator == nullptr)) { - LOG(ERROR) << "allocate random seed generator failed."; - return; - } - ctx->set_function_state(scope, generator); - new (generator) std::mt19937(); - if (scope == FunctionContext::THREAD_LOCAL) { - if (ctx->get_num_args() == 1) { - uint32_t seed = 0; - // This is a call to RandSeed, initialize the seed - // TODO: should we support non-constant seed? - if (!ctx->is_arg_constant(0)) { - ctx->set_error("Seed argument to rand() must be constant"); - return; - } - BigIntVal* seed_arg = static_cast(ctx->get_constant_arg(0)); - if (!seed_arg->is_null) { - seed = seed_arg->val; - } - generator->seed(seed); - } else { - generator->seed(std::random_device()()); - } - } -} - -DoubleVal MathFunctions::rand(FunctionContext* ctx) { - std::mt19937* generator = - reinterpret_cast(ctx->get_function_state(FunctionContext::THREAD_LOCAL)); - DCHECK(generator != nullptr); - static const double min = 0.0; - static const double max = 1.0; - std::uniform_real_distribution distribution(min, max); - return DoubleVal(distribution(*generator)); -} - -DoubleVal MathFunctions::rand_seed(FunctionContext* ctx, const BigIntVal& seed) { - if (seed.is_null) { - return DoubleVal::null(); - } - return rand(ctx); -} - -void MathFunctions::rand_close(FunctionContext* ctx, FunctionContext::FunctionStateScope scope) { - if (scope == FunctionContext::THREAD_LOCAL) { - uint8_t* generator = - reinterpret_cast(ctx->get_function_state(FunctionContext::THREAD_LOCAL)); - ctx->free(generator); - ctx->set_function_state(FunctionContext::THREAD_LOCAL, nullptr); - } -} - -StringVal MathFunctions::bin(FunctionContext* ctx, const BigIntVal& v) { - if (v.is_null) { - return StringVal::null(); - } - // Cast to an unsigned integer because it is compiler dependent - // whether the sign bit will be shifted like a regular bit. - // (logical vs. arithmetic shift for signed numbers) - uint64_t n = static_cast(v.val); - const size_t max_bits = sizeof(uint64_t) * 8; - char result[max_bits]; - uint32_t index = max_bits; - do { - result[--index] = '0' + (n & 1); - } while (n >>= 1); - return AnyValUtil::from_buffer_temp(ctx, result + index, max_bits - index); -} - -StringVal MathFunctions::hex_int(FunctionContext* ctx, const BigIntVal& v) { - if (v.is_null) { - return StringVal::null(); - } - - uint64_t num = v.val; - if (num == 0) { - return AnyValUtil::from_string_temp(ctx, "0"); - } - char hex[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; - // uint64_t max value 0xFFFFFFFFFFFFFFFF , 16 'F' - // need 1 more space for '\0' - char ans[17]; - int i = 0; - while (num) { - ans[i++] = hex[num & 15]; - num = num >> 4; - } - ans[i] = '\0'; - // reverse - for (int k = 0, j = i - 1; k <= j; k++, j--) { - char tmp = ans[j]; - ans[j] = ans[k]; - ans[k] = tmp; - } - return AnyValUtil::from_string_temp(ctx, ans); -} - -StringVal MathFunctions::hex_string(FunctionContext* ctx, const StringVal& s) { - if (s.is_null) { - return StringVal::null(); - } - - StringVal result = StringVal::create_temp_string_val(ctx, s.len * 2); - simd::VStringFunctions::hex_encode(s.ptr, s.len, reinterpret_cast(result.ptr)); - return result; -} - -StringVal MathFunctions::unhex(FunctionContext* ctx, const StringVal& s) { - if (s.is_null) { - return StringVal::null(); - } - // For odd number of chars return empty string like Hive does. - if (s.len & 1) { - return StringVal(); - } - - int result_len = s.len / 2; - StringVal result_string_val(ctx, result_len); - char* result = reinterpret_cast(result_string_val.ptr); - int res_index = 0; - int s_index = 0; - while (s_index < s.len) { - char c = 0; - for (int j = 0; j < 2; ++j, ++s_index) { - switch (s.ptr[s_index]) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - c += (s.ptr[s_index] - '0') * ((j == 0) ? 16 : 1); - break; - case 'A': - case 'B': - case 'C': - case 'D': - case 'E': - case 'F': - // Map to decimal values [10, 15] - c += (s.ptr[s_index] - 'A' + 10) * ((j == 0) ? 16 : 1); - break; - case 'a': - case 'b': - case 'c': - case 'd': - case 'e': - case 'f': - // Map to decimal [10, 15] - c += (s.ptr[s_index] - 'a' + 10) * ((j == 0) ? 16 : 1); - break; - default: - // Character not in hex alphabet, return empty string. - return StringVal(); - } - } - result[res_index] = c; - ++res_index; - } - return result_string_val; -} - -StringVal MathFunctions::conv_int(FunctionContext* ctx, const BigIntVal& num, - const TinyIntVal& src_base, const TinyIntVal& dest_base) { - if (num.is_null || src_base.is_null || dest_base.is_null) { - return StringVal::null(); - } - // As in MySQL and Hive, min base is 2 and max base is 36. - // (36 is max base representable by alphanumeric chars) - // If a negative target base is given, num should be interpreted in 2's complement. - if (std::abs(src_base.val) < MIN_BASE || std::abs(src_base.val) > MAX_BASE || - std::abs(dest_base.val) < MIN_BASE || std::abs(dest_base.val) > MAX_BASE) { - // Return nullptr like Hive does. - return StringVal::null(); - } - // Invalid input. - if (src_base.val < 0 && num.val >= 0) { - return StringVal::null(); - } - int64_t decimal_num = num.val; - if (src_base.val != 10) { - // Convert src_num representing a number in src_base but encoded in decimal - // into its actual decimal number. - if (!decimal_in_base_to_decimal(num.val, src_base.val, &decimal_num)) { - // Handle overflow, setting decimal_num appropriately. - handle_parse_result(dest_base.val, &decimal_num, StringParser::PARSE_OVERFLOW); - } - } - return decimal_to_base(ctx, decimal_num, dest_base.val); -} - -StringVal MathFunctions::conv_string(FunctionContext* ctx, const StringVal& num_str, - const TinyIntVal& src_base, const TinyIntVal& dest_base) { - if (num_str.is_null || src_base.is_null || dest_base.is_null) { - return StringVal::null(); - } - // As in MySQL and Hive, min base is 2 and max base is 36. - // (36 is max base representable by alphanumeric chars) - // If a negative target base is given, num should be interpreted in 2's complement. - if (std::abs(src_base.val) < MIN_BASE || std::abs(src_base.val) > MAX_BASE || - std::abs(dest_base.val) < MIN_BASE || std::abs(dest_base.val) > MAX_BASE) { - // Return nullptr like Hive does. - return StringVal::null(); - } - // Convert digits in num_str in src_base to decimal. - StringParser::ParseResult parse_res; - int64_t decimal_num = StringParser::string_to_int( - reinterpret_cast(num_str.ptr), num_str.len, src_base.val, &parse_res); - if (src_base.val < 0 && decimal_num >= 0) { - // Invalid input. - return StringVal::null(); - } - if (!handle_parse_result(dest_base.val, &decimal_num, parse_res)) { - // Return 0 for invalid input strings like Hive does. - return StringVal(reinterpret_cast(const_cast("0")), 1); - } - return decimal_to_base(ctx, decimal_num, dest_base.val); -} - StringVal MathFunctions::decimal_to_base(FunctionContext* ctx, int64_t src_num, int8_t dest_base) { // Max number of digits of any base (base 2 gives max digits), plus sign. const size_t max_digits = sizeof(uint64_t) * 8 + 1; @@ -579,153 +171,4 @@ bool MathFunctions::handle_parse_result(int8_t dest_base, int64_t* num, return true; } -BigIntVal MathFunctions::pmod_bigint(FunctionContext* ctx, const BigIntVal& a, const BigIntVal& b) { - if (a.is_null || b.is_null) { - return BigIntVal::null(); - } - return BigIntVal(((a.val % b.val) + b.val) % b.val); -} - -DoubleVal MathFunctions::pmod_double(FunctionContext* ctx, const DoubleVal& a, const DoubleVal& b) { - if (a.is_null || b.is_null) { - return DoubleVal::null(); - } - return DoubleVal(fmod(fmod(a.val, b.val) + b.val, b.val)); -} - -FloatVal MathFunctions::fmod_float(FunctionContext* ctx, const FloatVal& a, const FloatVal& b) { - if (a.is_null || b.is_null || b.val == 0) { - return FloatVal::null(); - } - return FloatVal(fmodf(a.val, b.val)); -} - -DoubleVal MathFunctions::fmod_double(FunctionContext* ctx, const DoubleVal& a, const DoubleVal& b) { - if (a.is_null || b.is_null || b.val == 0) { - return DoubleVal::null(); - } - return DoubleVal(fmod(a.val, b.val)); -} - -BigIntVal MathFunctions::positive_bigint(FunctionContext* ctx, const BigIntVal& val) { - return val; -} - -DoubleVal MathFunctions::positive_double(FunctionContext* ctx, const DoubleVal& val) { - return val; -} - -DecimalV2Val MathFunctions::positive_decimal(FunctionContext* ctx, const DecimalV2Val& val) { - return val; -} - -BigIntVal MathFunctions::negative_bigint(FunctionContext* ctx, const BigIntVal& val) { - if (val.is_null) { - return val; - } - return BigIntVal(-val.val); -} - -DoubleVal MathFunctions::negative_double(FunctionContext* ctx, const DoubleVal& val) { - if (val.is_null) { - return val; - } - return DoubleVal(-val.val); -} - -DecimalV2Val MathFunctions::negative_decimal(FunctionContext* ctx, const DecimalV2Val& val) { - if (val.is_null) { - return val; - } - const DecimalV2Value& dv1 = DecimalV2Value::from_decimal_val(val); - DecimalV2Val result; - (-dv1).to_decimal_val(&result); - return result; -} - -#define LEAST_FN(TYPE) \ - TYPE MathFunctions::least(FunctionContext* ctx, int num_args, const TYPE* args) { \ - if (args[0].is_null) return TYPE::null(); \ - int result_idx = 0; \ - for (int i = 1; i < num_args; ++i) { \ - if (args[i].is_null) return TYPE::null(); \ - if (args[i].val < args[result_idx].val) result_idx = i; \ - } \ - return TYPE(args[result_idx].val); \ - } - -#define LEAST_FNS() \ - LEAST_FN(TinyIntVal); \ - LEAST_FN(SmallIntVal); \ - LEAST_FN(IntVal); \ - LEAST_FN(BigIntVal); \ - LEAST_FN(LargeIntVal); \ - LEAST_FN(FloatVal); \ - LEAST_FN(DoubleVal); - -LEAST_FNS(); - -#define LEAST_NONNUMERIC_FN(TYPE_NAME, TYPE, DORIS_TYPE) \ - TYPE MathFunctions::least(FunctionContext* ctx, int num_args, const TYPE* args) { \ - if (args[0].is_null) return TYPE::null(); \ - DORIS_TYPE result_val = DORIS_TYPE::from_##TYPE_NAME(args[0]); \ - for (int i = 1; i < num_args; ++i) { \ - if (args[i].is_null) return TYPE::null(); \ - DORIS_TYPE val = DORIS_TYPE::from_##TYPE_NAME(args[i]); \ - if (val < result_val) result_val = val; \ - } \ - TYPE result; \ - result_val.to_##TYPE_NAME(&result); \ - return result; \ - } - -#define LEAST_NONNUMERIC_FNS() \ - LEAST_NONNUMERIC_FN(string_val, StringVal, StringRef); \ - LEAST_NONNUMERIC_FN(datetime_val, DateTimeVal, DateTimeValue); \ - LEAST_NONNUMERIC_FN(decimal_val, DecimalV2Val, DecimalV2Value); - -LEAST_NONNUMERIC_FNS(); - -#define GREATEST_FN(TYPE) \ - TYPE MathFunctions::greatest(FunctionContext* ctx, int num_args, const TYPE* args) { \ - if (args[0].is_null) return TYPE::null(); \ - int result_idx = 0; \ - for (int i = 1; i < num_args; ++i) { \ - if (args[i].is_null) return TYPE::null(); \ - if (args[i].val > args[result_idx].val) result_idx = i; \ - } \ - return TYPE(args[result_idx].val); \ - } - -#define GREATEST_FNS() \ - GREATEST_FN(TinyIntVal); \ - GREATEST_FN(SmallIntVal); \ - GREATEST_FN(IntVal); \ - GREATEST_FN(BigIntVal); \ - GREATEST_FN(LargeIntVal); \ - GREATEST_FN(FloatVal); \ - GREATEST_FN(DoubleVal); - -GREATEST_FNS(); - -#define GREATEST_NONNUMERIC_FN(TYPE_NAME, TYPE, DORIS_TYPE) \ - TYPE MathFunctions::greatest(FunctionContext* ctx, int num_args, const TYPE* args) { \ - if (args[0].is_null) return TYPE::null(); \ - DORIS_TYPE result_val = DORIS_TYPE::from_##TYPE_NAME(args[0]); \ - for (int i = 1; i < num_args; ++i) { \ - if (args[i].is_null) return TYPE::null(); \ - DORIS_TYPE val = DORIS_TYPE::from_##TYPE_NAME(args[i]); \ - if (val > result_val) result_val = val; \ - } \ - TYPE result; \ - result_val.to_##TYPE_NAME(&result); \ - return result; \ - } - -#define GREATEST_NONNUMERIC_FNS() \ - GREATEST_NONNUMERIC_FN(string_val, StringVal, StringRef); \ - GREATEST_NONNUMERIC_FN(datetime_val, DateTimeVal, DateTimeValue); \ - GREATEST_NONNUMERIC_FN(decimal_val, DecimalV2Val, DecimalV2Value); - -GREATEST_NONNUMERIC_FNS(); } // namespace doris diff --git a/be/src/exprs/math_functions.h b/be/src/exprs/math_functions.h index cac13ef6c0..257916f61d 100644 --- a/be/src/exprs/math_functions.h +++ b/be/src/exprs/math_functions.h @@ -28,163 +28,6 @@ namespace doris { class MathFunctions { public: - static void init(); - - static doris_udf::DoubleVal pi(doris_udf::FunctionContext* ctx); - static doris_udf::DoubleVal e(doris_udf::FunctionContext* ctx); - - static doris_udf::DoubleVal abs(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::FloatVal abs(doris_udf::FunctionContext*, const doris_udf::FloatVal&); - static doris_udf::DecimalV2Val abs(doris_udf::FunctionContext*, const doris_udf::DecimalV2Val&); - - // For integer math, we have to promote ABS() to the next highest integer type because - // in two's complement arithmetic, the largest negative value for any bit width is not - // representable as a positive value within the same width. For the largest width, we - // simply overflow. In the unlikely event a workaround is needed, one can simply cast - // to a higher precision decimal type. - static doris_udf::LargeIntVal abs(doris_udf::FunctionContext*, const doris_udf::LargeIntVal&); - static doris_udf::LargeIntVal abs(doris_udf::FunctionContext*, const doris_udf::BigIntVal&); - static doris_udf::BigIntVal abs(doris_udf::FunctionContext*, const doris_udf::IntVal&); - static doris_udf::IntVal abs(doris_udf::FunctionContext*, const doris_udf::SmallIntVal&); - static doris_udf::SmallIntVal abs(doris_udf::FunctionContext*, const doris_udf::TinyIntVal&); - - static doris_udf::TinyIntVal sign(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& v); - - static doris_udf::DoubleVal sin(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::DoubleVal asin(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::DoubleVal cos(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::DoubleVal acos(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::DoubleVal tan(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::DoubleVal atan(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - - static doris_udf::BigIntVal ceil(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::BigIntVal floor(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::BigIntVal round(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& v); - static doris_udf::BigIntVal round_bankers(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& v); - static doris_udf::DoubleVal round_bankers(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& v, - const doris_udf::IntVal& scale); - static doris_udf::DoubleVal round_up_to(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& v, - const doris_udf::IntVal& scale); - static doris_udf::DoubleVal truncate(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& v, - const doris_udf::IntVal& scale); - - static doris_udf::DoubleVal ln(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::DoubleVal log(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& base, - const doris_udf::DoubleVal& v); - static doris_udf::DoubleVal log2(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& v); - static doris_udf::DoubleVal log10(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::DoubleVal exp(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - - static doris_udf::DoubleVal radians(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& v); - static doris_udf::DoubleVal degrees(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& v); - - static doris_udf::DoubleVal sqrt(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::DoubleVal cbrt(doris_udf::FunctionContext*, const doris_udf::DoubleVal&); - static doris_udf::DoubleVal pow(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& base, - const doris_udf::DoubleVal& exp); - - /// Used for both rand() and rand_seed() - static void rand_prepare(doris_udf::FunctionContext*, - doris_udf::FunctionContext::FunctionStateScope); - static doris_udf::DoubleVal rand(doris_udf::FunctionContext*); - static doris_udf::DoubleVal rand_seed(doris_udf::FunctionContext*, - const doris_udf::BigIntVal& seed); - static void rand_close(FunctionContext* ctx, FunctionContext::FunctionStateScope scope); - - static doris_udf::StringVal bin(doris_udf::FunctionContext* ctx, const doris_udf::BigIntVal& v); - static doris_udf::StringVal hex_int(doris_udf::FunctionContext* ctx, - const doris_udf::BigIntVal& v); - static doris_udf::StringVal hex_string(doris_udf::FunctionContext* ctx, - const doris_udf::StringVal& s); - static doris_udf::StringVal unhex(doris_udf::FunctionContext* ctx, - const doris_udf::StringVal& s); - - static doris_udf::StringVal conv_int(doris_udf::FunctionContext* ctx, - const doris_udf::BigIntVal& num, - const doris_udf::TinyIntVal& src_base, - const doris_udf::TinyIntVal& dest_base); - static doris_udf::StringVal conv_string(doris_udf::FunctionContext* ctx, - const doris_udf::StringVal& num_str, - const doris_udf::TinyIntVal& src_base, - const doris_udf::TinyIntVal& dest_base); - - static doris_udf::BigIntVal pmod_bigint(doris_udf::FunctionContext* ctx, - const doris_udf::BigIntVal& a, - const doris_udf::BigIntVal& b); - static doris_udf::DoubleVal pmod_double(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& a, - const doris_udf::DoubleVal& b); - static doris_udf::FloatVal fmod_float(doris_udf::FunctionContext*, const doris_udf::FloatVal&, - const doris_udf::FloatVal&); - static doris_udf::DoubleVal fmod_double(doris_udf::FunctionContext*, - const doris_udf::DoubleVal&, - const doris_udf::DoubleVal&); - - static doris_udf::BigIntVal positive_bigint(doris_udf::FunctionContext* ctx, - const doris_udf::BigIntVal& val); - static doris_udf::DoubleVal positive_double(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& val); - static doris_udf::DecimalV2Val positive_decimal(doris_udf::FunctionContext* ctx, - const doris_udf::DecimalV2Val& val); - static doris_udf::BigIntVal negative_bigint(doris_udf::FunctionContext* ctx, - const doris_udf::BigIntVal& val); - static doris_udf::DoubleVal negative_double(doris_udf::FunctionContext* ctx, - const doris_udf::DoubleVal& val); - static doris_udf::DecimalV2Val negative_decimal(doris_udf::FunctionContext* ctx, - const doris_udf::DecimalV2Val& val); - - static doris_udf::TinyIntVal least(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::TinyIntVal* args); - static doris_udf::TinyIntVal greatest(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::TinyIntVal* args); - static doris_udf::SmallIntVal least(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::SmallIntVal* val); - static doris_udf::SmallIntVal greatest(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::SmallIntVal* val); - static doris_udf::IntVal least(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::IntVal* val); - static doris_udf::IntVal greatest(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::IntVal* val); - static doris_udf::BigIntVal least(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::BigIntVal* val); - static doris_udf::BigIntVal greatest(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::BigIntVal* val); - static doris_udf::LargeIntVal least(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::LargeIntVal* val); - static doris_udf::LargeIntVal greatest(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::LargeIntVal* val); - static doris_udf::FloatVal least(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::FloatVal* val); - static doris_udf::FloatVal greatest(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::FloatVal* val); - static doris_udf::DoubleVal least(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::DoubleVal* val); - static doris_udf::DoubleVal greatest(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::DoubleVal* val); - static doris_udf::StringVal least(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::StringVal* val); - static doris_udf::StringVal greatest(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::StringVal* val); - static doris_udf::DateTimeVal least(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::DateTimeVal* val); - static doris_udf::DateTimeVal greatest(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::DateTimeVal* val); - static doris_udf::DecimalV2Val least(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::DecimalV2Val* val); - static doris_udf::DecimalV2Val greatest(doris_udf::FunctionContext* ctx, int num_args, - const doris_udf::DecimalV2Val* val); - static double my_double_round(double value, int64_t dec, bool dec_unsigned, bool truncate); // Converts src_num in decimal to dest_base, diff --git a/be/src/exprs/string_functions.cpp b/be/src/exprs/string_functions.cpp index 0f9e3f5b19..6ca42affb4 100644 --- a/be/src/exprs/string_functions.cpp +++ b/be/src/exprs/string_functions.cpp @@ -32,461 +32,6 @@ // NOTE: be careful not to use string::append. It is not performant. namespace doris { -void StringFunctions::init() {} - -size_t get_char_len(const StringVal& str, std::vector* str_index) { - size_t char_len = 0; - for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; - str_index->push_back(i); - ++char_len; - } - return char_len; -} - -// This behaves identically to the mysql implementation, namely: -// - 1-indexed positions -// - supported negative positions (count from the end of the string) -// - [optional] len. No len indicates longest substr possible -StringVal StringFunctions::substring(FunctionContext* context, const StringVal& str, - const IntVal& pos, const IntVal& len) { - if (str.is_null || pos.is_null || len.is_null) { - return StringVal::null(); - } - if (len.val <= 0 || str.len == 0 || pos.val == 0 || pos.val > str.len) { - return StringVal(); - } - - // create index indicate every char start byte - // e.g. "hello word 你好" => [0,1,2,3,4,5,6,7,8,9,10,11,14] 你 and 好 are 3 bytes - // why use a vector as index? It is unnecessary if there is no negative pos val, - // but if has pos is negative it is not easy to determine where to start, so need a - // index save every character's length - size_t byte_pos = 0; - std::vector index; - for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; - index.push_back(i); - if (pos.val > 0 && index.size() > pos.val + len.val) { - break; - } - } - - int fixed_pos = pos.val; - if (fixed_pos < 0) { - fixed_pos = index.size() + fixed_pos + 1; - } - if (fixed_pos > index.size()) { - return StringVal::null(); - } - byte_pos = index[fixed_pos - 1]; - int fixed_len = str.len - byte_pos; - if (fixed_pos + len.val <= index.size()) { - fixed_len = index[fixed_pos + len.val - 1] - byte_pos; - } - if (byte_pos <= str.len && fixed_len > 0) { - return StringVal(str.ptr + byte_pos, fixed_len); - } else { - return StringVal(); - } -} - -StringVal StringFunctions::substring(FunctionContext* context, const StringVal& str, - const IntVal& pos) { - // StringVal.len is an int => INT32_MAX - return substring(context, str, pos, IntVal(INT32_MAX)); -} - -// Implementation of Left. The signature is -// string left(string input, int len) -// This behaves identically to the mysql implementation. -StringVal StringFunctions::left(FunctionContext* context, const StringVal& str, const IntVal& len) { - if (len.val >= str.len) return str; - return substring(context, str, 1, len); -} - -// Implementation of Right. The signature is -// string right(string input, int len) -// This behaves identically to the mysql implementation. -StringVal StringFunctions::right(FunctionContext* context, const StringVal& str, - const IntVal& len) { - // Don't index past the beginning of str, otherwise we'll get an empty string back - int32_t pos = std::max(-len.val, static_cast(-str.len)); - return substring(context, str, IntVal(pos), len); -} - -BooleanVal StringFunctions::starts_with(FunctionContext* context, const StringVal& str, - const StringVal& prefix) { - if (str.is_null || prefix.is_null) { - return BooleanVal::null(); - } - re2::StringPiece str_sp(reinterpret_cast(str.ptr), str.len); - re2::StringPiece prefix_sp(reinterpret_cast(prefix.ptr), prefix.len); - return BooleanVal(str_sp.starts_with(prefix_sp)); -} - -BooleanVal StringFunctions::ends_with(FunctionContext* context, const StringVal& str, - const StringVal& suffix) { - if (str.is_null || suffix.is_null) { - return BooleanVal::null(); - } - re2::StringPiece str_sp(reinterpret_cast(str.ptr), str.len); - re2::StringPiece suffix_sp(reinterpret_cast(suffix.ptr), suffix.len); - return BooleanVal(str_sp.ends_with(suffix_sp)); -} - -BooleanVal StringFunctions::null_or_empty(FunctionContext* context, const StringVal& str) { - if (str.is_null || str.len == 0) { - return 1; - } else { - return 0; - } -} - -BooleanVal StringFunctions::not_null_or_empty(FunctionContext* context, const StringVal& str) { - if (str.is_null || str.len == 0) { - return 0; - } else { - return 1; - } -} - -StringVal StringFunctions::space(FunctionContext* context, const IntVal& len) { - if (len.is_null) { - return StringVal::null(); - } - if (len.val <= 0) { - return StringVal(); - } - int32_t space_size = std::min(len.val, 65535); - // TODO pengyubing - // StringVal result = StringVal::create_temp_string_val(context, space_size); - StringVal result(context, space_size); - memset(result.ptr, ' ', space_size); - return result; -} - -StringVal StringFunctions::repeat(FunctionContext* context, const StringVal& str, const IntVal& n) { - if (str.is_null || n.is_null) { - return StringVal::null(); - } - if (str.len == 0 || n.val <= 0) { - return StringVal(); - } - - // TODO pengyubing - // StringVal result = StringVal::create_temp_string_val(context, str.len * n.val); - StringVal result(context, str.len * n.val); - if (UNLIKELY(result.is_null)) { - return result; - } - uint8_t* ptr = result.ptr; - for (int64_t i = 0; i < n.val; ++i) { - memcpy(ptr, str.ptr, str.len); - ptr += str.len; - } - return result; -} - -StringVal StringFunctions::lpad(FunctionContext* context, const StringVal& str, const IntVal& len, - const StringVal& pad) { - if (str.is_null || len.is_null || pad.is_null || len.val < 0) { - return StringVal::null(); - } - - std::vector str_index; - size_t str_char_size = get_char_len(str, &str_index); - std::vector pad_index; - size_t pad_char_size = get_char_len(pad, &pad_index); - - // Corner cases: Shrink the original string, or leave it alone. - // TODO: Hive seems to go into an infinite loop if pad.len == 0, - // so we should pay attention to Hive's future solution to be compatible. - if (len.val <= str_char_size || pad.len == 0) { - if (len.val > str_index.size()) { - return StringVal::null(); - } - if (len.val == str_index.size()) { - return StringVal(str.ptr, str.len); - } - return StringVal(str.ptr, str_index[len.val]); - } - - // TODO pengyubing - // StringVal result = StringVal::create_temp_string_val(context, len.val); - int32_t pad_byte_len = 0; - int32_t pad_times = (len.val - str_char_size) / pad_char_size; - int32_t pad_remainder = (len.val - str_char_size) % pad_char_size; - pad_byte_len = pad_times * pad.len; - pad_byte_len += pad_index[pad_remainder]; - int32_t byte_len = str.len + pad_byte_len; - StringVal result(context, byte_len); - if (result.is_null) { - return result; - } - int pad_idx = 0; - int result_index = 0; - uint8_t* ptr = result.ptr; - - // Prepend chars of pad. - while (result_index < pad_byte_len) { - ptr[result_index++] = pad.ptr[pad_idx++]; - pad_idx = pad_idx % pad.len; - } - - // Append given string. - memcpy(ptr + result_index, str.ptr, str.len); - return result; -} - -StringVal StringFunctions::rpad(FunctionContext* context, const StringVal& str, const IntVal& len, - const StringVal& pad) { - if (str.is_null || len.is_null || pad.is_null || len.val < 0) { - return StringVal::null(); - } - - std::vector str_index; - size_t str_char_size = get_char_len(str, &str_index); - std::vector pad_index; - size_t pad_char_size = get_char_len(pad, &pad_index); - - // Corner cases: Shrink the original string, or leave it alone. - // TODO: Hive seems to go into an infinite loop if pad->len == 0, - // so we should pay attention to Hive's future solution to be compatible. - if (len.val <= str_char_size || pad.len == 0) { - if (len.val > str_index.size()) { - return StringVal::null(); - } - if (len.val == str_index.size()) { - return StringVal(str.ptr, str.len); - } - return StringVal(str.ptr, str_index[len.val]); - } - - // TODO pengyubing - // StringVal result = StringVal::create_temp_string_val(context, len.val); - int32_t pad_byte_len = 0; - int32_t pad_times = (len.val - str_char_size) / pad_char_size; - int32_t pad_remainder = (len.val - str_char_size) % pad_char_size; - pad_byte_len = pad_times * pad.len; - pad_byte_len += pad_index[pad_remainder]; - int32_t byte_len = str.len + pad_byte_len; - StringVal result(context, byte_len); - if (UNLIKELY(result.is_null)) { - return result; - } - memcpy(result.ptr, str.ptr, str.len); - - // Append chars of pad until desired length - uint8_t* ptr = result.ptr; - int pad_idx = 0; - int result_len = str.len; - while (result_len < byte_len) { - ptr[result_len++] = pad.ptr[pad_idx++]; - pad_idx = pad_idx % pad.len; - } - return result; -} - -StringVal StringFunctions::append_trailing_char_if_absent( - doris_udf::FunctionContext* context, const doris_udf::StringVal& str, - const doris_udf::StringVal& trailing_char) { - if (str.is_null || trailing_char.is_null || trailing_char.len != 1) { - return StringVal::null(); - } - if (str.len == 0) { - return trailing_char; - } - if (str.ptr[str.len - 1] == trailing_char.ptr[0]) { - return str; - } - - StringVal result(context, str.len + 1); - memcpy(result.ptr, str.ptr, str.len); - result.ptr[str.len] = trailing_char.ptr[0]; - return result; -} - -// Implementation of LENGTH -// int length(string input) -// Returns the length in bytes of input. If input == nullptr, returns -// nullptr per MySQL -IntVal StringFunctions::length(FunctionContext* context, const StringVal& str) { - if (str.is_null) { - return IntVal::null(); - } - return IntVal(str.len); -} - -// Implementation of CHAR_LENGTH -// int char_utf8_length(string input) -// Returns the length of characters of input. If input == nullptr, returns -// nullptr per MySQL -IntVal StringFunctions::char_utf8_length(FunctionContext* context, const StringVal& str) { - if (str.is_null) { - return IntVal::null(); - } - size_t char_len = 0; - for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; - ++char_len; - } - return IntVal(char_len); -} - -StringVal StringFunctions::lower(FunctionContext* context, const StringVal& str) { - if (str.is_null) { - return StringVal::null(); - } - StringVal result(context, str.len); - if (UNLIKELY(result.is_null)) { - return result; - } - simd::VStringFunctions::to_lower(str.ptr, str.len, result.ptr); - return result; -} - -StringVal StringFunctions::upper(FunctionContext* context, const StringVal& str) { - if (str.is_null) { - return StringVal::null(); - } - StringVal result(context, str.len); - if (UNLIKELY(result.is_null)) { - return result; - } - simd::VStringFunctions::to_upper(str.ptr, str.len, result.ptr); - return result; -} - -StringVal StringFunctions::initcap(FunctionContext* context, const StringVal& str) { - if (str.is_null) { - return StringVal::null(); - } - StringVal result(context, str.len); - - simd::VStringFunctions::to_lower(str.ptr, str.len, result.ptr); - - bool need_capitalize = true; - for (int64_t i = 0; i < str.len; ++i) { - if (!::isalnum(result.ptr[i])) { - need_capitalize = true; - } else if (need_capitalize) { - result.ptr[i] = ::toupper(result.ptr[i]); - need_capitalize = false; - } - } - - return result; -} - -StringVal StringFunctions::reverse(FunctionContext* context, const StringVal& str) { - if (str.is_null) { - return StringVal::null(); - } - - StringVal result(context, str.len); - if (UNLIKELY(result.is_null)) { - return result; - } - - simd::VStringFunctions::reverse(str, result); - return result; -} - -StringVal StringFunctions::trim(FunctionContext* context, const StringVal& str) { - return simd::VStringFunctions::trim(str); -} - -StringVal StringFunctions::ltrim(FunctionContext* context, const StringVal& str) { - return simd::VStringFunctions::ltrim(str); -} - -StringVal StringFunctions::rtrim(FunctionContext* context, const StringVal& str) { - return simd::VStringFunctions::rtrim(str); -} - -IntVal StringFunctions::ascii(FunctionContext* context, const StringVal& str) { - if (str.is_null) { - return IntVal::null(); - } - // Hive returns 0 when given an empty string. - return IntVal((str.len == 0) ? 0 : static_cast(str.ptr[0])); -} - -IntVal StringFunctions::instr(FunctionContext* context, const StringVal& str, - const StringVal& substr) { - if (str.is_null || substr.is_null) { - return IntVal::null(); - } - if (substr.len == 0) { - return IntVal(1); - } - StringRef str_sv = StringRef(str); - StringRef substr_sv = StringRef(substr); - StringSearch search(&substr_sv); - // Hive returns positions starting from 1. - int loc = search.search(&str_sv); - if (loc > 0) { - size_t char_len = 0; - for (size_t i = 0, char_size = 0; i < loc; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; - ++char_len; - } - loc = char_len; - } - - return IntVal(loc + 1); -} - -IntVal StringFunctions::locate(FunctionContext* context, const StringVal& substr, - const StringVal& str) { - return instr(context, str, substr); -} - -IntVal StringFunctions::locate_pos(FunctionContext* context, const StringVal& substr, - const StringVal& str, const IntVal& start_pos) { - if (str.is_null || substr.is_null || start_pos.is_null) { - return IntVal::null(); - } - if (substr.len == 0) { - if (start_pos.val <= 0) { - return IntVal(0); - } else if (start_pos.val == 1) { - return IntVal(1); - } else if (start_pos.val > str.len) { - return IntVal(0); - } else { - return IntVal(start_pos.val); - } - } - // Hive returns 0 for *start_pos <= 0, - // but throws an exception for *start_pos > str->len. - // Since returning 0 seems to be Hive's error condition, return 0. - std::vector index; - size_t char_len = get_char_len(str, &index); - if (start_pos.val <= 0 || start_pos.val > str.len || start_pos.val > char_len) { - return IntVal(0); - } - StringRef substr_sv = StringRef(substr); - StringSearch search(&substr_sv); - // Input start_pos.val starts from 1. - StringRef adjusted_str(reinterpret_cast(str.ptr) + index[start_pos.val - 1], - str.len - index[start_pos.val - 1]); - int32_t match_pos = search.search(&adjusted_str); - if (match_pos >= 0) { - // Hive returns the position in the original string starting from 1. - size_t char_len = 0; - for (size_t i = 0, char_size = 0; i < match_pos; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned char)(adjusted_str.data)[i]]; - ++char_len; - } - match_pos = char_len; - return IntVal(start_pos.val + match_pos); - } else { - return IntVal(0); - } -} - // This function sets options in the RE2 library before pattern matching. bool StringFunctions::set_re2_options(const StringVal& match_parameter, std::string* error_str, re2::RE2::Options* opts) { @@ -544,513 +89,4 @@ re2::RE2* StringFunctions::compile_regex(const StringVal& pattern, std::string* return re; } -void StringFunctions::regexp_prepare(FunctionContext* context, - FunctionContext::FunctionStateScope scope) { - if (scope != FunctionContext::FRAGMENT_LOCAL) { - return; - } - - if (!context->is_arg_constant(1)) { - return; - } - StringVal* pattern = reinterpret_cast(context->get_constant_arg(1)); - if (pattern->is_null) { - return; - } - std::string error_str; - re2::RE2* re = compile_regex(*pattern, &error_str, StringVal::null()); - if (re == nullptr) { - context->set_error(error_str.c_str()); - return; - } - context->set_function_state(scope, re); -} - -void StringFunctions::regexp_close(FunctionContext* context, - FunctionContext::FunctionStateScope scope) { - if (scope != FunctionContext::FRAGMENT_LOCAL) { - return; - } - re2::RE2* re = reinterpret_cast(context->get_function_state(scope)); - delete re; -} - -StringVal StringFunctions::regexp_extract(FunctionContext* context, const StringVal& str, - const StringVal& pattern, const BigIntVal& index) { - if (str.is_null || pattern.is_null || index.is_null) { - return StringVal::null(); - } - if (index.val < 0) { - return StringVal(); - } - - re2::RE2* re = reinterpret_cast( - context->get_function_state(FunctionContext::FRAGMENT_LOCAL)); - std::unique_ptr scoped_re; // destroys re if we have to locally compile it - if (re == nullptr) { - DCHECK(!context->is_arg_constant(1)); - std::string error_str; - re = compile_regex(pattern, &error_str, StringVal::null()); - if (re == nullptr) { - context->add_warning(error_str.c_str()); - return StringVal::null(); - } - scoped_re.reset(re); - } - - re2::StringPiece str_sp(reinterpret_cast(str.ptr), str.len); - int max_matches = 1 + re->NumberOfCapturingGroups(); - if (index.val >= max_matches) { - return StringVal(); - } - // Use a vector because clang complains about non-POD varlen arrays - // TODO: fix this - std::vector matches(max_matches); - bool success = re->Match(str_sp, 0, str.len, re2::RE2::UNANCHORED, &matches[0], max_matches); - if (!success) { - return StringVal(); - } - // matches[0] is the whole string, matches[1] the first group, etc. - const re2::StringPiece& match = matches[index.val]; - return AnyValUtil::from_buffer_temp(context, match.data(), match.size()); -} - -StringVal StringFunctions::regexp_replace(FunctionContext* context, const StringVal& str, - const StringVal& pattern, const StringVal& replace) { - if (str.is_null || pattern.is_null || replace.is_null) { - return StringVal::null(); - } - - re2::RE2* re = reinterpret_cast( - context->get_function_state(FunctionContext::FRAGMENT_LOCAL)); - std::unique_ptr scoped_re; // destroys re if state->re is nullptr - if (re == nullptr) { - DCHECK(!context->is_arg_constant(1)); - std::string error_str; - re = compile_regex(pattern, &error_str, StringVal::null()); - if (re == nullptr) { - context->add_warning(error_str.c_str()); - return StringVal::null(); - } - scoped_re.reset(re); - } - - re2::StringPiece replace_str = - re2::StringPiece(reinterpret_cast(replace.ptr), replace.len); - std::string result_str = AnyValUtil::to_string(str); - re2::RE2::GlobalReplace(&result_str, *re, replace_str); - return AnyValUtil::from_string_temp(context, result_str); -} - -StringVal StringFunctions::concat(FunctionContext* context, int num_children, - const StringVal* strs) { - DCHECK_GE(num_children, 1); - - // Pass through if there's only one argument - if (num_children == 1) { - return strs[0]; - } - - // Loop once to compute the final size and reserve space. - int32_t total_size = 0; - for (int32_t i = 0; i < num_children; ++i) { - if (strs[i].is_null) { - return StringVal::null(); - } - total_size += strs[i].len; - } - - StringVal result(context, total_size); - uint8_t* ptr = result.ptr; - - // Loop again to append the data. - for (int32_t i = 0; i < num_children; ++i) { - memcpy(ptr, strs[i].ptr, strs[i].len); - ptr += strs[i].len; - } - return result; -} - -StringVal StringFunctions::concat_ws(FunctionContext* context, const StringVal& sep, - int num_children, const StringVal* strs) { - DCHECK_GE(num_children, 1); - if (sep.is_null) { - return StringVal::null(); - } - - // Loop once to compute the final size and reserve space. - int32_t total_size = 0; - bool not_first = false; - for (int32_t i = 0; i < num_children; ++i) { - if (strs[i].is_null) { - continue; - } - if (not_first) { - total_size += sep.len; - } - total_size += strs[i].len; - not_first = true; - } - - StringVal result(context, total_size); - uint8_t* ptr = result.ptr; - not_first = false; - // Loop again to append the data. - for (int32_t i = 0; i < num_children; ++i) { - if (strs[i].is_null) { - continue; - } - if (not_first) { - memcpy(ptr, sep.ptr, sep.len); - ptr += sep.len; - } - memcpy(ptr, strs[i].ptr, strs[i].len); - ptr += strs[i].len; - not_first = true; - } - return result; -} - -StringVal StringFunctions::elt(FunctionContext* context, const IntVal& pos, int num_children, - const StringVal* strs) { - if (pos.is_null || pos.val < 1 || num_children == 0 || pos.val > num_children) { - return StringVal::null(); - } - - return strs[pos.val - 1]; -} - -IntVal StringFunctions::find_in_set(FunctionContext* context, const StringVal& str, - const StringVal& str_set) { - if (str.is_null || str_set.is_null) { - return IntVal::null(); - } - // Check str for commas. - for (int i = 0; i < str.len; ++i) { - if (str.ptr[i] == ',') { - return IntVal(0); - } - } - // The result index starts from 1 since 0 is an error condition. - int32_t token_index = 1; - int32_t start = 0; - int32_t end; - StringRef str_sv = StringRef(str); - do { - end = start; - // Position end. - while (end < str_set.len && str_set.ptr[end] != ',') { - ++end; - } - StringRef token(reinterpret_cast(str_set.ptr) + start, end - start); - if (str_sv.eq(token)) { - return IntVal(token_index); - } - - // Re-position start and end past ',' - start = end + 1; - ++token_index; - } while (start < str_set.len); - return IntVal(0); -} - -void StringFunctions::parse_url_prepare(FunctionContext* ctx, - FunctionContext::FunctionStateScope scope) { - if (scope != FunctionContext::FRAGMENT_LOCAL) { - return; - } - if (!ctx->is_arg_constant(1)) { - return; - } - StringVal* part = reinterpret_cast(ctx->get_constant_arg(1)); - if (part->is_null) { - return; - } - UrlParser::UrlPart* url_part = new UrlParser::UrlPart; - *url_part = UrlParser::get_url_part(StringRef(*part)); - if (*url_part == UrlParser::INVALID) { - std::stringstream ss; - ss << "Invalid URL part: " << AnyValUtil::to_string(*part) << std::endl - << "(Valid URL parts are 'PROTOCOL', 'HOST', 'PATH', 'REF', 'AUTHORITY', 'FILE', " - << "'USERINFO', 'PORT' and 'QUERY')"; - ctx->set_error(ss.str().c_str()); - return; - } - ctx->set_function_state(scope, url_part); -} - -StringVal StringFunctions::parse_url(FunctionContext* ctx, const StringVal& url, - const StringVal& part) { - if (url.is_null || part.is_null) { - return StringVal::null(); - } - std::string part_str = std::string(reinterpret_cast(part.ptr), part.len); - transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper); - StringVal newPart = AnyValUtil::from_string_temp(ctx, part_str); - void* state = ctx->get_function_state(FunctionContext::FRAGMENT_LOCAL); - UrlParser::UrlPart url_part; - if (state != nullptr) { - url_part = *reinterpret_cast(state); - } else { - DCHECK(!ctx->is_arg_constant(1)); - url_part = UrlParser::get_url_part(StringRef(newPart)); - } - - StringRef result; - if (!UrlParser::parse_url(StringRef(url), url_part, &result)) { - // url is malformed, or url_part is invalid. - if (url_part == UrlParser::INVALID) { - std::stringstream ss; - ss << "Invalid URL part: " << AnyValUtil::to_string(newPart); - ctx->add_warning(ss.str().c_str()); - } else { - std::stringstream ss; - ss << "Could not parse URL: " << AnyValUtil::to_string(url); - ctx->add_warning(ss.str().c_str()); - } - return StringVal::null(); - } - StringVal result_sv; - result.to_string_val(&result_sv); - return result_sv; -} - -void StringFunctions::parse_url_close(FunctionContext* ctx, - FunctionContext::FunctionStateScope scope) { - if (scope != FunctionContext::FRAGMENT_LOCAL) { - return; - } - UrlParser::UrlPart* url_part = - reinterpret_cast(ctx->get_function_state(scope)); - delete url_part; -} - -StringVal StringFunctions::parse_url_key(FunctionContext* ctx, const StringVal& url, - const StringVal& part, const StringVal& key) { - if (url.is_null || part.is_null || key.is_null) { - return StringVal::null(); - } - void* state = ctx->get_function_state(FunctionContext::FRAGMENT_LOCAL); - UrlParser::UrlPart url_part; - if (state != nullptr) { - url_part = *reinterpret_cast(state); - } else { - DCHECK(!ctx->is_arg_constant(1)); - url_part = UrlParser::get_url_part(StringRef(part)); - } - - StringRef result; - if (!UrlParser::parse_url_key(StringRef(url), url_part, StringRef(key), &result)) { - // url is malformed, or url_part is invalid. - if (url_part == UrlParser::INVALID) { - std::stringstream ss; - ss << "Invalid URL part: " << AnyValUtil::to_string(part); - ctx->add_warning(ss.str().c_str()); - } else { - std::stringstream ss; - ss << "Could not parse URL: " << AnyValUtil::to_string(url); - ctx->add_warning(ss.str().c_str()); - } - return StringVal::null(); - } - StringVal result_sv; - result.to_string_val(&result_sv); - return result_sv; -} - -StringVal StringFunctions::money_format(FunctionContext* context, const DoubleVal& v) { - if (v.is_null) { - return StringVal::null(); - } - double v_cent = MathFunctions::my_double_round(v.val, 2, false, false); - return do_money_format(context, fmt::format("{:.2f}", v_cent)); -} - -StringVal StringFunctions::money_format(FunctionContext* context, const DecimalV2Val& v) { - if (v.is_null) { - return StringVal::null(); - } - - DecimalV2Value rounded(0); - DecimalV2Value::from_decimal_val(v).round(&rounded, 2, HALF_UP); - return do_money_format(context, rounded.int_value(), - abs(rounded.frac_value() / 10000000)); -} - -StringVal StringFunctions::money_format(FunctionContext* context, const BigIntVal& v) { - if (v.is_null) { - return StringVal::null(); - } - return do_money_format(context, v.val); -} - -StringVal StringFunctions::money_format(FunctionContext* context, const LargeIntVal& v) { - if (v.is_null) { - return StringVal::null(); - } - return do_money_format<__int128_t, 52>(context, v.val); -} - -static int index_of(const uint8_t* source, int source_offset, int source_count, - const uint8_t* target, int target_offset, int target_count, int from_index) { - if (from_index >= source_count) { - return (target_count == 0 ? source_count : -1); - } - if (from_index < 0) { - from_index = 0; - } - if (target_count == 0) { - return from_index; - } - const uint8_t first = target[target_offset]; - int max = source_offset + (source_count - target_count); - for (int i = source_offset + from_index; i <= max; i++) { - while (i <= max && source[i] != first) i++; // Look for first character - if (i <= max) { // Found first character, now look at the rest of v2 - int j = i + 1; - int end = j + target_count - 1; - for (int k = target_offset + 1; j < end && source[j] == target[k]; j++, k++) - ; - if (j == end) { - return i - source_offset; // Found whole string. - } - } - } - return -1; -} - -static int last_index_of(const uint8_t* source, int source_len, const uint8_t* target, - int target_len, int to_index) { - if (to_index < 0) { - return -1; - } - if (to_index >= source_len) { - to_index = source_len - 1; - } - if (target_len == 0) { - return to_index; - } - const uint8_t last = target[target_len - 1]; - int min = target_len; - for (int i = to_index; i >= min; i--) { - while (i >= min && source[i] != last) { - i--; // Look for last character - } - if (i >= min) { // Found first character, now look at the rest of v2 - int j = i - 1; - int end = j - target_len + 1; - for (int k = target_len - 2; j > end && source[j] == target[k];) { - j--; - k--; - } - if (j == end) { - return i - target_len + 1; - } - } - } - return -1; -} - -StringVal StringFunctions::split_part(FunctionContext* context, const StringVal& content, - const StringVal& delimiter, const IntVal& field) { - if (content.is_null || delimiter.is_null || field.is_null || field.val == 0) { - return StringVal::null(); - } - - if (field.val > 0) { - int from = 0; - std::vector find(field.val, -1); //store substring position - for (int i = 1; i <= field.val; i++) { // find - int last_index = i - 1; - find[last_index] = - index_of(content.ptr, 0, content.len, delimiter.ptr, 0, delimiter.len, from); - from = find[last_index] + delimiter.len; - if (find[last_index] == -1) { - break; - } - } - if ((field.val > 1 && find[field.val - 2] == -1) || - (field.val == 1 && find[field.val - 1] == -1)) { - // field not find return null - return StringVal::null(); - } - int start_pos; - if (field.val == 1) { // find need split first part - start_pos = 0; - } else { - start_pos = find[field.val - 2] + delimiter.len; - } - int len = (find[field.val - 1] == -1 ? content.len : find[field.val - 1]) - start_pos; - return StringVal(content.ptr + start_pos, len); - } else { - int to = content.len; - int abs_field = -field.val; - std::vector find(abs_field, -1); //store substring position - for (int i = 1; i <= abs_field; i++) { // find - int last_index = i - 1; - find[last_index] = - last_index_of(content.ptr, content.len, delimiter.ptr, delimiter.len, to); - to = find[last_index] - delimiter.len; - if (find[last_index] == -1) { - break; - } - } - if ((abs_field > 1 && find[abs_field - 2] == -1) || - (abs_field == 1 && find[abs_field - 1] == -1)) { - // field not find return null - return StringVal::null(); - } - int end_pos; - if (abs_field == 1) { // find need split first part - end_pos = content.len - 1; - } else { - end_pos = find[abs_field - 2] - 1; - } - int len = - end_pos - (find[abs_field - 1] == -1 ? 0 : find[abs_field - 1] + delimiter.len) + 1; - - return StringVal(content.ptr + end_pos - len + 1, len); - } -} - -StringVal StringFunctions::replace(FunctionContext* context, const StringVal& origStr, - const StringVal& oldStr, const StringVal& newStr) { - if (origStr.is_null || oldStr.is_null || newStr.is_null) { - return StringVal::null(); - } - // Empty string is a substring of all strings. - // If old str is an empty string, the std::string.find(oldStr) is always return 0. - // With an empty old str, there is no need to do replace. - if (oldStr.len == 0) { - return origStr; - } - std::string orig_str = std::string(reinterpret_cast(origStr.ptr), origStr.len); - std::string old_str = std::string(reinterpret_cast(oldStr.ptr), oldStr.len); - std::string new_str = std::string(reinterpret_cast(newStr.ptr), newStr.len); - std::string::size_type pos = 0; - std::string::size_type oldLen = old_str.size(); - std::string::size_type newLen = new_str.size(); - while ((pos = orig_str.find(old_str, pos)) != std::string::npos) { - orig_str.replace(pos, oldLen, new_str); - pos += newLen; - } - return AnyValUtil::from_string_temp(context, orig_str); -} -// Implementation of BIT_LENGTH -// int bit_length(string input) -// Returns the length in bits of input. If input == nullptr, returns -// nullptr per MySQL -IntVal StringFunctions::bit_length(FunctionContext* context, const StringVal& str) { - if (str.is_null) { - return IntVal::null(); - } - return IntVal(str.len * 8); -} - -StringVal StringFunctions::uuid(FunctionContext* ctx) { - boost::uuids::random_generator generator; - std::string uuid = boost::uuids::to_string(generator()); - - return AnyValUtil::from_string_temp(ctx, uuid); -} } // namespace doris diff --git a/be/src/exprs/string_functions.h b/be/src/exprs/string_functions.h index 01fe8646da..bcdff6f80a 100644 --- a/be/src/exprs/string_functions.h +++ b/be/src/exprs/string_functions.h @@ -32,165 +32,11 @@ namespace doris { -class OpcodeRegistry; - class StringFunctions { public: - static void init(); - - static doris_udf::StringVal substring(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, - const doris_udf::IntVal& pos, - const doris_udf::IntVal& len); - static doris_udf::StringVal substring(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, - const doris_udf::IntVal& pos); - static doris_udf::StringVal left(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, const doris_udf::IntVal& len); - static doris_udf::StringVal right(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, - const doris_udf::IntVal& len); - static doris_udf::BooleanVal starts_with(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, - const doris_udf::StringVal& prefix); - static doris_udf::BooleanVal ends_with(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, - const doris_udf::StringVal& suffix); - static doris_udf::BooleanVal null_or_empty(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::BooleanVal not_null_or_empty(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::StringVal space(doris_udf::FunctionContext* context, - const doris_udf::IntVal& len); - static doris_udf::StringVal repeat(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, const doris_udf::IntVal& n); - static doris_udf::StringVal lpad(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, const doris_udf::IntVal& len, - const doris_udf::StringVal& pad); - static doris_udf::StringVal rpad(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, const doris_udf::IntVal& len, - const doris_udf::StringVal& pad); - static doris_udf::StringVal append_trailing_char_if_absent( - doris_udf::FunctionContext* context, const doris_udf::StringVal& str, - const doris_udf::StringVal& trailing_char); - static doris_udf::IntVal length(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::IntVal char_utf8_length(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::StringVal lower(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::StringVal upper(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::StringVal initcap(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::StringVal reverse(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::StringVal trim(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::StringVal ltrim(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::StringVal rtrim(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::IntVal ascii(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - static doris_udf::IntVal instr(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, const doris_udf::StringVal&); - static doris_udf::IntVal locate(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, const doris_udf::StringVal&); - static doris_udf::IntVal locate_pos(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str, - const doris_udf::StringVal&, const doris_udf::IntVal&); - static bool set_re2_options(const doris_udf::StringVal& match_parameter, std::string* error_str, re2::RE2::Options* opts); - static void regexp_prepare(doris_udf::FunctionContext*, - doris_udf::FunctionContext::FunctionStateScope); - static StringVal regexp_extract(doris_udf::FunctionContext*, const doris_udf::StringVal& str, - const doris_udf::StringVal& pattern, - const doris_udf::BigIntVal& index); - static StringVal regexp_replace(doris_udf::FunctionContext*, const doris_udf::StringVal& str, - const doris_udf::StringVal& pattern, - const doris_udf::StringVal& replace); - static void regexp_close(doris_udf::FunctionContext*, - doris_udf::FunctionContext::FunctionStateScope); - static StringVal concat(doris_udf::FunctionContext*, int num_children, const StringVal* strs); - static StringVal concat_ws(doris_udf::FunctionContext*, const doris_udf::StringVal& sep, - int num_children, const doris_udf::StringVal* strs); - static StringVal elt(doris_udf::FunctionContext*, const doris_udf::IntVal& pos, - int num_children, const StringVal* strs); - static IntVal find_in_set(doris_udf::FunctionContext*, const doris_udf::StringVal& str, - const doris_udf::StringVal& str_set); - - static void parse_url_prepare(doris_udf::FunctionContext*, - doris_udf::FunctionContext::FunctionStateScope); - static StringVal parse_url(doris_udf::FunctionContext*, const doris_udf::StringVal& url, - const doris_udf::StringVal& part); - static StringVal parse_url_key(doris_udf::FunctionContext*, const doris_udf::StringVal& url, - const doris_udf::StringVal& key, - const doris_udf::StringVal& part); - static void parse_url_close(doris_udf::FunctionContext*, - doris_udf::FunctionContext::FunctionStateScope); - - static doris_udf::StringVal money_format(doris_udf::FunctionContext* context, - const doris_udf::DoubleVal& v); - - static doris_udf::StringVal money_format(doris_udf::FunctionContext* context, - const doris_udf::DecimalV2Val& v); - - static doris_udf::StringVal money_format(doris_udf::FunctionContext* context, - const doris_udf::BigIntVal& v); - - static doris_udf::StringVal money_format(doris_udf::FunctionContext* context, - const doris_udf::LargeIntVal& v); - - template - static StringVal do_money_format(FunctionContext* context, const T int_value, - const int32_t frac_value = 0) { - char local[N]; - char* p = SimpleItoaWithCommas(int_value, local, sizeof(local)); - int32_t string_val_len = local + sizeof(local) - p + 3; - StringVal result = StringVal::create_temp_string_val(context, string_val_len); - memcpy(result.ptr, p, string_val_len - 3); - *(result.ptr + string_val_len - 3) = '.'; - *(result.ptr + string_val_len - 2) = '0' + (frac_value / 10); - *(result.ptr + string_val_len - 1) = '0' + (frac_value % 10); - return result; - }; - - // Note string value must be valid decimal string which contains two digits after the decimal point - static StringVal do_money_format(FunctionContext* context, const string& value) { - bool is_positive = (value[0] != '-'); - int32_t result_len = value.size() + (value.size() - (is_positive ? 4 : 5)) / 3; - StringVal result = StringVal::create_temp_string_val(context, result_len); - if (!is_positive) { - *result.ptr = '-'; - } - for (int i = value.size() - 4, j = result_len - 4; i >= 0; i = i - 3, j = j - 4) { - *(result.ptr + j) = *(value.data() + i); - if (i - 1 < 0) break; - *(result.ptr + j - 1) = *(value.data() + i - 1); - if (i - 2 < 0) break; - *(result.ptr + j - 2) = *(value.data() + i - 2); - if (j - 3 > 1 || (j - 3 == 1 && is_positive)) { - *(result.ptr + j - 3) = ','; - } - } - memcpy(result.ptr + result_len - 3, value.data() + value.size() - 3, 3); - return result; - }; - - static StringVal split_part(FunctionContext* context, const StringVal& content, - const StringVal& delimiter, const IntVal& field); - - static StringVal replace(FunctionContext* context, const StringVal& origStr, - const StringVal& oldStr, const StringVal& newStr); - - static doris_udf::IntVal bit_length(doris_udf::FunctionContext* context, - const doris_udf::StringVal& str); - - static doris_udf::StringVal uuid(doris_udf::FunctionContext*); - // The caller owns the returned regex. Returns nullptr if the pattern could not be compiled. static re2::RE2* compile_regex(const StringVal& pattern, std::string* error_str, const StringVal& match_parameter); diff --git a/be/src/olap/CMakeLists.txt b/be/src/olap/CMakeLists.txt index 929dc367b3..b83d233a0a 100644 --- a/be/src/olap/CMakeLists.txt +++ b/be/src/olap/CMakeLists.txt @@ -41,6 +41,7 @@ add_library(Olap STATIC like_column_predicate.cpp key_coder.cpp lru_cache.cpp + match_predicate.cpp memtable.cpp memtable_flush_executor.cpp merger.cpp diff --git a/be/src/olap/delta_writer.h b/be/src/olap/delta_writer.h index 34f9be8093..f78e2a577f 100644 --- a/be/src/olap/delta_writer.h +++ b/be/src/olap/delta_writer.h @@ -29,7 +29,6 @@ class MemTable; class MemTracker; class Schema; class StorageEngine; -class Tuple; class TupleDescriptor; class SlotDescriptor; diff --git a/be/src/exprs/match_predicate.cpp b/be/src/olap/match_predicate.cpp similarity index 99% rename from be/src/exprs/match_predicate.cpp rename to be/src/olap/match_predicate.cpp index 2eaaeb929c..e78f52e2f8 100644 --- a/be/src/exprs/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "exprs/match_predicate.h" +#include "olap/match_predicate.h" #include diff --git a/be/src/exprs/match_predicate.h b/be/src/olap/match_predicate.h similarity index 98% rename from be/src/exprs/match_predicate.h rename to be/src/olap/match_predicate.h index 8afe57481c..ff41fb00eb 100644 --- a/be/src/exprs/match_predicate.h +++ b/be/src/olap/match_predicate.h @@ -30,9 +30,6 @@ namespace doris { enum class MatchType; class MatchPredicate : public ColumnPredicate { -public: - static void init() {} - public: MatchPredicate(uint32_t column_id, const std::string& value, MatchType match_type); diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index 5e6ec00212..80bcb26c0c 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -35,7 +35,6 @@ class RowsetWriter; class Schema; class SlotDescriptor; class TabletSchema; -class Tuple; class TupleDescriptor; class MemTable { diff --git a/be/src/olap/predicate_creator.h b/be/src/olap/predicate_creator.h index c12e155e0f..35970de723 100644 --- a/be/src/olap/predicate_creator.h +++ b/be/src/olap/predicate_creator.h @@ -23,11 +23,11 @@ #include "exec/olap_utils.h" #include "exprs/create_predicate_function.h" #include "exprs/hybrid_set.h" -#include "exprs/match_predicate.h" #include "olap/bloom_filter_predicate.h" #include "olap/column_predicate.h" #include "olap/comparison_predicate.h" #include "olap/in_list_predicate.h" +#include "olap/match_predicate.h" #include "olap/null_predicate.h" #include "olap/tablet_schema.h" #include "runtime/define_primitive_type.h" diff --git a/be/src/runtime/collection_value.cpp b/be/src/runtime/collection_value.cpp index fc9fb65a24..593a431fb2 100644 --- a/be/src/runtime/collection_value.cpp +++ b/be/src/runtime/collection_value.cpp @@ -29,436 +29,6 @@ namespace doris { -template -struct CollectionValueSubTypeTrait; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = int8_t; // slot size : 1 -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = bool; - using AnyValType = BooleanVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = int8_t; - using AnyValType = TinyIntVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = int16_t; - using AnyValType = SmallIntVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = int32_t; - using AnyValType = IntVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = int64_t; - using AnyValType = BigIntVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = __int128_t; - using AnyValType = LargeIntVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = float; - using AnyValType = FloatVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = double; - using AnyValType = DoubleVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = StringRef; - using AnyValType = StringVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = StringRef; - using AnyValType = StringVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = StringRef; - using AnyValType = StringVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = uint24_t; - using AnyValType = DateTimeVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = uint64_t; - using AnyValType = DateTimeVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = uint32_t; - using AnyValType = IntVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = uint64_t; - using AnyValType = BigIntVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = decimal12_t; - using AnyValType = DecimalV2Val; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = int32_t; - using AnyValType = IntVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = int64_t; - using AnyValType = BigIntVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = int128_t; - using AnyValType = LargeIntVal; -}; - -template <> -struct CollectionValueSubTypeTrait { - using CppType = CollectionValue; - using AnyValType = CollectionVal; -}; - -struct ArrayIteratorFunctionsBase {}; - -template -struct GenericArrayIteratorFunctions : public ArrayIteratorFunctionsBase { - using CppType = typename CollectionValueSubTypeTrait::CppType; - using AnyValType = typename CollectionValueSubTypeTrait::AnyValType; - - constexpr static int get_type_size() { return sizeof(CppType); } - static void shallow_set(void* item, const AnyVal* value) { - *static_cast(item) = static_cast(value)->val; - } - static void shallow_get(AnyVal* value, const void* item) { - static_cast(value)->val = *static_cast(item); - } - static void self_deep_copy(void* item, const TypeDescriptor& type_desc, - const GenMemFootprintFunc& gen_mem_footprint, bool convert_ptrs) {} - static void deserialize(void* item, const char* tuple_data, const TypeDescriptor& type_desc) {} - static size_t get_byte_size(const void* item, const TypeDescriptor& type_desc) { return 0; } - static void raw_value_write(void* item, const void* value, const TypeDescriptor& type_desc, - MemPool* pool) { - RawValue::write(value, item, type_desc, pool); - } -}; - -template -struct ArrayIteratorFunctions : public GenericArrayIteratorFunctions {}; - -template -struct ArrayIteratorFunctionsForString : public GenericArrayIteratorFunctions { - using CppType = StringRef; - using AnyValType = StringVal; - - static void shallow_set(void* item, const AnyVal* value) { - const auto* src = static_cast(value); - auto* dst = static_cast(item); - dst->data = convert_to(src->ptr); - dst->size = src->len; - } - static void shallow_get(AnyVal* value, const void* item) { - const auto* src = static_cast(item); - auto* dst = static_cast(value); - dst->ptr = convert_to(src->data); - dst->len = src->size; - } - static void self_deep_copy(void* item, const TypeDescriptor&, - const GenMemFootprintFunc& gen_mem_footprint, bool convert_ptrs) { - auto* string = static_cast(item); - if (!string->size) { - return; - } - MemFootprint footprint = gen_mem_footprint(string->size); - int64_t offset = footprint.first; - auto* copied_string = reinterpret_cast(footprint.second); - memory_copy(copied_string, string->data, string->size); - string->data = (convert_ptrs ? convert_to(offset) : copied_string); - } - static void deserialize(void* item, const char* tuple_data, const TypeDescriptor& type_desc) { - DCHECK((item != nullptr) && (tuple_data != nullptr)) << "item or tuple_data is nullptr"; - auto* string_value = static_cast(item); - if (string_value->size) { - int64_t offset = convert_to(string_value->data); - string_value->data = convert_to(tuple_data + offset); - } - } - static size_t get_byte_size(const void* item, const TypeDescriptor&) { - return static_cast(item)->size; - } -}; - -template <> -struct ArrayIteratorFunctions : public ArrayIteratorFunctionsForString {}; -template <> -struct ArrayIteratorFunctions : public ArrayIteratorFunctionsForString { -}; -template <> -struct ArrayIteratorFunctions : public ArrayIteratorFunctionsForString {}; - -template <> -struct ArrayIteratorFunctions : public GenericArrayIteratorFunctions { - using GenericArrayIteratorFunctions::CppType; - using GenericArrayIteratorFunctions::AnyValType; - - static void shallow_set(void* item, const AnyVal* value) { - const auto* src = static_cast(value); - auto* dst = static_cast(item); - *dst = DateTimeValue::from_datetime_val(*src).to_olap_date(); - } - static void shallow_get(AnyVal* value, const void* item) { - const auto* src = static_cast(item); - auto* dst = static_cast(value); - DateTimeValue data; - data.from_olap_date(uint32_t(*src)); - data.to_datetime_val(dst); - } - static void raw_value_write(void* item, const void* value, const TypeDescriptor& type_desc, - MemPool* pool) { - DateTimeVal date_time_val; - shallow_get(&date_time_val, value); - shallow_set(item, &date_time_val); - } -}; -template <> -struct ArrayIteratorFunctions : public GenericArrayIteratorFunctions { - using GenericArrayIteratorFunctions::CppType; - using GenericArrayIteratorFunctions::AnyValType; - - static void shallow_set(void* item, const AnyVal* value) { - const auto* src = static_cast(value); - auto* dst = static_cast(item); - *dst = DateTimeValue::from_datetime_val(*src).to_olap_datetime(); - } - static void shallow_get(AnyVal* value, const void* item) { - const auto* src = static_cast(item); - auto* dst = static_cast(value); - DateTimeValue data; - data.from_olap_datetime(*src); - data.to_datetime_val(dst); - } - static void raw_value_write(void* item, const void* value, const TypeDescriptor& type_desc, - MemPool* pool) { - DateTimeVal date_time_val; - shallow_get(&date_time_val, value); - shallow_set(item, &date_time_val); - } -}; - -template <> -struct ArrayIteratorFunctions - : public GenericArrayIteratorFunctions { - using GenericArrayIteratorFunctions::CppType; - using GenericArrayIteratorFunctions::AnyValType; - - static void shallow_set(void* item, const AnyVal* value) { - const auto* src = static_cast(value); - auto* dst = static_cast(item); - auto decimal_value = DecimalV2Value::from_decimal_val(*src); - dst->integer = decimal_value.int_value(); - dst->fraction = decimal_value.frac_value(); - } - static void shallow_get(AnyVal* value, const void* item) { - const auto* src = static_cast(item); - auto* dst = static_cast(value); - DecimalV2Value(src->integer, src->fraction).to_decimal_val(dst); - } - static void raw_value_write(void* item, const void* value, const TypeDescriptor& type_desc, - MemPool* pool) { - DecimalV2Val decimal_val; - shallow_get(&decimal_val, value); - shallow_set(item, &decimal_val); - } -}; - -template <> -struct ArrayIteratorFunctions : public GenericArrayIteratorFunctions { - using GenericArrayIteratorFunctions::CppType; - using GenericArrayIteratorFunctions::AnyValType; - - static void shallow_set(void* item, const AnyVal* value) { - *static_cast(item) = - CppType::from_collection_val(*static_cast(value)); - } - static void shallow_get(AnyVal* value, const void* item) { - static_cast(item)->to_collection_val(static_cast(value)); - } - static void self_deep_copy(void* item, const TypeDescriptor& type_desc, - const GenMemFootprintFunc& gen_mem_footprint, bool convert_ptrs) { - auto* collection_value = static_cast(item); - CollectionValue::deep_copy_collection(collection_value, type_desc.children[0], - gen_mem_footprint, convert_ptrs); - } - static void deserialize(void* item, const char* tuple_data, const TypeDescriptor& type_desc) { - CollectionValue::deserialize_collection(static_cast(item), tuple_data, - type_desc.children[0]); - } - static size_t get_byte_size(const void* item, const TypeDescriptor& type_desc) { - const auto* collection_value = static_cast(item); - return collection_value->get_byte_size(type_desc.children[0]); - } -}; - -ArrayIterator CollectionValue::iterator(PrimitiveType child_type) { - return internal_iterator(child_type); -} - -ArrayIterator CollectionValue::internal_iterator(PrimitiveType child_type) const { - switch (child_type) { - case TYPE_BOOLEAN: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_TINYINT: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_SMALLINT: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_INT: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_BIGINT: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_LARGEINT: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_FLOAT: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_DOUBLE: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_CHAR: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_VARCHAR: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_STRING: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_DATE: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_DATETIME: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_DATEV2: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_DATETIMEV2: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_ARRAY: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_DECIMALV2: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_DECIMAL32: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_DECIMAL64: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - case TYPE_DECIMAL128I: - return ArrayIterator(const_cast(this), - static_cast*>(nullptr)); - default: - DCHECK(false) << "Invalid child type: " << child_type; - __builtin_unreachable(); - } -} - -const ArrayIterator CollectionValue::iterator(PrimitiveType child_type) const { - return internal_iterator(child_type); -} - -Status type_check(PrimitiveType type) { - switch (type) { - case TYPE_NULL: - - case TYPE_BOOLEAN: - - case TYPE_TINYINT: - case TYPE_SMALLINT: - case TYPE_INT: - case TYPE_BIGINT: - case TYPE_LARGEINT: - - case TYPE_FLOAT: - case TYPE_DOUBLE: - - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_STRING: - - case TYPE_DATE: - case TYPE_DATETIME: - case TYPE_DATEV2: - case TYPE_DATETIMEV2: - - case TYPE_DECIMALV2: - case TYPE_DECIMAL32: - case TYPE_DECIMAL64: - case TYPE_DECIMAL128I: - - case TYPE_ARRAY: - break; - default: - return Status::InvalidArgument("Type not implemented: {}", type); - } - return Status::OK(); -} - -int sizeof_type(PrimitiveType type) { - if (type_check(type).ok()) { - return CollectionValue().iterator(type).type_size(); - } else { - DCHECK(false) << "Type not implemented: " << type; - return 0; - } -} - void CollectionValue::to_collection_val(CollectionVal* val) const { val->length = _length; val->data = _data; @@ -481,132 +51,4 @@ void CollectionValue::copy_null_signs(const CollectionValue* other) { } } -size_t CollectionValue::get_byte_size(const TypeDescriptor& item_type) const { - size_t result = 0; - if (_length == 0) { - return result; - } - if (_has_null) { - result += _length * sizeof(bool); - } - auto iterator = CollectionValue::iterator(item_type.type); - result += _length * iterator.type_size(); - - while (!iterator.is_type_fixed_width() && iterator.has_next()) { - result += iterator.get_byte_size(item_type); - iterator.next(); - } - return result; -} - -Status CollectionValue::init_collection(ObjectPool* pool, uint64_t size, PrimitiveType child_type, - CollectionValue* value) { - return init_collection( - value, [pool](size_t size) -> uint8_t* { return pool->add_array(new uint8_t[size]); }, - size, child_type); -} - -Status CollectionValue::init_collection(CollectionValue* value, const AllocateMemFunc& allocate, - uint64_t size, PrimitiveType child_type) { - if (value == nullptr) { - return Status::InvalidArgument("collection value is null"); - } - - RETURN_IF_ERROR(type_check(child_type)); - - if (size == 0) { - new (value) CollectionValue(size); - return Status::OK(); - } - - value->_data = allocate(size * sizeof_type(child_type)); - value->_length = size; - value->_has_null = false; - value->_null_signs = reinterpret_cast(allocate(size)); - memset(value->_null_signs, 0, size * sizeof(bool)); - - return Status::OK(); -} - -Status CollectionValue::init_collection(MemPool* pool, uint64_t size, PrimitiveType child_type, - CollectionValue* value) { - return init_collection( - value, [pool](size_t size) { return pool->allocate_aligned(size, 16); }, size, - child_type); -} - -Status CollectionValue::init_collection(FunctionContext* context, uint64_t size, - PrimitiveType child_type, CollectionValue* value) { - return init_collection( - value, [context](size_t size) { return context->aligned_allocate(16, size); }, size, - child_type); -} - -CollectionValue CollectionValue::from_collection_val(const CollectionVal& val) { - return CollectionValue(val.data, val.length, val.has_null, val.null_signs); -} - -// Deep copy collection. -// NOTICE: The CollectionValue* shallow_copied_cv must be initialized by calling memcpy function first ( -// copy data from origin collection value). -void CollectionValue::deep_copy_collection(CollectionValue* shallow_copied_cv, - const TypeDescriptor& item_type, - const GenMemFootprintFunc& gen_mem_footprint, - bool convert_ptrs) { - CollectionValue* cv = shallow_copied_cv; - if (cv->length() == 0) { - return; - } - - auto iterator = cv->iterator(item_type.type); - uint64_t coll_byte_size = cv->length() * iterator.type_size(); - uint64_t nulls_size = cv->has_null() ? cv->length() * sizeof(bool) : 0; - - MemFootprint footprint = gen_mem_footprint(coll_byte_size + nulls_size); - int64_t offset = footprint.first; - char* coll_data = reinterpret_cast(footprint.second); - - // copy and assign null_signs - if (cv->has_null()) { - memory_copy(convert_to(coll_data), cv->null_signs(), nulls_size); - cv->set_null_signs(convert_to(coll_data)); - } else { - cv->set_null_signs(nullptr); - } - // copy and assign data - memory_copy(coll_data + nulls_size, cv->data(), coll_byte_size); - cv->set_data(coll_data + nulls_size); - - while (!iterator.is_type_fixed_width() && iterator.has_next()) { - iterator.self_deep_copy(item_type, gen_mem_footprint, convert_ptrs); - iterator.next(); - } - - if (convert_ptrs) { - cv->set_data(convert_to(offset + nulls_size)); - if (cv->has_null()) { - cv->set_null_signs(convert_to(offset)); - } - } -} - -void CollectionValue::deserialize_collection(CollectionValue* cv, const char* tuple_data, - const TypeDescriptor& item_type) { - if (cv->length() == 0) { - new (cv) CollectionValue(cv->length()); - return; - } - // assign data and null_sign pointer position in tuple_data - int64_t data_offset = convert_to(cv->data()); - cv->set_data(convert_to(tuple_data + data_offset)); - if (cv->has_null()) { - int64_t null_offset = convert_to(cv->null_signs()); - cv->set_null_signs(convert_to(tuple_data + null_offset)); - } - auto iterator = cv->iterator(item_type.type); - while (!iterator.is_type_fixed_width() && iterator.has_next()) { - iterator.deserialize(tuple_data, item_type); - iterator.next(); - } -} } // namespace doris diff --git a/be/src/runtime/collection_value.h b/be/src/runtime/collection_value.h index 3fac161503..a64da20623 100644 --- a/be/src/runtime/collection_value.h +++ b/be/src/runtime/collection_value.h @@ -88,36 +88,6 @@ public: void copy_null_signs(const CollectionValue* other); - size_t get_byte_size(const TypeDescriptor& item_type) const; - - ArrayIterator iterator(PrimitiveType child_type); - const ArrayIterator iterator(PrimitiveType child_type) const; - - /** - * init collection, will alloc (children Type's size + 1) * (children Nums) memory - */ - static Status init_collection(ObjectPool* pool, uint64_t size, PrimitiveType child_type, - CollectionValue* value); - - static Status init_collection(MemPool* pool, uint64_t size, PrimitiveType child_type, - CollectionValue* value); - - static Status init_collection(FunctionContext* context, uint64_t size, PrimitiveType child_type, - CollectionValue* value); - - static CollectionValue from_collection_val(const CollectionVal& val); - - // Deep copy collection. - // NOTICE: The CollectionValue* shallow_copied_cv must be initialized by calling memcpy function first ( - // copy data from origin collection value). - static void deep_copy_collection(CollectionValue* shallow_copied_cv, - const TypeDescriptor& item_type, - const GenMemFootprintFunc& gen_mem_footprint, - bool convert_ptrs); - - static void deserialize_collection(CollectionValue* cv, const char* tuple_data, - const TypeDescriptor& item_type); - const void* data() const { return _data; } bool has_null() const { return _has_null; } const bool* null_signs() const { return _null_signs; } @@ -128,12 +98,6 @@ public: void set_data(void* data) { _data = data; } void set_null_signs(bool* null_signs) { _null_signs = null_signs; } -private: - using AllocateMemFunc = std::function; - static Status init_collection(CollectionValue* value, const AllocateMemFunc& allocate, - uint64_t size, PrimitiveType child_type); - ArrayIterator internal_iterator(PrimitiveType child_type) const; - private: // child column data void* _data; @@ -143,115 +107,5 @@ private: bool _has_null; // null bitmap bool* _null_signs; - - friend ArrayIterator; -}; - -class ArrayIterator { -public: - int type_size() const { return _type_size; } - bool is_type_fixed_width() const { return _is_type_fixed_width; } - - bool has_next() const { return _offset < _collection_value->size(); } - bool next() const { - if (has_next()) { - ++_offset; - return true; - } - return false; - } - bool seek(uint64_t n) const { - if (n >= _collection_value->size()) { - return false; - } - _offset = n; - return true; - } - bool is_null() const { return _collection_value->is_null_at(_offset); } - const void* get() const { - if (is_null()) { - return nullptr; - } - return reinterpret_cast(_collection_value->data()) + _offset * _type_size; - } - void* get() { - if (is_null()) { - return nullptr; - } - return reinterpret_cast(_collection_value->mutable_data()) + _offset * _type_size; - } - void get(AnyVal* value) const { - if (is_null()) { - value->is_null = true; - return; - } - value->is_null = false; - _shallow_get(value, get()); - } - void set(const AnyVal* value) { - if (_collection_value->mutable_null_signs()) { - _collection_value->mutable_null_signs()[_offset] = value->is_null; - } - if (value->is_null) { - _collection_value->set_has_null(true); - } else { - _shallow_set(get(), value); - } - } - void self_deep_copy(const TypeDescriptor& type_desc, - const GenMemFootprintFunc& gen_mem_footprint, bool convert_ptrs) { - if (is_null()) { - return; - } - _self_deep_copy(get(), type_desc, gen_mem_footprint, convert_ptrs); - } - void deserialize(const char* tuple_data, const TypeDescriptor& type_desc) { - if (is_null()) { - return; - } - _deserialize(get(), tuple_data, type_desc); - } - size_t get_byte_size(const TypeDescriptor& type) const { - if (is_null()) { - return 0; - } - return _get_byte_size(get(), type); - } - void raw_value_write(const void* value, const TypeDescriptor& type_desc, MemPool* pool) { - if (is_null()) { - return; - } - return _raw_value_write(get(), value, type_desc, pool); - } - -private: - template >> - ArrayIterator(CollectionValue* data, const T*) - : _shallow_get(T::shallow_get), - _shallow_set(T::shallow_set), - _self_deep_copy(T::self_deep_copy), - _deserialize(T::deserialize), - _get_byte_size(T::get_byte_size), - _raw_value_write(T::raw_value_write), - _collection_value(data), - _offset(0), - _type_size(T::get_type_size()), - _is_type_fixed_width(IsTypeFixedWidth) {} - void (*_shallow_get)(AnyVal*, const void*); - void (*_shallow_set)(void*, const AnyVal*); - void (*_self_deep_copy)(void*, const TypeDescriptor&, const GenMemFootprintFunc&, bool); - void (*_deserialize)(void*, const char*, const TypeDescriptor&); - size_t (*_get_byte_size)(const void* item, const TypeDescriptor&); - void (*_raw_value_write)(void* item, const void* value, const TypeDescriptor& type_desc, - MemPool* pool); - -private: - CollectionValue* _collection_value; - mutable uint64_t _offset; - const int _type_size; - const bool _is_type_fixed_width; - - friend CollectionValue; }; } // namespace doris diff --git a/be/src/runtime/primitive_type.cpp b/be/src/runtime/primitive_type.cpp index a5af1569bc..b2fbb8db8f 100644 --- a/be/src/runtime/primitive_type.cpp +++ b/be/src/runtime/primitive_type.cpp @@ -86,56 +86,6 @@ PrimitiveType convert_type_to_primitive(FunctionContext::Type type) { return PrimitiveType::INVALID_TYPE; } -// Returns the byte size of 'type' Returns 0 for variable length types. -int get_byte_size(PrimitiveType type) { - switch (type) { - case TYPE_VARCHAR: - case TYPE_STRING: - case TYPE_OBJECT: - case TYPE_HLL: - case TYPE_QUANTILE_STATE: - case TYPE_ARRAY: - case TYPE_MAP: - return 0; - - case TYPE_NULL: - case TYPE_BOOLEAN: - case TYPE_TINYINT: - return 1; - - case TYPE_SMALLINT: - return 2; - - case TYPE_INT: - case TYPE_FLOAT: - case TYPE_DECIMAL32: - return 4; - - case TYPE_BIGINT: - case TYPE_DOUBLE: - case TYPE_TIME: - case TYPE_DECIMAL64: - return 8; - - case TYPE_DATETIME: - case TYPE_DATE: - case TYPE_LARGEINT: - case TYPE_DECIMALV2: - case TYPE_DECIMAL128I: - return 16; - - case INVALID_TYPE: - // datev2/datetimev2/timev2 is not supported on row-based engine - case TYPE_DATEV2: - case TYPE_DATETIMEV2: - case TYPE_TIMEV2: - default: - DCHECK(false); - } - - return 0; -} - bool is_type_compatible(PrimitiveType lhs, PrimitiveType rhs) { if (lhs == TYPE_VARCHAR) { return rhs == TYPE_CHAR || rhs == TYPE_VARCHAR || rhs == TYPE_HLL || rhs == TYPE_OBJECT || diff --git a/be/src/runtime/primitive_type.h b/be/src/runtime/primitive_type.h index 96272edc71..bae0d8c1f5 100644 --- a/be/src/runtime/primitive_type.h +++ b/be/src/runtime/primitive_type.h @@ -96,8 +96,6 @@ constexpr bool has_variable_type(PrimitiveType type) { type == TYPE_QUANTILE_STATE || type == TYPE_STRING; } -// Returns the byte size of 'type' Returns 0 for variable length types. -int get_byte_size(PrimitiveType type); // Returns the byte size of type when in a tuple int get_slot_size(PrimitiveType type); diff --git a/be/src/runtime/raw_value.cpp b/be/src/runtime/raw_value.cpp index 7f2896ee19..5935a1fe89 100644 --- a/be/src/runtime/raw_value.cpp +++ b/be/src/runtime/raw_value.cpp @@ -33,522 +33,6 @@ namespace doris { const int RawValue::ASCII_PRECISION = 16; // print 16 digits for double/float -void RawValue::print_value_as_bytes(const void* value, const TypeDescriptor& type, - std::stringstream* stream) { - if (value == nullptr) { - return; - } - - const char* chars = reinterpret_cast(value); - const StringRef* string_val = nullptr; - - switch (type.type) { - case TYPE_NULL: - break; - case TYPE_BOOLEAN: - stream->write(chars, sizeof(bool)); - return; - - case TYPE_TINYINT: - stream->write(chars, sizeof(int8_t)); - break; - - case TYPE_SMALLINT: - stream->write(chars, sizeof(int16_t)); - break; - - case TYPE_INT: - stream->write(chars, sizeof(int32_t)); - break; - - case TYPE_BIGINT: - stream->write(chars, sizeof(int64_t)); - break; - - case TYPE_FLOAT: - stream->write(chars, sizeof(float)); - break; - - case TYPE_DOUBLE: - stream->write(chars, sizeof(double)); - break; - - case TYPE_VARCHAR: - case TYPE_HLL: - case TYPE_CHAR: - case TYPE_STRING: - string_val = reinterpret_cast(value); - stream->write(const_cast(string_val->data), string_val->size); - return; - - case TYPE_DATE: - case TYPE_DATETIME: - stream->write(chars, sizeof(DateTimeValue)); - break; - - case TYPE_DATEV2: - stream->write(chars, - sizeof(doris::vectorized::DateV2Value)); - break; - - case TYPE_DATETIMEV2: - stream->write( - chars, - sizeof(doris::vectorized::DateV2Value)); - break; - - case TYPE_DECIMALV2: - stream->write(chars, sizeof(DecimalV2Value)); - break; - - case TYPE_DECIMAL32: - stream->write(chars, 4); - break; - - case TYPE_DECIMAL64: - stream->write(chars, 8); - break; - - case TYPE_DECIMAL128I: - stream->write(chars, 16); - break; - - case TYPE_LARGEINT: - stream->write(chars, sizeof(__int128)); - break; - - default: - DCHECK(false) << "bad RawValue::print_value() type: " << type; - } -} - -void RawValue::print_value(const void* value, const TypeDescriptor& type, int scale, - std::stringstream* stream) { - if (value == nullptr) { - *stream << "NULL"; - return; - } - - int old_precision = stream->precision(); - std::ios_base::fmtflags old_flags = stream->flags(); - - if (scale > -1) { - stream->precision(scale); - // Setting 'fixed' causes precision to set the number of digits printed after the - // decimal (by default it sets the maximum number of digits total). - *stream << std::fixed; - } - - std::string tmp; - const StringRef* string_val = nullptr; - - switch (type.type) { - case TYPE_BOOLEAN: { - bool val = *reinterpret_cast(value); - *stream << (val ? "true" : "false"); - return; - } - - case TYPE_TINYINT: - // Extra casting for chars since they should not be interpreted as ASCII. - *stream << static_cast(*reinterpret_cast(value)); - break; - - case TYPE_SMALLINT: - *stream << *reinterpret_cast(value); - break; - - case TYPE_INT: - *stream << *reinterpret_cast(value); - break; - - case TYPE_BIGINT: - *stream << *reinterpret_cast(value); - break; - - case TYPE_FLOAT: - *stream << *reinterpret_cast(value); - break; - - case TYPE_DOUBLE: - *stream << *reinterpret_cast(value); - break; - case TYPE_HLL: - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_STRING: - string_val = reinterpret_cast(value); - tmp.assign(const_cast(string_val->data), string_val->size); - *stream << tmp; - return; - - case TYPE_DATE: - case TYPE_DATETIME: - *stream << *reinterpret_cast(value); - break; - - case TYPE_DATEV2: - *stream << *reinterpret_cast< - const doris::vectorized::DateV2Value*>(value); - break; - - case TYPE_DATETIMEV2: - *stream << *reinterpret_cast< - const doris::vectorized::DateV2Value*>( - value); - break; - - case TYPE_DECIMALV2: - *stream << DecimalV2Value(reinterpret_cast(value)->value).to_string(); - break; - - case TYPE_DECIMAL32: { - auto decimal_val = reinterpret_cast(value); - write_text(*decimal_val, type.scale, *stream); - break; - } - - case TYPE_DECIMAL64: { - auto decimal_val = reinterpret_cast(value); - write_text(*decimal_val, type.scale, *stream); - break; - } - - case TYPE_DECIMAL128I: { - auto decimal_val = reinterpret_cast(value); - write_text(*decimal_val, type.scale, *stream); - break; - } - - case TYPE_LARGEINT: - *stream << reinterpret_cast(value)->value; - break; - - case TYPE_ARRAY: { - auto child_type = type.children[0]; - auto array_value = (const CollectionValue*)(value); - - ArrayIterator iter = array_value->iterator(child_type.type); - *stream << "["; - - int begin = 0; - while (iter.has_next()) { - if (begin != 0) { - *stream << ", "; - } - if (!iter.get()) { - *stream << "NULL"; - } else { - if (child_type.is_string_type()) { - *stream << "'"; - print_value(iter.get(), child_type, scale, stream); - *stream << "'"; - } else if (child_type.is_date_type()) { - DateTimeVal data; - iter.get(&data); - auto datetime_value = DateTimeValue::from_datetime_val(data); - print_value(&datetime_value, child_type, scale, stream); - } else if (child_type.is_decimal_v2_type()) { - DecimalV2Val data; - iter.get(&data); - auto decimal_value = DecimalV2Value::from_decimal_val(data); - print_value(&decimal_value, child_type, scale, stream); - } else if (child_type.type == TYPE_DOUBLE) { - // Note: the default precision is 6, here should be reset to 15. - // Otherwise, there is a risk of losing precision. - stream->precision(15); - print_value(iter.get(), child_type, scale, stream); - } else { - print_value(iter.get(), child_type, scale, stream); - } - } - - iter.next(); - begin++; - } - *stream << "]"; - break; - } - - default: - DCHECK(false) << "bad RawValue::print_value() type: " << type; - } - - stream->precision(old_precision); - // Undo setting stream to fixed - stream->flags(old_flags); -} - -void RawValue::print_value(const void* value, const TypeDescriptor& type, int scale, - std::string* str) { - if (value == nullptr) { - *str = "NULL"; - return; - } - - std::stringstream out; - out.precision(ASCII_PRECISION); - const StringRef* string_val = nullptr; - std::string tmp; - bool val = false; - - // Special case types that we can print more efficiently without using a std::stringstream - switch (type.type) { - case TYPE_BOOLEAN: - val = *reinterpret_cast(value); - *str = (val ? "true" : "false"); - return; - - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_OBJECT: - case TYPE_HLL: - case TYPE_QUANTILE_STATE: - case TYPE_STRING: { - string_val = reinterpret_cast(value); - std::stringstream ss; - ss << "ptr:" << (void*)string_val->data << " len:" << string_val->size; - tmp = ss.str(); - if (string_val->size <= 1000) { - tmp.assign(const_cast(string_val->data), string_val->size); - } - str->swap(tmp); - return; - } - case TYPE_NULL: { - *str = "NULL"; - return; - } - default: - print_value(value, type, scale, &out); - } - - *str = out.str(); -} - -void RawValue::write(const void* value, void* dst, const TypeDescriptor& type, MemPool* pool) { - DCHECK(value != nullptr); - - switch (type.type) { - case TYPE_NULL: - break; - case TYPE_BOOLEAN: { - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - } - - case TYPE_TINYINT: { - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - } - - case TYPE_SMALLINT: { - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - } - - case TYPE_INT: { - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - } - - case TYPE_BIGINT: { - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - } - - case TYPE_LARGEINT: { - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - } - - case TYPE_FLOAT: { - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - } - - case TYPE_TIME: - case TYPE_DOUBLE: { - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - } - - case TYPE_DATE: - case TYPE_DATETIME: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - - case TYPE_DATEV2: - *reinterpret_cast*>( - dst) = - *reinterpret_cast< - const doris::vectorized::DateV2Value*>( - value); - break; - - case TYPE_DATETIMEV2: - *reinterpret_cast*>( - dst) = - *reinterpret_cast*>(value); - break; - - case TYPE_DECIMALV2: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - - case TYPE_DECIMAL32: - *reinterpret_cast(dst) = - *reinterpret_cast(value); - break; - case TYPE_DECIMAL64: - *reinterpret_cast(dst) = - *reinterpret_cast(value); - break; - case TYPE_DECIMAL128I: - *reinterpret_cast(dst) = - *reinterpret_cast(value); - break; - - case TYPE_OBJECT: - case TYPE_HLL: - case TYPE_QUANTILE_STATE: - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_STRING: { - const StringRef* src = reinterpret_cast(value); - StringRef* dest = reinterpret_cast(dst); - dest->size = src->size; - - if (pool != nullptr) { - dest->data = reinterpret_cast(pool->allocate(dest->size)); - memcpy(const_cast(dest->data), src->data, dest->size); - } else { - dest->data = src->data; - } - - break; - } - case TYPE_ARRAY: { - DCHECK_EQ(type.children.size(), 1); - - const CollectionValue* src = reinterpret_cast(value); - CollectionValue* val = reinterpret_cast(dst); - - if (pool != nullptr) { - const auto& item_type = type.children[0]; - CollectionValue::init_collection(pool, src->size(), item_type.type, val); - ArrayIterator src_iter = src->iterator(item_type.type); - ArrayIterator val_iter = val->iterator(item_type.type); - - val->set_has_null(src->has_null()); - val->copy_null_signs(src); - - while (src_iter.has_next() && val_iter.has_next()) { - val_iter.raw_value_write(src_iter.get(), item_type, pool); - src_iter.next(); - val_iter.next(); - } - } else { - val->shallow_copy(src); - } - break; - } - default: - DCHECK(false) << "RawValue::write(): bad type: " << type; - } -} - -// TODO: can we remove some of this code duplication? Templated allocator? -void RawValue::write(const void* value, const TypeDescriptor& type, void* dst, uint8_t** buf) { - DCHECK(value != nullptr); - switch (type.type) { - case TYPE_BOOLEAN: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - case TYPE_TINYINT: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - case TYPE_SMALLINT: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - case TYPE_INT: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - case TYPE_BIGINT: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - case TYPE_LARGEINT: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - case TYPE_FLOAT: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - case TYPE_DOUBLE: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - case TYPE_DATE: - case TYPE_DATETIME: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - case TYPE_DATEV2: - *reinterpret_cast*>( - dst) = - *reinterpret_cast< - const doris::vectorized::DateV2Value*>( - value); - break; - case TYPE_DATETIMEV2: - *reinterpret_cast*>( - dst) = - *reinterpret_cast*>(value); - break; - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_STRING: { - DCHECK(buf != nullptr); - const StringRef* src = reinterpret_cast(value); - StringRef* dest = reinterpret_cast(dst); - dest->size = src->size; - dest->data = reinterpret_cast(*buf); - memcpy(const_cast(dest->data), src->data, dest->size); - *buf += dest->size; - break; - } - - case TYPE_DECIMALV2: - *reinterpret_cast(dst) = *reinterpret_cast(value); - break; - - case TYPE_DECIMAL32: - *reinterpret_cast(dst) = - *reinterpret_cast(value); - break; - case TYPE_DECIMAL64: - *reinterpret_cast(dst) = - *reinterpret_cast(value); - break; - case TYPE_DECIMAL128I: - *reinterpret_cast(dst) = - *reinterpret_cast(value); - break; - - default: - DCHECK(false) << "RawValue::write(): bad type: " << type.debug_string(); - } -} - -void RawValue::write(const void* value, Tuple* tuple, const SlotDescriptor* slot_desc, - MemPool* pool) { - if (value == nullptr) { - tuple->set_null(slot_desc->null_indicator_offset()); - } else { - void* slot = tuple->get_slot(slot_desc->tuple_offset()); - RawValue::write(value, slot, slot_desc->type(), pool); - } -} - int RawValue::compare(const void* v1, const void* v2, const TypeDescriptor& type) { const StringRef* string_value1; const StringRef* string_value2; diff --git a/be/src/runtime/raw_value.h b/be/src/runtime/raw_value.h index 3e1e05a496..551b5fd488 100644 --- a/be/src/runtime/raw_value.h +++ b/be/src/runtime/raw_value.h @@ -41,20 +41,6 @@ public: // Ascii output precision for double/float static const int ASCII_PRECISION; - // Convert 'value' into ascii and write to 'stream'. nullptr turns into NULL. 'scale' - // determines how many digits after the decimal are printed for floating point numbers, - // -1 indicates to use the stream's current formatting. - static void print_value(const void* value, const TypeDescriptor& type, int scale, - std::stringstream* stream); - - // write ascii value to string instead of stringstream. - static void print_value(const void* value, const TypeDescriptor& type, int scale, - std::string* str); - - // Writes the byte representation of a value to a stringstream character-by-character - static void print_value_as_bytes(const void* value, const TypeDescriptor& type, - std::stringstream* stream); - static uint32_t get_hash_value(const void* value, const PrimitiveType& type) { return get_hash_value(value, type, 0); } @@ -98,22 +84,6 @@ public: // Return value is < 0 if v1 < v2, 0 if v1 == v2, > 0 if v1 > v2. static int compare(const void* v1, const void* v2, const TypeDescriptor& type); - // Writes the bytes of a given value into the slot of a tuple. - // For string values, the string data is copied into memory allocated from 'pool' - // only if pool is non-nullptr. - static void write(const void* value, Tuple* tuple, const SlotDescriptor* slot_desc, - MemPool* pool); - - // Writes 'src' into 'dst' for type. - // For string values, the string data is copied into 'pool' if pool is non-nullptr. - // src must be non-nullptr. - static void write(const void* src, void* dst, const TypeDescriptor& type, MemPool* pool); - - // Writes 'src' into 'dst' for type. - // String values are copied into *buffer and *buffer is updated by the length. *buf - // must be preallocated to be large enough. - static void write(const void* src, const TypeDescriptor& type, void* dst, uint8_t** buf); - // Returns true if v1 == v2. // This is more performant than compare() == 0 for string equality, mostly because of // the length comparison check. diff --git a/be/src/runtime/result_buffer_mgr.cpp b/be/src/runtime/result_buffer_mgr.cpp index e1d3069459..51d1878114 100644 --- a/be/src/runtime/result_buffer_mgr.cpp +++ b/be/src/runtime/result_buffer_mgr.cpp @@ -29,12 +29,6 @@ namespace doris { DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(result_buffer_block_count, MetricUnit::NOUNIT); -//std::size_t hash_value(const TUniqueId& fragment_id) { -// uint32_t value = RawValue::get_hash_value(&fragment_id.lo, TypeDescriptor(TYPE_BIGINT), 0); -// value = RawValue::get_hash_value(&fragment_id.hi, TypeDescriptor(TYPE_BIGINT), value); -// return value; -//} - ResultBufferMgr::ResultBufferMgr() : _stop_background_threads_latch(1) { // Each BufferControlBlock has a limited queue size of 1024, it's not needed to count the // actual size of all BufferControlBlock. diff --git a/be/src/runtime/tuple.cpp b/be/src/runtime/tuple.cpp index 1af5942dda..ec4444639c 100644 --- a/be/src/runtime/tuple.cpp +++ b/be/src/runtime/tuple.cpp @@ -34,10 +34,6 @@ namespace doris { -static void deep_copy_collection_slots(Tuple* shallow_copied_tuple, const TupleDescriptor& desc, - const GenMemFootprintFunc& gen_mem_footprint, - bool convert_ptrs); - int64_t Tuple::total_byte_size(const TupleDescriptor& desc) const { int64_t result = desc.byte_size(); if (!desc.has_varlen_slots()) { @@ -62,62 +58,6 @@ int64_t Tuple::varlen_byte_size(const TupleDescriptor& desc) const { return result; } -Tuple* Tuple::deep_copy(const TupleDescriptor& desc, MemPool* pool, bool convert_ptrs) { - Tuple* result = (Tuple*)(pool->allocate(desc.byte_size())); - deep_copy(result, desc, pool, convert_ptrs); - return result; -} - -void Tuple::deep_copy(Tuple* dst, const TupleDescriptor& desc, MemPool* pool, bool convert_ptrs) { - memory_copy(dst, this, desc.byte_size()); - - // allocate in the same pool and then copy all non-null string slots - for (auto string_slot : desc.string_slots()) { - DCHECK(string_slot->type().is_string_type()); - StringRef* string_v = dst->get_string_slot(string_slot->tuple_offset()); - if (!dst->is_null(string_slot->null_indicator_offset())) { - if (string_v->size != 0) { - int64_t offset = pool->total_allocated_bytes(); - char* string_copy = (char*)(pool->allocate(string_v->size)); - memory_copy(string_copy, string_v->data, string_v->size); - string_v->data = (convert_ptrs ? convert_to(offset) : string_copy); - } - } else { - string_v->data = nullptr; - string_v->size = 0; - } - } - - // copy collection slot - deep_copy_collection_slots( - dst, desc, - [pool](int64_t size) -> MemFootprint { - int64_t offset = pool->total_allocated_bytes(); - uint8_t* data = pool->allocate(size); - return {offset, data}; - }, - convert_ptrs); -} - -// Deep copy collection slots. -// NOTICE: The Tuple* shallow_copied_tuple must be initialized by calling memcpy function first ( -// copy data from origin tuple). -static void deep_copy_collection_slots(Tuple* shallow_copied_tuple, const TupleDescriptor& desc, - const GenMemFootprintFunc& gen_mem_footprint, - bool convert_ptrs) { - for (auto slot_desc : desc.collection_slots()) { - DCHECK(slot_desc->type().is_collection_type()); - if (shallow_copied_tuple->is_null(slot_desc->null_indicator_offset())) { - continue; - } - - // copy collection item - CollectionValue* cv = shallow_copied_tuple->get_collection_slot(slot_desc->tuple_offset()); - CollectionValue::deep_copy_collection(cv, slot_desc->type().children[0], gen_mem_footprint, - convert_ptrs); - } -} - Tuple* Tuple::dcopy_with_new(const TupleDescriptor& desc, MemPool* pool, int64_t* bytes) { Tuple* result = (Tuple*)(pool->allocate(desc.byte_size())); *bytes = dcopy_with_new(result, desc); @@ -160,72 +100,4 @@ int64_t Tuple::release_string(const TupleDescriptor& desc) { return bytes; } -void Tuple::deep_copy(const TupleDescriptor& desc, char** data, int64_t* offset, - bool convert_ptrs) { - Tuple* dst = (Tuple*)(*data); - memory_copy(dst, this, desc.byte_size()); - *data += desc.byte_size(); - *offset += desc.byte_size(); - - for (auto slot_desc : desc.string_slots()) { - DCHECK(slot_desc->type().is_string_type()); - StringRef* string_v = dst->get_string_slot(slot_desc->tuple_offset()); - if (!dst->is_null(slot_desc->null_indicator_offset())) { - memory_copy(*data, string_v->data, string_v->size); - string_v->data = (convert_ptrs ? convert_to(*offset) : *data); - *data += string_v->size; - *offset += string_v->size; - } else { - string_v->data = (convert_ptrs ? convert_to(*offset) : *data); - string_v->size = 0; - } - } - - // copy collection slots - deep_copy_collection_slots( - dst, desc, - [offset, data](int64_t size) -> MemFootprint { - MemFootprint footprint = {*offset, reinterpret_cast(*data)}; - *offset += size; - *data += size; - return footprint; - }, - convert_ptrs); -} - -std::string Tuple::to_string(const TupleDescriptor& d) const { - std::stringstream out; - out << "("; - - bool first_value = true; - for (auto slot : d.slots()) { - if (!slot->is_materialized()) { - continue; - } - if (first_value) { - first_value = false; - } else { - out << " "; - } - - if (is_null(slot->null_indicator_offset())) { - out << "null"; - } else { - std::string value_str; - RawValue::print_value(get_slot(slot->tuple_offset()), slot->type(), -1, &value_str); - out << value_str; - } - } - - out << ")"; - return out.str(); -} - -std::string Tuple::to_string(const Tuple* t, const TupleDescriptor& d) { - if (t == nullptr) { - return "null"; - } - return t->to_string(d); -} - } // namespace doris diff --git a/be/src/runtime/tuple.h b/be/src/runtime/tuple.h index 4bda9003c5..1d226cbb1f 100644 --- a/be/src/runtime/tuple.h +++ b/be/src/runtime/tuple.h @@ -67,43 +67,11 @@ public: // The size of all referenced string and collection data. int64_t varlen_byte_size(const TupleDescriptor& desc) const; - // create a copy of 'this', including all of its referenced string data, - // using pool to allocate memory. Returns the copy. - // If 'convert_ptrs' is true, converts pointers that are part of the tuple - // into offsets in 'pool'. - Tuple* deep_copy(const TupleDescriptor& desc, MemPool* pool, bool convert_ptrs); - - Tuple* deep_copy(const TupleDescriptor& desc, MemPool* pool) { - return deep_copy(desc, pool, false); - } - - // create a copy of 'this', including all its referenced string data. This - // version does not allocate a tuple, instead copying 'dst'. dst must already - // be allocated to the correct size (desc.byte_size()) - // If 'convert_ptrs' is true, converts pointers that are part of the tuple - // into offsets in 'pool'. - void deep_copy(Tuple* dst, const TupleDescriptor& desc, MemPool* pool, bool convert_ptrs); - void deep_copy(Tuple* dst, const TupleDescriptor& desc, MemPool* pool) { - deep_copy(dst, desc, pool, false); - } - // deep copy use 'new', must be 'free' after use Tuple* dcopy_with_new(const TupleDescriptor& desc, MemPool* pool, int64_t* bytes); int64_t dcopy_with_new(Tuple* dst, const TupleDescriptor& desc); int64_t release_string(const TupleDescriptor& desc); - // create a copy of 'this', including all referenced string data, into - // data. The tuple is written first, followed by any strings. data and offset - // will be incremented by the total number of bytes written. data must already - // be allocated to the correct size. - // If 'convert_ptrs' is true, converts pointers that are part of the tuple - // into offsets in data, based on the provided offset. Otherwise they will be - // pointers directly into data. - void deep_copy(const TupleDescriptor& desc, char** data, int64_t* offset, bool convert_ptrs); - void deep_copy(const TupleDescriptor& desc, char** data, int64_t* offset) { - deep_copy(desc, data, offset, false); - } - // Turn null indicator bit on. // Turn null indicator bit on. For non-nullable slots, the mask will be 0 and // this is a no-op (but we don't have to branch to check is slots are nulalble). @@ -166,9 +134,6 @@ public: void* get_data() { return _data; } - std::string to_string(const TupleDescriptor& d) const; - static std::string to_string(const Tuple* t, const TupleDescriptor& d); - private: char _data[0]; }; diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h index aca6336f6b..32d2dd486b 100644 --- a/be/src/runtime/types.h +++ b/be/src/runtime/types.h @@ -201,9 +201,6 @@ struct TypeDescriptor { bool is_bitmap_type() const { return type == TYPE_OBJECT; } - /// Returns the byte size of this type. Returns 0 for variable length types. - int get_byte_size() const { return ::doris::get_byte_size(type); } - int get_slot_size() const { return ::doris::get_slot_size(type); } static inline int get_decimal_byte_size(int precision) { diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt index 073e497932..85ffc54135 100644 --- a/be/src/util/CMakeLists.txt +++ b/be/src/util/CMakeLists.txt @@ -26,7 +26,6 @@ set(UTIL_FILES arrow/row_block.cpp arrow/utils.cpp arrow/block_convertor.cpp - array_parser.cpp bfd_parser.cpp bitmap.cpp block_compression.cpp @@ -58,7 +57,6 @@ set(UTIL_FILES thrift_client.cpp thrift_server.cpp stack_util.cpp - symbols_util.cpp system_metrics.cpp url_parser.cpp url_coding.cpp @@ -105,7 +103,6 @@ set(UTIL_FILES hdfs_storage_backend.cpp hdfs_util.cpp time_lut.cpp - topn_counter.cpp cityhash102/city.cc tuple_row_zorder_compare.cpp telemetry/telemetry.cpp diff --git a/be/src/util/array_parser.cpp b/be/src/util/array_parser.cpp deleted file mode 100644 index 267b2e9896..0000000000 --- a/be/src/util/array_parser.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "util/array_parser.h" - -namespace doris { -std::unordered_map ArrayParser::_types_mapping = { - {FunctionContext::INVALID_TYPE, PrimitiveType::INVALID_TYPE}, - {FunctionContext::TYPE_NULL, PrimitiveType::TYPE_NULL}, - {FunctionContext::TYPE_BOOLEAN, PrimitiveType::TYPE_BOOLEAN}, - {FunctionContext::TYPE_TINYINT, PrimitiveType::TYPE_TINYINT}, - {FunctionContext::TYPE_SMALLINT, PrimitiveType::TYPE_SMALLINT}, - {FunctionContext::TYPE_INT, PrimitiveType::TYPE_INT}, - {FunctionContext::TYPE_BIGINT, PrimitiveType::TYPE_BIGINT}, - {FunctionContext::TYPE_LARGEINT, PrimitiveType::TYPE_LARGEINT}, - {FunctionContext::TYPE_FLOAT, PrimitiveType::TYPE_FLOAT}, - {FunctionContext::TYPE_DOUBLE, PrimitiveType::TYPE_DOUBLE}, - {FunctionContext::TYPE_DATE, PrimitiveType::TYPE_DATE}, - {FunctionContext::TYPE_DATETIME, PrimitiveType::TYPE_DATETIME}, - {FunctionContext::TYPE_CHAR, PrimitiveType::TYPE_CHAR}, - {FunctionContext::TYPE_VARCHAR, PrimitiveType::TYPE_VARCHAR}, - {FunctionContext::TYPE_HLL, PrimitiveType::TYPE_HLL}, - {FunctionContext::TYPE_STRING, PrimitiveType::TYPE_STRING}, - {FunctionContext::TYPE_DECIMALV2, PrimitiveType::TYPE_DECIMALV2}, - {FunctionContext::TYPE_OBJECT, PrimitiveType::TYPE_OBJECT}, - {FunctionContext::TYPE_ARRAY, PrimitiveType::TYPE_ARRAY}, -}; - -} \ No newline at end of file diff --git a/be/src/util/array_parser.h b/be/src/util/array_parser.h deleted file mode 100644 index 6600f6ea9c..0000000000 --- a/be/src/util/array_parser.h +++ /dev/null @@ -1,247 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include - -#include "common/status.h" -#include "runtime/collection_value.h" -#include "runtime/large_int_value.h" -#include "runtime/primitive_type.h" -#include "runtime/types.h" -#include "util/mem_util.hpp" - -namespace doris { - -template -using ConstArray = typename rapidjson::GenericValue::ConstArray; - -template -using ConstArrayIterator = typename ConstArray::ValueIterator; - -class ArrayParser { -public: - static Status parse(CollectionVal& array_val, FunctionContext* context, - const StringVal& str_val) { - rapidjson::Document document; - if (document.Parse(reinterpret_cast(str_val.ptr), str_val.len).HasParseError() || - !document.IsArray()) { - return Status::RuntimeError("Failed to parse the json to array."); - } - if (document.IsNull()) { - array_val = CollectionVal::null(); - return Status::OK(); - } - auto type_desc = _convert_to_type_descriptor(context->get_return_type()); - return _parse>( - array_val, context, - reinterpret_cast(&document)->GetArray(), type_desc); - } - -private: - static TypeDescriptor _convert_to_type_descriptor( - FunctionContext::TypeDesc function_type_desc) { - auto iterator = _types_mapping.find(function_type_desc.type); - if (iterator == _types_mapping.end()) { - return TypeDescriptor(); - } - auto type_desc = TypeDescriptor(iterator->second); - type_desc.len = function_type_desc.len; - type_desc.precision = function_type_desc.precision; - type_desc.scale = function_type_desc.scale; - for (auto child_type_desc : function_type_desc.children) { - type_desc.children.push_back(_convert_to_type_descriptor(child_type_desc)); - } - return type_desc; - } - - template - static Status _parse(CollectionVal& array_val, FunctionContext* context, - const ConstArray& array, const TypeDescriptor& type_desc) { - if (array.Empty()) { - CollectionValue(0).to_collection_val(&array_val); - return Status::OK(); - } - auto child_type_desc = type_desc.children[0]; - auto item_type = child_type_desc.type; - CollectionValue collection_value; - CollectionValue::init_collection(context, array.Size(), item_type, &collection_value); - auto iterator = collection_value.iterator(item_type); - for (auto it = array.Begin(); it != array.End(); ++it, iterator.next()) { - if (it->IsNull()) { - auto null = AnyVal(true); - iterator.set(&null); - continue; - } else if (!_is_type_valid(it, item_type)) { - return Status::RuntimeError("Failed to parse the json to array."); - } - AnyVal* val = nullptr; - Status status = _parse(&val, context, it, child_type_desc); - if (!status.ok()) { - return status; - } - iterator.set(val); - } - collection_value.to_collection_val(&array_val); - return Status::OK(); - } - - template - static bool _is_type_valid(const ConstArrayIterator iterator, - const PrimitiveType type) { - switch (type) { - case TYPE_NULL: - return iterator->IsNull(); - case TYPE_BOOLEAN: - return iterator->IsBool(); - case TYPE_TINYINT: - case TYPE_SMALLINT: - case TYPE_INT: - case TYPE_BIGINT: - case TYPE_FLOAT: - case TYPE_DOUBLE: - return iterator->IsNumber(); - case TYPE_LARGEINT: - return iterator->IsNumber() || iterator->IsString(); - case TYPE_DATE: - case TYPE_DATETIME: - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_HLL: - case TYPE_STRING: - return iterator->IsString(); - case TYPE_OBJECT: - return iterator->IsObject(); - case TYPE_ARRAY: - return iterator->IsArray(); - case TYPE_DECIMALV2: - return iterator->IsNumber() || iterator->IsString(); - default: - return false; - } - } - - template - static Status _parse(AnyVal** val, FunctionContext* context, - const ConstArrayIterator iterator, - const TypeDescriptor& type_desc) { - switch (type_desc.type) { - case TYPE_ARRAY: - *val = reinterpret_cast(context->allocate(sizeof(CollectionVal))); - new (*val) CollectionVal(); - return _parse(*reinterpret_cast(*val), context, - iterator->GetArray(), type_desc); - case TYPE_BOOLEAN: - *val = reinterpret_cast(context->allocate(sizeof(BooleanVal))); - new (*val) BooleanVal(iterator->GetBool()); - break; - case TYPE_TINYINT: - *val = reinterpret_cast(context->allocate(sizeof(TinyIntVal))); - new (*val) TinyIntVal(iterator->GetInt()); - break; - case TYPE_SMALLINT: - *val = reinterpret_cast(context->allocate(sizeof(SmallIntVal))); - new (*val) SmallIntVal(iterator->GetInt()); - break; - case TYPE_INT: - *val = reinterpret_cast(context->allocate(sizeof(IntVal))); - new (*val) IntVal(iterator->GetInt()); - break; - case TYPE_BIGINT: - *val = reinterpret_cast(context->allocate(sizeof(BigIntVal))); - new (*val) BigIntVal(iterator->GetInt64()); - break; - case TYPE_LARGEINT: { - __int128 value = 0; - if (iterator->IsNumber()) { - if (iterator->IsUint64()) { - value = iterator->GetUint64(); - } else { - return Status::RuntimeError( - "rapidjson can't parse the number larger than Uint64, please use " - "String to parse as LARGEINT"); - } - } else { - std::string_view view(iterator->GetString(), iterator->GetStringLength()); - std::stringstream stream; - stream << view; - stream >> value; - } - *val = reinterpret_cast(context->aligned_allocate(16, sizeof(LargeIntVal))); - new (*val) LargeIntVal(value); - break; - } - case TYPE_FLOAT: - *val = reinterpret_cast(context->allocate(sizeof(FloatVal))); - new (*val) FloatVal(iterator->GetFloat()); - break; - case TYPE_DOUBLE: - *val = reinterpret_cast(context->allocate(sizeof(DoubleVal))); - new (*val) DoubleVal(iterator->GetDouble()); - break; - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_STRING: { - *val = reinterpret_cast(context->allocate(sizeof(StringVal))); - new (*val) StringVal(context->allocate(iterator->GetStringLength()), - iterator->GetStringLength()); - auto string_val = reinterpret_cast(*val); - memory_copy(string_val->ptr, iterator->GetString(), iterator->GetStringLength()); - break; - } - case TYPE_DATE: - case TYPE_DATETIME: { - DateTimeValue value; - value.from_date_str(iterator->GetString(), iterator->GetStringLength()); - *val = reinterpret_cast(context->allocate(sizeof(DateTimeVal))); - new (*val) DateTimeVal(); - value.to_datetime_val(static_cast(*val)); - break; - } - case TYPE_DECIMALV2: { - *val = reinterpret_cast(context->aligned_allocate(16, sizeof(DecimalV2Val))); - new (*val) DecimalV2Val(); - - if (iterator->IsNumber()) { - if (iterator->IsUint64()) { - DecimalV2Value(iterator->GetUint64(), 0) - .to_decimal_val(static_cast(*val)); - } else { - DecimalV2Value value; - value.assign_from_double(iterator->GetDouble()); - value.to_decimal_val(static_cast(*val)); - } - } else { - std::string_view view(iterator->GetString(), iterator->GetStringLength()); - DecimalV2Value(view).to_decimal_val(static_cast(*val)); - } - break; - } - default: - return Status::RuntimeError("Failed to parse json to type ({}).", - std::to_string(type_desc.type)); - } - return Status::OK(); - } - -private: - static std::unordered_map _types_mapping; -}; -} // namespace doris diff --git a/be/src/util/symbols_util.cpp b/be/src/util/symbols_util.cpp deleted file mode 100644 index 7ccd0793b6..0000000000 --- a/be/src/util/symbols_util.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/symbols-util.cc -// and modified by Doris - -#include "util/symbols_util.h" - -#include - -#include -#include - -namespace doris { -// For the rules about gcc-compatible name mangling, see: -// http://mentorembedded.github.io/cxx-abi/abi.html#mangling -// This implementation *is* not generally compatible. It is hard coded to -// only work with functions that implement the UDF or UDA signature. That is, -// functions of the form: -// namespace::Function(doris_udf::FunctionContext*, const doris_udf::AnyVal&, etc) -// -// The general idea is to walk the types left to right and output them. This happens -// in a single pass. User literals are output as . There are many reserved, -// usually single character tokens for native types and specifying if something is a -// pointer. -// -// One additional piece of complexity is that repeated literals are compressed out. -// As literals are output, they are associated with an ID. The next time that -// we encounter the literal, we output the ID instead. -// We don't implement this generally since the way the literals are added to the -// dictionary is much more general than we need. -// e.g. for the literal ns1::ns2::class::type, -// the dictionary would add 4 literals: 'ns1', 'ns1::ns2', 'ns1::ns2::class', -// 'ns1::ns2::class::type' -// We instead take some shortcuts since we know all the argument types are -// types we define. - -// Mangled symbols must start with this. -const char* MANGLE_PREFIX = "_Z"; - -bool SymbolsUtil::is_mangled(const std::string& symbol) { - return strncmp(symbol.c_str(), MANGLE_PREFIX, strlen(MANGLE_PREFIX)) == 0; -} - -std::string SymbolsUtil::demangle(const std::string& name) { - int status = 0; - char* demangled_name = abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status); - if (status != 0) { - return name; - } - std::string result = demangled_name; - free(demangled_name); - return result; -} - -std::string SymbolsUtil::demangle_no_args(const std::string& symbol) { - std::string fn_name = demangle(symbol); - // Chop off argument list (e.g. "foo(int)" => "foo") - return fn_name.substr(0, fn_name.find('(')); -} - -std::string SymbolsUtil::demangle_name_only(const std::string& symbol) { - std::string fn_name = demangle_no_args(symbol); - // Chop off namespace and/or class name if present (e.g. "doris::foo" => "foo") - // TODO: fix for templates - return fn_name.substr(fn_name.find_last_of(':') + 1); -} - -// Appends to the stream. -// e.g. Hello --> "5Hello" -static void append_mangled_token(const std::string& s, std::stringstream* out) { - DCHECK(!s.empty()); - (*out) << s.size() << s; -} - -// Outputs the seq_id. This is base 36 encoded with an S prefix and _ suffix. -// As an added optimization, the "seq_id - 1" value is output with the first -// token as just "S". -// e.g. seq_id 0: "S_" -// seq_id 1: "S0_" -// seq_id 2: "S1_" -static void append_seq_id(int seq_id, std::stringstream* out) { - DCHECK_GE(seq_id, 0); - if (seq_id == 0) { - (*out) << "S_"; - return; - } - --seq_id; - char buffer[10]; - char* ptr = buffer + 10; - if (seq_id == 0) { - *--ptr = '0'; - } - while (seq_id != 0) { - DCHECK(ptr > buffer); - char c = static_cast(seq_id % 36); - *--ptr = (c < 10 ? '0' + c : 'A' + c - 10); - seq_id /= 36; - } - (*out) << "S"; - out->write(ptr, 10 - (ptr - buffer)); - (*out) << "_"; -} - -static void append_any_val_type(int namespace_id, const TypeDescriptor& type, - std::stringstream* s) { - (*s) << "N"; - // All the AnyVal types are in the doris_udf namespace, that token - // already came with doris_udf::FunctionContext - append_seq_id(namespace_id, s); - - switch (type.type) { - case TYPE_BOOLEAN: - append_mangled_token("BooleanVal", s); - break; - case TYPE_TINYINT: - append_mangled_token("TinyIntVal", s); - break; - case TYPE_SMALLINT: - append_mangled_token("SmallIntVal", s); - break; - case TYPE_INT: - append_mangled_token("IntVal", s); - break; - case TYPE_BIGINT: - append_mangled_token("BigIntVal", s); - break; - case TYPE_LARGEINT: - append_mangled_token("LargeIntVal", s); - break; - case TYPE_FLOAT: - append_mangled_token("FloatVal", s); - break; - case TYPE_TIME: - case TYPE_DOUBLE: - append_mangled_token("DoubleVal", s); - break; - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_HLL: - case TYPE_OBJECT: - case TYPE_STRING: - case TYPE_QUANTILE_STATE: - append_mangled_token("StringVal", s); - break; - case TYPE_DATE: - case TYPE_DATETIME: - append_mangled_token("DateTimeVal", s); - break; - case TYPE_DATEV2: - append_mangled_token("DateV2Val", s); - break; - case TYPE_DATETIMEV2: - append_mangled_token("DateTimeV2Val", s); - break; - case TYPE_DECIMALV2: - append_mangled_token("DecimalV2Val", s); - break; - case TYPE_DECIMAL32: - append_mangled_token("Decimal32Val", s); - break; - case TYPE_DECIMAL64: - append_mangled_token("Decimal64Val", s); - break; - case TYPE_DECIMAL128I: - append_mangled_token("Decimal128Val", s); - break; - default: - DCHECK(false) << "NYI: " << type.debug_string(); - } - (*s) << "E"; // end doris_udf namespace -} - -std::string SymbolsUtil::mangle_user_function(const std::string& fn_name, - const std::vector& arg_types, - bool has_var_args, TypeDescriptor* ret_arg_type) { - // We need to split fn_name by :: to separate scoping from tokens - const std::regex re("::"); - std::sregex_token_iterator it {fn_name.begin(), fn_name.end(), re, -1}; - std::vector name_tokens {it, {}}; - - // Mangled names use substitution as a builtin compression. The first time a token - // is seen, we output the raw token string and store the index ("seq_id"). The - // next time we see the same token, we output the index instead. - int seq_id = 0; - - // Sequence id for the doris_udf namespace token - int doris_udf_seq_id = -1; - - std::stringstream ss; - ss << MANGLE_PREFIX; - if (name_tokens.size() > 1) { - ss << "N"; // Start namespace - seq_id += name_tokens.size() - 1; // Append for all the name space tokens. - } - for (int i = 0; i < name_tokens.size(); ++i) { - append_mangled_token(name_tokens[i], &ss); - } - if (name_tokens.size() > 1) { - ss << "E"; // End fn namespace - } - ss << "PN"; // First argument and start of FunctionContext namespace - append_mangled_token("doris_udf", &ss); - doris_udf_seq_id = seq_id++; - append_mangled_token("FunctionContext", &ss); - ++seq_id; - ss << "E"; // E indicates end of namespace - - std::map argument_map; - for (int i = 0; i < arg_types.size(); ++i) { - int repeated_symbol_idx = -1; // Set to >0, if we've seen the symbol. - if (argument_map.find(arg_types[i].type) != argument_map.end()) { - repeated_symbol_idx = argument_map[arg_types[i].type]; - } - - if (has_var_args && i == arg_types.size() - 1) { - // We always specify varargs as int32 followed by the type. - ss << "i"; // The argument for the number of varargs. - ss << "P"; // This indicates what follows is a ptr (that is the array of varargs) - ++seq_id; // For "P" - if (repeated_symbol_idx > 0) { - append_seq_id(repeated_symbol_idx - 1, &ss); - continue; - } - } else { - if (repeated_symbol_idx > 0) { - append_seq_id(repeated_symbol_idx, &ss); - continue; - } - ss << "R"; // This indicates it is a reference type - ++seq_id; // For R. - } - - ss << "K"; // This indicates it is const - seq_id += 2; // For doris_udf::*Val, which is two tokens. - append_any_val_type(doris_udf_seq_id, arg_types[i], &ss); - argument_map[arg_types[i].type] = seq_id; - } - - // Output return argument. - if (ret_arg_type != nullptr) { - int repeated_symbol_idx = -1; - if (argument_map.find(ret_arg_type->type) != argument_map.end()) { - repeated_symbol_idx = argument_map[ret_arg_type->type]; - } - ss << "P"; // Return argument is a pointer - - if (repeated_symbol_idx != -1) { - // This is always last and a pointer type. - append_seq_id(argument_map[ret_arg_type->type] - 2, &ss); - } else { - append_any_val_type(doris_udf_seq_id, *ret_arg_type, &ss); - } - } - - return ss.str(); -} - -std::string SymbolsUtil::mangle_prepare_or_close_function(const std::string& fn_name) { - // We need to split fn_name by :: to separate scoping from tokens - const std::regex re("::"); - std::sregex_token_iterator it {fn_name.begin(), fn_name.end(), re, -1}; - std::vector name_tokens {it, {}}; - - // Mangled names use substitution as a builtin compression. The first time a token - // is seen, we output the raw token string and store the index ("seq_id"). The - // next time we see the same token, we output the index instead. - int seq_id = 0; - - std::stringstream ss; - ss << MANGLE_PREFIX; - if (name_tokens.size() > 1) { - ss << "N"; // Start namespace - seq_id += name_tokens.size() - 1; // Append for all the name space tokens. - } - for (int i = 0; i < name_tokens.size(); ++i) { - append_mangled_token(name_tokens[i], &ss); - } - if (name_tokens.size() > 1) { - ss << "E"; // End fn namespace - } - - ss << "PN"; // FunctionContext* argument and start of FunctionContext namespace - append_mangled_token("doris_udf", &ss); - append_mangled_token("FunctionContext", &ss); - ss << "E"; // E indicates end of namespace - - ss << "NS"; // FunctionStateScope argument - ss << seq_id; - ss << "_"; - append_mangled_token("FunctionStateScope", &ss); - ss << "E"; // E indicates end of namespace - - return ss.str(); -} -} // namespace doris diff --git a/be/src/util/symbols_util.h b/be/src/util/symbols_util.h deleted file mode 100644 index 1c4dc7b8b5..0000000000 --- a/be/src/util/symbols_util.h +++ /dev/null @@ -1,72 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/symbols-util.h -// and modified by Doris - -#pragma once - -#include -#include - -#include "runtime/types.h" - -namespace doris { - -/// Utility class to manipulate c++/IR symbols, mangling and demangling names. -class SymbolsUtil { -public: - /// Returns true if this symbol is mangled. - static bool is_mangled(const std::string& symbol); - - /// Returns the demangled string. The name is assumed to be a mangled string using the - /// gcc/llvm convention. - /// Returns the original input if it cannot be demangled. - static std::string demangle(const std::string& name); - - /// Returns the fully-qualified function name of 'symbol' (i.e. it strips the arguments - /// but retains any namespace and class names). 'symbol' may be mangled or unmangled. - /// Returns the original input if it cannot be demangled. - /// Example: "doris::foo(int arg1)" => "doris::foo" - static std::string demangle_no_args(const std::string& symbol); - - /// Returns the function name of 'symbol' (i.e., it strips the arguments and any - /// namespace/class qualifiers). 'symbol' may be mangled or unmangled. - /// Returns the original input if it cannot be demangled. - /// Example: "doris::foo(int arg1)" => "foo" - static std::string demangle_name_only(const std::string& symbol); - - /// Mangles fn_name with 'arg_types' to the function signature for user functions. - /// This maps types to AnyVal* and automatically adds the FunctionContext* - /// as the first argument. - /// The fn_name must be fully qualified. i.e namespace::class::fn. - /// if 'has_var_args' is true, the last argument in arg_types can be variable. - /// if 'ret_argument' is non-null, it is added as a last return argument. - /// TODO: this is not a general mangling function and that is more difficult to - /// do. Find a library to do this. - /// There is no place we require this to be perfect, if we can't do this right, - /// the user will need to specify the full mangled string. - static std::string mangle_user_function(const std::string& fn_name, - const std::vector& arg_types, - bool has_var_args, TypeDescriptor* ret_argument); - - /// Mangles fn_name assuming arguments - /// (doris_udf::FunctionContext*, doris_udf::FunctionContext::FunctionStateScope). - static std::string mangle_prepare_or_close_function(const std::string& fn_name); -}; - -} // namespace doris diff --git a/be/src/util/topn_counter.cpp b/be/src/util/topn_counter.cpp deleted file mode 100644 index 321612cf12..0000000000 --- a/be/src/util/topn_counter.cpp +++ /dev/null @@ -1,148 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "topn_counter.h" - -#include -#include - -#include - -#include "gen_cpp/olap_common.pb.h" -#include "slice.h" - -namespace doris { - -void TopNCounter::add_item(const std::string& item, uint64_t incrementCount) { - auto iter = _counter_map->find(item); - if (iter != _counter_map->end()) { - iter->second.add_count(incrementCount); - } else { - _counter_map->insert(std::make_pair(item, Counter(item, incrementCount))); - } - _ordered = false; -} - -void TopNCounter::serialize(std::string* buffer) { - sort_retain(_capacity); - PTopNCounter topn_counter; - topn_counter.set_top_num(_top_num); - topn_counter.set_space_expand_rate(_space_expand_rate); - for (std::vector::const_iterator it = _counter_vec->begin(); it != _counter_vec->end(); - ++it) { - PCounter* counter = topn_counter.add_counter(); - counter->set_item(it->get_item()); - counter->set_count(it->get_count()); - } - topn_counter.SerializeToString(buffer); -} - -bool TopNCounter::deserialize(const doris::Slice& src) { - PTopNCounter topn_counter; - if (!topn_counter.ParseFromArray(src.data, src.size)) { - LOG(WARNING) << "topn counter deserialize failed"; - return false; - } - - _space_expand_rate = topn_counter.space_expand_rate(); - set_top_num(topn_counter.top_num()); - for (int i = 0; i < topn_counter.counter_size(); ++i) { - const PCounter& counter = topn_counter.counter(i); - _counter_map->insert( - std::make_pair(counter.item(), Counter(counter.item(), counter.count()))); - _counter_vec->emplace_back(counter.item(), counter.count()); - } - _ordered = true; - return true; -} - -void TopNCounter::sort_retain(uint32_t capacity) { - _counter_vec->clear(); - sort_retain(capacity, _counter_vec); - _ordered = true; -} - -void TopNCounter::sort_retain(uint32_t capacity, std::vector* sort_vec) { - for (std::unordered_map::const_iterator it = _counter_map->begin(); - it != _counter_map->end(); ++it) { - sort_vec->emplace_back(it->second.get_item(), it->second.get_count()); - } - - std::sort(sort_vec->begin(), sort_vec->end(), TopNComparator()); - if (sort_vec->size() > capacity) { - for (uint32_t i = 0, n = sort_vec->size() - capacity; i < n; ++i) { - auto& counter = sort_vec->back(); - _counter_map->erase(counter.get_item()); - sort_vec->pop_back(); - } - } -} - -// Based on the parallel version of the Space Saving algorithm as described in: -// A parallel space saving algorithm for frequent items and the Hurwitz zeta distribution by Massimo Cafaro, et al. -void TopNCounter::merge(doris::TopNCounter&& other) { - if (other._counter_map->size() == 0) { - return; - } - - _space_expand_rate = other._space_expand_rate; - set_top_num(other._top_num); - bool this_full = _counter_map->size() >= _capacity; - bool another_full = other._counter_map->size() >= other._capacity; - - uint64_t m1 = this_full ? _counter_vec->back().get_count() : 0; - uint64_t m2 = another_full ? other._counter_vec->back().get_count() : 0; - - if (another_full == true) { - for (auto& entry : *(this->_counter_map)) { - entry.second.add_count(m2); - } - } - - for (auto& other_entry : *(other._counter_map)) { - auto itr = this->_counter_map->find(other_entry.first); - if (itr != _counter_map->end()) { - itr->second.add_count(other_entry.second.get_count() - m2); - } else { - this->_counter_map->insert(std::make_pair( - other_entry.first, - Counter(other_entry.first, other_entry.second.get_count() + m1))); - } - } - _ordered = false; - sort_retain(_capacity); -} - -void TopNCounter::finalize(std::string& finalize_str) { - if (!_ordered) { - sort_retain(_top_num); - } - // use json format print - rapidjson::StringBuffer buffer; - rapidjson::Writer writer(buffer); - uint32_t k = 0; - writer.StartObject(); - for (std::vector::const_iterator it = _counter_vec->begin(); - it != _counter_vec->end() && k < _top_num; ++it, ++k) { - writer.Key(it->get_item().data()); - writer.Uint64(it->get_count()); - } - writer.EndObject(); - finalize_str = buffer.GetString(); -} - -} // namespace doris diff --git a/be/src/util/topn_counter.h b/be/src/util/topn_counter.h deleted file mode 100644 index 51fabceed4..0000000000 --- a/be/src/util/topn_counter.h +++ /dev/null @@ -1,172 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "common/logging.h" -#include "runtime/datetime_value.h" -#include "runtime/decimalv2_value.h" -#include "runtime/large_int_value.h" -#include "udf/udf.h" - -namespace doris { - -static const uint32_t DEFAULT_SPACE_EXPAND_RATE = 50; - -struct Slice; - -class Counter { -public: - Counter() = default; - - Counter(const std::string& item, uint64_t count) : _item(item), _count(count) {} - - uint64_t get_count() const { return _count; } - - const std::string& get_item() const { return _item; } - - void add_count(uint64_t count) { _count += count; } - - bool operator==(const Counter& other) { - if (_item.compare(other._item) != 0) { - return false; - } - if (_count != other._count) { - return false; - } - return true; - } - -private: - std::string _item; - uint64_t _count; -}; - -// Refer to TopNCounter.java in https://github.com/apache/kylin -// Based on the Space-Saving algorithm and the Stream-Summary data structure as described in: -// Efficient Computation of Frequent and Top-k Elements in Data Streams by Metwally, Agrawal, and Abbadi -class TopNCounter { -public: - TopNCounter(uint32_t space_expand_rate = DEFAULT_SPACE_EXPAND_RATE) - : _top_num(0), - _space_expand_rate(space_expand_rate), - _capacity(0), - _ordered(false), - _counter_map(new std::unordered_map(_capacity)), - _counter_vec(new std::vector(_capacity)) {} - - TopNCounter(const Slice& src) - : _top_num(0), - _space_expand_rate(0), - _capacity(0), - _ordered(false), - _counter_map(new std::unordered_map(_capacity)), - _counter_vec(new std::vector(_capacity)) { - bool res = deserialize(src); - DCHECK(res); - } - - ~TopNCounter() { - delete _counter_map; - delete _counter_vec; - } - - template - void add_item(const T& item) { - add_item(item, 1); - } - - void add_item(const BooleanVal& item, uint64_t incrementCount) { - add_item_numeric(item, incrementCount); - } - void add_item(const TinyIntVal& item, uint64_t incrementCount) { - add_item_numeric(item, incrementCount); - } - void add_item(const SmallIntVal& item, uint64_t incrementCount) { - add_item_numeric(item, incrementCount); - } - void add_item(const IntVal& item, uint64_t incrementCount) { - add_item_numeric(item, incrementCount); - } - void add_item(const BigIntVal& item, uint64_t incrementCount) { - add_item_numeric(item, incrementCount); - } - void add_item(const FloatVal& item, uint64_t incrementCount) { - add_item_numeric(item, incrementCount); - } - void add_item(const DoubleVal& item, uint64_t incrementCount) { - add_item_numeric(item, incrementCount); - } - void add_item(const StringVal& item, uint64_t incrementCount) { - add_item(std::string((char*)item.ptr, item.len), incrementCount); - } - void add_item(const DateTimeVal& item, uint64_t incrementCount) { - char str[MAX_DTVALUE_STR_LEN]; - DateTimeValue::from_datetime_val(item).to_string(str); - add_item(std::string(str), incrementCount); - } - void add_item(const LargeIntVal& item, uint64_t incrementCount) { - add_item(LargeIntValue::to_string(item.val), incrementCount); - } - void add_item(const DecimalV2Val& item, uint64_t incrementCount) { - add_item(DecimalV2Value::from_decimal_val(item).to_string(), incrementCount); - } - - template - void add_item_numeric(const T& item, uint64_t incrementCount) { - add_item(std::to_string(item.val), incrementCount); - } - - void add_item(const std::string& item, uint64_t incrementCount); - - void serialize(std::string* buffer); - - bool deserialize(const Slice& src); - - void merge(doris::TopNCounter&& other); - - // Sort counter by count value and record it in _counter_vec - void sort_retain(uint32_t capacity); - - void sort_retain(uint32_t capacity, std::vector* sort_vec); - - void finalize(std::string&); - - void set_top_num(uint32_t top_num) { - _top_num = top_num; - _capacity = top_num * _space_expand_rate; - } - -private: - uint32_t _top_num; - uint32_t _space_expand_rate; - uint64_t _capacity; - bool _ordered; - std::unordered_map* _counter_map; - std::vector* _counter_vec; -}; - -class TopNComparator { -public: - bool operator()(const Counter& s1, const Counter& s2) { - return s1.get_count() > s2.get_count(); - } -}; -} // namespace doris diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index ca89e1715a..eabafd755e 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -819,113 +819,6 @@ inline bool Block::is_column_data_null(const doris::TypeDescriptor& type_desc, } } -// TODO: need to refactor this function, too long. -void Block::deep_copy_slot(void* dst, MemPool* pool, const doris::TypeDescriptor& type_desc, - const StringRef& data_ref, const IColumn* column, int row, - bool padding_char) { - if (type_desc.is_collection_type()) { - if (type_desc.type != TYPE_ARRAY) { - return; - } - - Field field; - column->get(row, field); - const auto& array = field.get(); - auto collection_value = reinterpret_cast(dst); - auto item_type_desc = type_desc.children.front(); - CollectionValue::init_collection(pool, array.size(), item_type_desc.type, collection_value); - - const ColumnArray* array_column = nullptr; - if (is_column_nullable(*column)) { - auto& nested_column = - reinterpret_cast(column)->get_nested_column(); - array_column = reinterpret_cast(&nested_column); - } else { - array_column = reinterpret_cast(column); - } - auto item_column = array_column->get_data_ptr().get(); - auto offset = array_column->get_offsets()[row - 1]; - auto iterator = collection_value->iterator(item_type_desc.type); - for (int i = 0; i < collection_value->length(); ++i) { - if (array[i].is_null()) { - const auto& null_value = doris_udf::AnyVal(true); - iterator.set(&null_value); - } else { - auto item_offset = offset + i; - const auto& data_ref = item_type_desc.type != TYPE_ARRAY - ? item_column->get_data_at(item_offset) - : StringRef {}; - if (item_type_desc.is_date_type()) { - // In CollectionValue, date type data is stored as either uint24_t or uint64_t. - DateTimeValue datetime_value; - deep_copy_slot(&datetime_value, pool, item_type_desc, data_ref, item_column, - item_offset, padding_char); - DateTimeVal datetime_val; - datetime_value.to_datetime_val(&datetime_val); - iterator.set(&datetime_val); - } else if (item_type_desc.is_decimal_v2_type()) { - // In CollectionValue, decimal type data is stored as decimal12_t. - DecimalV2Value decimal_value; - deep_copy_slot(&decimal_value, pool, item_type_desc, data_ref, item_column, - item_offset, padding_char); - DecimalV2Val decimal_val; - decimal_value.to_decimal_val(&decimal_val); - iterator.set(&decimal_val); - } else { - deep_copy_slot(iterator.get(), pool, item_type_desc, data_ref, item_column, - item_offset, padding_char); - } - } - iterator.next(); - } - } else if (type_desc.is_date_type()) { - VecDateTimeValue ts = - *reinterpret_cast(data_ref.data); - DateTimeValue dt; - ts.convert_vec_dt_to_dt(&dt); - memcpy(dst, &dt, sizeof(DateTimeValue)); - } else if (type_desc.type == TYPE_OBJECT) { - auto bitmap_value = (BitmapValue*)(data_ref.data); - auto size = bitmap_value->getSizeInBytes(); - - // serialize the content of string - // TODO: NEED TO REWRITE COMPLETELY. the way writing now is WRONG. - // StringRef shouldn't managing exclusive memory cause it will break RAII. - // besides, accessing object which is essentially const by non-const object - // is UB! - auto string_slot = reinterpret_cast(dst); - string_slot->data = reinterpret_cast(pool->allocate(size)); - bitmap_value->write(const_cast(string_slot->data)); //! - string_slot->size = size; - } else if (type_desc.type == TYPE_HLL) { - auto hll_value = (HyperLogLog*)(data_ref.data); - auto size = hll_value->max_serialized_size(); - auto string_slot = reinterpret_cast(dst); - string_slot->data = reinterpret_cast(pool->allocate(size)); - size_t actual_size = hll_value->serialize((uint8_t*)string_slot->data); - string_slot->size = actual_size; - } else if (type_desc.is_string_type()) { // TYPE_OBJECT and TYPE_HLL must be handled before. - memcpy(dst, (const void*)(&data_ref), sizeof(data_ref)); - // Copy the content of string - if (padding_char && type_desc.type == TYPE_CHAR) { - // serialize the content of string - auto string_slot = reinterpret_cast(dst); - string_slot->data = reinterpret_cast(pool->allocate(type_desc.len)); - string_slot->size = type_desc.len; - memset(const_cast(string_slot->data), 0, type_desc.len); //! - memcpy(const_cast(string_slot->data), data_ref.data, data_ref.size); //! - } else { - auto str_ptr = pool->allocate(data_ref.size); - memcpy(str_ptr, data_ref.data, data_ref.size); - auto string_slot = reinterpret_cast(dst); - string_slot->data = reinterpret_cast(str_ptr); - string_slot->size = data_ref.size; - } - } else { - memcpy(dst, data_ref.data, data_ref.size); - } -} - MutableBlock::MutableBlock(const std::vector& tuple_descs, int reserve_size, bool ignore_trivial_slot) { for (auto tuple_desc : tuple_descs) { diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h index 86138a63f8..905551a2fc 100644 --- a/be/src/vec/core/block.h +++ b/be/src/vec/core/block.h @@ -42,7 +42,6 @@ namespace doris { class MemPool; class RowDescriptor; class Status; -class Tuple; class TupleDescriptor; struct TypeDescriptor; @@ -371,9 +370,6 @@ private: void erase_impl(size_t position); bool is_column_data_null(const doris::TypeDescriptor& type_desc, const StringRef& data_ref, const IColumn* column_with_type_and_name, int row); - void deep_copy_slot(void* dst, MemPool* pool, const doris::TypeDescriptor& type_desc, - const StringRef& data_ref, const IColumn* column, int row, - bool padding_char); }; using Blocks = std::vector; diff --git a/be/src/vec/exec/data_gen_functions/vnumbers_tvf.h b/be/src/vec/exec/data_gen_functions/vnumbers_tvf.h index 0c83aae98f..708ed87050 100644 --- a/be/src/vec/exec/data_gen_functions/vnumbers_tvf.h +++ b/be/src/vec/exec/data_gen_functions/vnumbers_tvf.h @@ -25,7 +25,6 @@ namespace doris { class TextConverter; -class Tuple; class TupleDescriptor; class RuntimeState; class MemPool; diff --git a/be/src/vec/exec/vdata_gen_scan_node.h b/be/src/vec/exec/vdata_gen_scan_node.h index 18ca2c040f..7993fe1511 100644 --- a/be/src/vec/exec/vdata_gen_scan_node.h +++ b/be/src/vec/exec/vdata_gen_scan_node.h @@ -26,7 +26,6 @@ namespace doris { class TextConverter; -class Tuple; class TupleDescriptor; class RuntimeState; class MemPool; diff --git a/be/src/vec/exec/vmysql_scan_node.cpp b/be/src/vec/exec/vmysql_scan_node.cpp index 332fa0235b..cda26efd48 100644 --- a/be/src/vec/exec/vmysql_scan_node.cpp +++ b/be/src/vec/exec/vmysql_scan_node.cpp @@ -127,19 +127,6 @@ Status VMysqlScanNode::open(RuntimeState* state) { return Status::OK(); } -Status VMysqlScanNode::write_text_slot(char* value, int value_length, SlotDescriptor* slot, - RuntimeState* state) { - if (!_text_converter->write_slot(slot, _tuple, value, value_length, true, false, - _tuple_pool.get())) { - std::stringstream ss; - ss << "Fail to convert mysql value:'" << value << "' to " << slot->type() << " on column:`" - << slot->col_name() + "`"; - return Status::InternalError(ss.str()); - } - - return Status::OK(); -} - Status VMysqlScanNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { if (state == nullptr || block == nullptr || eos == nullptr) { return Status::InternalError("input is nullptr"); diff --git a/be/src/vec/exec/vmysql_scan_node.h b/be/src/vec/exec/vmysql_scan_node.h index 5bea0fb388..bd431f6b8a 100644 --- a/be/src/vec/exec/vmysql_scan_node.h +++ b/be/src/vec/exec/vmysql_scan_node.h @@ -58,10 +58,6 @@ private: vectorized::MutableColumnPtr* column_ptr, RuntimeState* state); // Write debug string of this into out. void debug_string(int indentation_level, std::stringstream* out) const override; - // Writes a slot in tuple from an MySQL value containing text data. - // The Mysql value is converted into the appropriate target type. - Status write_text_slot(char* value, int value_length, SlotDescriptor* slot, - RuntimeState* state); bool _is_init; MysqlScannerParam _my_param; @@ -86,8 +82,6 @@ private: std::unique_ptr _mysql_scanner; // Helper class for converting text to other types; std::unique_ptr _text_converter; - // Current tuple. - doris::Tuple* _tuple = nullptr; }; } // namespace vectorized } // namespace doris \ No newline at end of file diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 06af3c6c19..07cff73325 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1949,6 +1949,45 @@ public: } }; +namespace MoneyFormat { + +template +static StringVal do_money_format(FunctionContext* context, const T int_value, + const int32_t frac_value = 0) { + char local[N]; + char* p = SimpleItoaWithCommas(int_value, local, sizeof(local)); + int32_t string_val_len = local + sizeof(local) - p + 3; + StringVal result = StringVal::create_temp_string_val(context, string_val_len); + memcpy(result.ptr, p, string_val_len - 3); + *(result.ptr + string_val_len - 3) = '.'; + *(result.ptr + string_val_len - 2) = '0' + (frac_value / 10); + *(result.ptr + string_val_len - 1) = '0' + (frac_value % 10); + return result; +}; + +// Note string value must be valid decimal string which contains two digits after the decimal point +static StringVal do_money_format(FunctionContext* context, const string& value) { + bool is_positive = (value[0] != '-'); + int32_t result_len = value.size() + (value.size() - (is_positive ? 4 : 5)) / 3; + StringVal result = StringVal::create_temp_string_val(context, result_len); + if (!is_positive) { + *result.ptr = '-'; + } + for (int i = value.size() - 4, j = result_len - 4; i >= 0; i = i - 3, j = j - 4) { + *(result.ptr + j) = *(value.data() + i); + if (i - 1 < 0) break; + *(result.ptr + j - 1) = *(value.data() + i - 1); + if (i - 2 < 0) break; + *(result.ptr + j - 2) = *(value.data() + i - 2); + if (j - 3 > 1 || (j - 3 == 1 && is_positive)) { + *(result.ptr + j - 3) = ','; + } + } + memcpy(result.ptr + result_len - 3, value.data() + value.size() - 3, 3); + return result; +}; + +} // namespace MoneyFormat struct MoneyFormatDoubleImpl { static DataTypes get_variadic_argument_types() { return {std::make_shared()}; } @@ -1958,7 +1997,7 @@ struct MoneyFormatDoubleImpl { for (size_t i = 0; i < input_rows_count; i++) { double value = MathFunctions::my_double_round(data_column->get_element(i), 2, false, false); - StringVal str = StringFunctions::do_money_format(context, fmt::format("{:.2f}", value)); + StringVal str = MoneyFormat::do_money_format(context, fmt::format("{:.2f}", value)); result_column->insert_data(reinterpret_cast(str.ptr), str.len); } } @@ -1972,7 +2011,7 @@ struct MoneyFormatInt64Impl { const auto* data_column = assert_cast*>(col_ptr.get()); for (size_t i = 0; i < input_rows_count; i++) { Int64 value = data_column->get_element(i); - StringVal str = StringFunctions::do_money_format(context, value); + StringVal str = MoneyFormat::do_money_format(context, value); result_column->insert_data(reinterpret_cast(str.ptr), str.len); } } @@ -1986,7 +2025,7 @@ struct MoneyFormatInt128Impl { const auto* data_column = assert_cast*>(col_ptr.get()); for (size_t i = 0; i < input_rows_count; i++) { Int128 value = data_column->get_element(i); - StringVal str = StringFunctions::do_money_format(context, value); + StringVal str = MoneyFormat::do_money_format(context, value); result_column->insert_data(reinterpret_cast(str.ptr), str.len); } } @@ -2006,7 +2045,7 @@ struct MoneyFormatDecimalImpl { DecimalV2Value rounded(0); DecimalV2Value::from_decimal_val(value).round(&rounded, 2, HALF_UP); - StringVal str = StringFunctions::do_money_format( + StringVal str = MoneyFormat::do_money_format( context, rounded.int_value(), abs(rounded.frac_value() / 10000000)); result_column->insert_data(reinterpret_cast(str.ptr), str.len); @@ -2025,7 +2064,7 @@ struct MoneyFormatDecimalImpl { frac_part = frac_part * multiplier; } - StringVal str = StringFunctions::do_money_format( + StringVal str = MoneyFormat::do_money_format( context, decimal32_column->get_whole_part(i), frac_part); result_column->insert_data(reinterpret_cast(str.ptr), str.len); @@ -2044,7 +2083,7 @@ struct MoneyFormatDecimalImpl { frac_part = frac_part * multiplier; } - StringVal str = StringFunctions::do_money_format( + StringVal str = MoneyFormat::do_money_format( context, decimal64_column->get_whole_part(i), frac_part); result_column->insert_data(reinterpret_cast(str.ptr), str.len); @@ -2063,7 +2102,7 @@ struct MoneyFormatDecimalImpl { frac_part = frac_part * multiplier; } - StringVal str = StringFunctions::do_money_format( + StringVal str = MoneyFormat::do_money_format( context, decimal128_column->get_whole_part(i), frac_part); result_column->insert_data(reinterpret_cast(str.ptr), str.len); diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt index 794eb47a98..e3f12f525b 100644 --- a/be/test/CMakeLists.txt +++ b/be/test/CMakeLists.txt @@ -52,8 +52,6 @@ set(EXPRS_TEST_FILES # exprs/hybrid_set_test.cpp # exprs/in-predicate-test.cpp exprs/json_function_test.cpp - exprs/string_functions_test.cpp - exprs/math_functions_test.cpp exprs/bloom_filter_predicate_test.cpp ) set(GEO_TEST_FILES @@ -166,7 +164,6 @@ set(RUNTIME_TEST_FILES runtime/memory/chunk_allocator_test.cpp runtime/memory/system_allocator_test.cpp runtime/cache/partition_cache_test.cpp - runtime/collection_value_test.cpp runtime/free_pool_test.cpp #runtime/array_test.cpp ) @@ -232,7 +229,6 @@ set(UTIL_TEST_FILES util/sort_heap_test.cpp util/counts_test.cpp util/date_func_test.cpp - util/array_parser_test.cpp util/quantile_state_test.cpp util/hdfs_storage_backend_test.cpp util/interval_tree_test.cpp diff --git a/be/test/exprs/json_function_test.cpp b/be/test/exprs/json_function_test.cpp index b371799836..8fe64e74e4 100644 --- a/be/test/exprs/json_function_test.cpp +++ b/be/test/exprs/json_function_test.cpp @@ -38,217 +38,6 @@ public: JsonFunctionTest() {} }; -TEST_F(JsonFunctionTest, string) { - std::string json_string("{\"id\":\"name\",\"age\":11,\"money\":123000.789}"); - std::string path_string("$.id"); - rapidjson::Document document1; - rapidjson::Value* res1 = JsonFunctions::get_json_object(nullptr, json_string, path_string, - JSON_FUN_STRING, &document1); - EXPECT_EQ(std::string(res1->GetString()), "name"); - - std::string json_string2("{\"price a\": [0,1,2],\"couponFee\":0}"); - std::string path_string2("$.price a"); - rapidjson::Document document2; - rapidjson::Value* res2 = JsonFunctions::get_json_object(nullptr, json_string2, path_string2, - JSON_FUN_STRING, &document2); - rapidjson::StringBuffer buf2; - rapidjson::Writer writer2(buf2); - res2->Accept(writer2); - EXPECT_EQ(std::string(buf2.GetString()), "[0,1,2]"); - - std::string json_string3("{\"price a\": [],\"couponFee\":0}"); - std::string path_string3("$.price a"); - rapidjson::Document document3; - rapidjson::Value* res3 = JsonFunctions::get_json_object(nullptr, json_string3, path_string3, - JSON_FUN_STRING, &document3); - rapidjson::StringBuffer buf3; - rapidjson::Writer writer3(buf3); - res3->Accept(writer3); - EXPECT_EQ(std::string(buf3.GetString()), "[]"); - - std::string json_string4("{\"price a\": [],\"couponFee\":null}"); - std::string path_string4("$.couponFee"); - rapidjson::Document document4; - rapidjson::Value* res4 = JsonFunctions::get_json_object(nullptr, json_string4, path_string4, - JSON_FUN_STRING, &document4); - EXPECT_TRUE(res4->IsNull()); - - std::string json_string5( - "{\"blockNames\": {}," - "\"seatCategories\": [{\"areas\": [{\"areaId\": 205705999,\"blockIds\": []}," - "{\"areaId\": 205705998,\"blockIds\": []}],\"seatCategoryId\": 338937290}]}"); - std::string path_string5_1("$.blockNames"); - rapidjson::Document document5_1; - rapidjson::Value* res5_1 = JsonFunctions::get_json_object(nullptr, json_string5, path_string5_1, - JSON_FUN_STRING, &document5_1); - rapidjson::StringBuffer buf5_1; - rapidjson::Writer writer5_1(buf5_1); - res5_1->Accept(writer5_1); - EXPECT_EQ(std::string(buf5_1.GetString()), "{}"); - - std::string path_string5_2("$.seatCategories.areas.blockIds"); - rapidjson::Document document5_2; - rapidjson::Value* res5_2 = JsonFunctions::get_json_object(nullptr, json_string5, path_string5_2, - JSON_FUN_STRING, &document5_2); - rapidjson::StringBuffer buf5_2; - rapidjson::Writer writer5_2(buf5_2); - res5_2->Accept(writer5_2); - EXPECT_EQ(std::string(buf5_2.GetString()), "[]"); - - std::string path_string5_3("$.seatCategories.areas[0].areaId"); - rapidjson::Document document5_3; - rapidjson::Value* res5_3 = JsonFunctions::get_json_object(nullptr, json_string5, path_string5_3, - JSON_FUN_STRING, &document5_2); - rapidjson::StringBuffer buf5_3; - rapidjson::Writer writer5_3(buf5_3); - res5_3->Accept(writer5_3); - EXPECT_EQ(std::string(buf5_3.GetString()), "205705999"); -} - -TEST_F(JsonFunctionTest, json_quote) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - EXPECT_EQ(StringVal::null(), JsonFunctions::json_quote(context, StringVal::null())); - - doris_udf::StringVal res1 = JsonFunctions::json_quote(context, StringVal("null")); - EXPECT_EQ(std::string("\"null\""), std::string((char*)res1.ptr, res1.len)); - - doris_udf::StringVal res2 = JsonFunctions::json_quote(context, StringVal("[1, 2, 3]")); - EXPECT_EQ(std::string("\"[1, 2, 3]\""), std::string((char*)res2.ptr, res2.len)); - - doris_udf::StringVal res3 = JsonFunctions::json_quote(context, StringVal("\n\b\r\t")); - EXPECT_EQ(std::string("\"\\n\\b\\r\\t\""), std::string((char*)res3.ptr, res3.len)); - - doris_udf::StringVal res4 = JsonFunctions::json_quote(context, StringVal("\"")); - EXPECT_EQ(std::string("\"\\\"\""), std::string((char*)res4.ptr, res4.len)); - - doris_udf::StringVal json_str = {""}; - doris_udf::StringVal res5 = JsonFunctions::json_quote(context, json_str); - EXPECT_EQ(std::string("\"\""), std::string((char*)res5.ptr, res5.len)); - delete context; -} - -TEST_F(JsonFunctionTest, json_array) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - doris_udf::StringVal json_str1[2] = {"[1,2,3]", "5"}; - doris_udf::StringVal res1 = JsonFunctions::json_array(context, 2, json_str1); - EXPECT_EQ(std::string("[\"[1,2,3]\"]"), std::string((char*)res1.ptr, res1.len)); - - doris_udf::StringVal json_str2[4] = {"1", "abc", "null", "250"}; - doris_udf::StringVal res2 = JsonFunctions::json_array(context, 4, json_str2); - EXPECT_EQ(std::string("[1,\"abc\",null]"), std::string((char*)res2.ptr, res2.len)); - - doris_udf::StringVal json_str3[1] = {""}; - doris_udf::StringVal res3 = JsonFunctions::json_array(context, 1, json_str3); - EXPECT_EQ(std::string("[]"), std::string((char*)res3.ptr, res3.len)); - - doris_udf::StringVal json_str4[2] = {"null", "0"}; - doris_udf::StringVal res4 = JsonFunctions::json_array(context, 2, json_str4); - EXPECT_EQ(std::string("[null]"), std::string((char*)res4.ptr, res4.len)); - delete context; -} - -TEST_F(JsonFunctionTest, json_object) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - doris_udf::StringVal json_str1[3] = {"id", "87", "52"}; - doris_udf::StringVal res1 = JsonFunctions::json_object(context, 3, json_str1); - EXPECT_EQ(std::string("{\"id\":87}"), std::string((char*)res1.ptr, res1.len)); - - doris_udf::StringVal json_str2[5] = {"name", "Jack", "score", "[87,98,90]", "5555"}; - doris_udf::StringVal res2 = JsonFunctions::json_object(context, 5, json_str2); - EXPECT_EQ(std::string("{\"name\":\"Jack\",\"score\":\"[87,98,90]\"}"), - std::string((char*)res2.ptr, res2.len)); - - doris_udf::StringVal json_str3[3] = {"key", "null", "50"}; - doris_udf::StringVal res3 = JsonFunctions::json_object(context, 3, json_str3); - EXPECT_EQ(std::string("{\"key\":null}"), std::string((char*)res3.ptr, res3.len)); - - doris_udf::StringVal json_str4[1] = {""}; - doris_udf::StringVal res4 = JsonFunctions::json_object(context, 1, json_str4); - EXPECT_EQ(std::string("{}"), std::string((char*)res4.ptr, res4.len)); - delete context; -} - -TEST_F(JsonFunctionTest, int) { - std::string json_string("{\"id\":\"name\",\"age\":11,\"money\":123000.789}"); - std::string path_string("$.age"); - rapidjson::Document document; - rapidjson::Value* res = JsonFunctions::get_json_object(nullptr, json_string, path_string, - JSON_FUN_INT, &document); - EXPECT_EQ(res->GetInt(), 11); - - std::string json_string1( - "{\"list\":[{\"id\":[{\"aa\":1}]},{\"id\":[{\"aa\":\"cc\"}]}," - "{\"id\":[{\"kk\":\"cc\"}]}]}"); - std::string path_string1("$.list.id.aa[0]"); - rapidjson::Document document1; - rapidjson::Value* res1 = JsonFunctions::get_json_object(nullptr, json_string1, path_string1, - JSON_FUN_INT, &document1); - EXPECT_EQ(res1->GetInt(), 1); - - std::string json_string2("[1,2,3,5,8,0]"); - std::string path_string2("$.[3]"); - rapidjson::Document document2; - rapidjson::Value* res2 = JsonFunctions::get_json_object(nullptr, json_string2, path_string2, - JSON_FUN_INT, &document2); - EXPECT_EQ(res2->GetInt(), 5); - - std::string json_string3("{\"price a\": [0,1,2],\"couponFee\":0.0}"); - std::string path_string3_1("$.price a[3]"); - rapidjson::Document document3_1; - rapidjson::Value* res3_1 = JsonFunctions::get_json_object(nullptr, json_string3, path_string3_1, - JSON_FUN_INT, &document3_1); - EXPECT_TRUE(res3_1 == nullptr); - - std::string path_string3_2("$.couponFee"); - rapidjson::Document document3_2; - rapidjson::Value* res3_2 = JsonFunctions::get_json_object(nullptr, json_string3, path_string3_2, - JSON_FUN_INT, &document3_2); - EXPECT_FALSE(res3_2->IsInt()); -} - -TEST_F(JsonFunctionTest, double) { - std::string json_string("{\"id\":\"name\",\"age\":11,\"money\":123000.789}"); - std::string path_string("$.money"); - rapidjson::Document document; - rapidjson::Value* res = JsonFunctions::get_json_object(nullptr, json_string, path_string, - JSON_FUN_DOUBLE, &document); - EXPECT_EQ(res->GetDouble(), 123000.789); - - std::string path_string2("$.age"); - rapidjson::Document document2; - rapidjson::Value* res2 = JsonFunctions::get_json_object(nullptr, json_string, path_string2, - JSON_FUN_DOUBLE, &document2); - EXPECT_EQ(res2->GetInt(), 11); -} - -TEST_F(JsonFunctionTest, special_char) { - std::string json_string("{\"key with.dot\": [\"v1\", \"v2\"]}"); - std::string path_string("$.\"key with.dot\"[1]"); - rapidjson::Document document; - rapidjson::Value* res = JsonFunctions::get_json_object(nullptr, json_string, path_string, - JSON_FUN_DOUBLE, &document); - EXPECT_FALSE(res->GetString() == nullptr); - EXPECT_EQ(std::string(res->GetString()), "v2"); - - std::string json_string2("{\"key with|\": [\"v1\", \"v2\"]}"); - std::string path_string2("$.key with|[0]"); - rapidjson::Document document2; - rapidjson::Value* res2 = JsonFunctions::get_json_object(nullptr, json_string2, path_string2, - JSON_FUN_DOUBLE, &document2); - EXPECT_FALSE(res2->GetString() == nullptr); - EXPECT_EQ(std::string(res2->GetString()), "v1"); - - std::string json_string3("{\"key with.dot\": [{\"key2.dot\":\"v1\"}, {\"key3.dot\":\"v2\"}]}"); - std::string path_string3("$.\"key with.dot\"[0].\"key2.dot\""); - rapidjson::Document document3; - rapidjson::Value* res3 = JsonFunctions::get_json_object(nullptr, json_string3, path_string3, - JSON_FUN_DOUBLE, &document3); - EXPECT_FALSE(res3->GetString() == nullptr); - EXPECT_EQ(std::string(res3->GetString()), "v1"); -} - TEST_F(JsonFunctionTest, json_path1) { bool wrap_explicitly; std::string json_raw_data( diff --git a/be/test/exprs/math_functions_test.cpp b/be/test/exprs/math_functions_test.cpp deleted file mode 100644 index 040ce49d34..0000000000 --- a/be/test/exprs/math_functions_test.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "exprs/math_functions.h" - -#include - -#include -#include - -#include "runtime/large_int_value.h" -#include "runtime/mem_pool.h" -#include "testutil/function_utils.h" -#include "udf/udf_internal.h" - -namespace doris { - -class MathFunctionsTest : public testing::Test { -public: - MathFunctionsTest() = default; - - void SetUp() { - utils = new FunctionUtils(); - ctx = utils->get_fn_ctx(); - } - void TearDown() { delete utils; } - - FunctionUtils* utils; - FunctionContext* ctx; -}; - -TEST_F(MathFunctionsTest, abs) { - // FloatVal - FloatVal fv1(0.0f); - FloatVal fv2(0.1f); - FloatVal fv3(FLT_MAX); - FloatVal fv4(FLT_MIN); - EXPECT_EQ(fv1, MathFunctions::abs(ctx, FloatVal(0.0))); - EXPECT_EQ(fv1, MathFunctions::abs(ctx, FloatVal(-0.0))); - EXPECT_EQ(fv2, MathFunctions::abs(ctx, FloatVal(0.1))); - EXPECT_EQ(fv2, MathFunctions::abs(ctx, FloatVal(-0.1))); - EXPECT_EQ(fv3, MathFunctions::abs(ctx, FloatVal(FLT_MAX))); - EXPECT_EQ(fv3, MathFunctions::abs(ctx, FloatVal(-FLT_MAX))); - EXPECT_EQ(fv4, MathFunctions::abs(ctx, FloatVal(FLT_MIN))); - EXPECT_EQ(fv4, MathFunctions::abs(ctx, FloatVal(-FLT_MIN))); - - // DoubleVal - DoubleVal dv1(0.0); - DoubleVal dv2(0.1); - DoubleVal dv3(DBL_MAX); - DoubleVal dv4(DBL_MIN); - EXPECT_EQ(dv1, MathFunctions::abs(ctx, DoubleVal(0.0))); - EXPECT_EQ(dv1, MathFunctions::abs(ctx, DoubleVal(-0.0))); - EXPECT_EQ(dv2, MathFunctions::abs(ctx, DoubleVal(0.1))); - EXPECT_EQ(dv2, MathFunctions::abs(ctx, DoubleVal(-0.1))); - EXPECT_EQ(dv3, MathFunctions::abs(ctx, DoubleVal(DBL_MAX))); - EXPECT_EQ(dv3, MathFunctions::abs(ctx, DoubleVal(-DBL_MAX))); - EXPECT_EQ(dv4, MathFunctions::abs(ctx, DoubleVal(DBL_MIN))); - EXPECT_EQ(dv4, MathFunctions::abs(ctx, DoubleVal(-DBL_MIN))); - - // LargeIntVal - LargeIntVal liv1(0); - LargeIntVal liv2(1); - LargeIntVal liv3(MAX_INT128); - LargeIntVal liv4(__int128(INT64_MAX)); - LargeIntVal liv5(-__int128(INT64_MIN)); - - EXPECT_EQ(liv1, MathFunctions::abs(ctx, LargeIntVal(0))); - EXPECT_EQ(liv1, MathFunctions::abs(ctx, LargeIntVal(-0))); - EXPECT_EQ(liv2, MathFunctions::abs(ctx, LargeIntVal(1))); - EXPECT_EQ(liv2, MathFunctions::abs(ctx, LargeIntVal(-1))); - EXPECT_EQ(liv3, MathFunctions::abs(ctx, LargeIntVal(MAX_INT128))); - EXPECT_EQ(liv3, MathFunctions::abs(ctx, LargeIntVal(-MAX_INT128))); - EXPECT_EQ(liv3, MathFunctions::abs(ctx, LargeIntVal(MIN_INT128 + 1))); - // BigIntVal - EXPECT_EQ(liv1, MathFunctions::abs(ctx, BigIntVal(0))); - EXPECT_EQ(liv1, MathFunctions::abs(ctx, BigIntVal(-0))); - EXPECT_EQ(liv2, MathFunctions::abs(ctx, BigIntVal(1))); - EXPECT_EQ(liv2, MathFunctions::abs(ctx, BigIntVal(-1))); - EXPECT_EQ(liv4, MathFunctions::abs(ctx, BigIntVal(INT64_MAX))); - EXPECT_EQ(liv5, MathFunctions::abs(ctx, BigIntVal(INT64_MIN))); - - // IntVal - BigIntVal biv1(0); - BigIntVal biv2(1); - BigIntVal biv3(int64_t(INT32_MAX)); - BigIntVal biv4(-int64_t(INT32_MIN)); - - EXPECT_EQ(biv1, MathFunctions::abs(ctx, IntVal(0))); - EXPECT_EQ(biv1, MathFunctions::abs(ctx, IntVal(-0))); - EXPECT_EQ(biv2, MathFunctions::abs(ctx, IntVal(1))); - EXPECT_EQ(biv2, MathFunctions::abs(ctx, IntVal(-1))); - EXPECT_EQ(biv3, MathFunctions::abs(ctx, IntVal(INT32_MAX))); - EXPECT_EQ(biv4, MathFunctions::abs(ctx, IntVal(INT32_MIN))); - - // SmallIntVal - IntVal iv1(0); - IntVal iv2(1); - IntVal iv3(int32_t(INT16_MAX)); - IntVal iv4(-int32_t(INT16_MIN)); - EXPECT_EQ(iv1, MathFunctions::abs(ctx, SmallIntVal(0))); - EXPECT_EQ(iv1, MathFunctions::abs(ctx, SmallIntVal(-0))); - EXPECT_EQ(iv2, MathFunctions::abs(ctx, SmallIntVal(1))); - EXPECT_EQ(iv2, MathFunctions::abs(ctx, SmallIntVal(-1))); - EXPECT_EQ(iv3, MathFunctions::abs(ctx, SmallIntVal(INT16_MAX))); - EXPECT_EQ(iv4, MathFunctions::abs(ctx, SmallIntVal(INT16_MIN))); - - //TinyIntVal - SmallIntVal siv1(0); - SmallIntVal siv2(1); - SmallIntVal siv3(int16_t(INT8_MAX)); - SmallIntVal siv4(-int16_t(INT8_MIN)); - EXPECT_EQ(siv1, MathFunctions::abs(ctx, TinyIntVal(0))); - EXPECT_EQ(siv1, MathFunctions::abs(ctx, TinyIntVal(-0))); - EXPECT_EQ(siv2, MathFunctions::abs(ctx, TinyIntVal(1))); - EXPECT_EQ(siv2, MathFunctions::abs(ctx, TinyIntVal(-1))); - EXPECT_EQ(siv3, MathFunctions::abs(ctx, TinyIntVal(INT8_MAX))); - EXPECT_EQ(siv4, MathFunctions::abs(ctx, TinyIntVal(INT8_MIN))); -} - -TEST_F(MathFunctionsTest, rand) { - doris_udf::FunctionContext::TypeDesc type; - type.type = doris_udf::FunctionContext::TYPE_DOUBLE; - std::vector arg_types; - doris_udf::FunctionContext::TypeDesc type1; - type1.type = doris_udf::FunctionContext::TYPE_BIGINT; - arg_types.push_back(type1); - FunctionUtils* utils1 = new FunctionUtils(type, arg_types, 8); - FunctionContext* ctx1 = utils1->get_fn_ctx(); - std::vector constant_args; - BigIntVal bi(1); - constant_args.push_back(&bi); - ctx1->impl()->set_constant_args(constant_args); - - MathFunctions::rand_prepare(ctx1, FunctionContext::THREAD_LOCAL); - DoubleVal dv1 = MathFunctions::rand_seed(ctx1, BigIntVal(0)); - MathFunctions::rand_close(ctx1, FunctionContext::THREAD_LOCAL); - - MathFunctions::rand_prepare(ctx1, FunctionContext::THREAD_LOCAL); - DoubleVal dv2 = MathFunctions::rand_seed(ctx1, BigIntVal(0)); - MathFunctions::rand_close(ctx1, FunctionContext::THREAD_LOCAL); - - EXPECT_EQ(dv1.val, dv2.val); - delete utils1; - - MathFunctions::rand_prepare(ctx, FunctionContext::THREAD_LOCAL); - DoubleVal dv3 = MathFunctions::rand(ctx); - MathFunctions::rand_close(ctx, FunctionContext::THREAD_LOCAL); - - MathFunctions::rand_prepare(ctx, FunctionContext::THREAD_LOCAL); - DoubleVal dv4 = MathFunctions::rand(ctx); - MathFunctions::rand_close(ctx, FunctionContext::THREAD_LOCAL); - - EXPECT_NE(dv3.val, dv4.val); -} - -TEST_F(MathFunctionsTest, hex_int) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - EXPECT_EQ(StringVal::null(), MathFunctions::hex_string(context, StringVal::null())); - - EXPECT_EQ( - StringVal("7FFFFFFFFFFFFFFF"), - MathFunctions::hex_int(context, BigIntVal(9223372036854775807))); //BigIntVal max_value - - EXPECT_EQ(StringVal("FFE5853AB393E6C0"), - MathFunctions::hex_int(context, BigIntVal(-7453337203775808))); - - EXPECT_EQ(StringVal("0"), MathFunctions::hex_int(context, BigIntVal(0))); - - EXPECT_EQ(StringVal("C"), MathFunctions::hex_int(context, BigIntVal(12))); - - EXPECT_EQ(StringVal("90"), MathFunctions::hex_int(context, BigIntVal(144))); - - EXPECT_EQ(StringVal("FFFFFFFFFFFFFFFF"), MathFunctions::hex_int(context, BigIntVal(-1))); - - EXPECT_EQ(StringVal("FFFFFFFFFFFFFFFE"), MathFunctions::hex_int(context, BigIntVal(-2))); - - EXPECT_EQ(StringVal("24EC1"), MathFunctions::hex_int(context, BigIntVal(151233))); - - delete context; -} - -TEST_F(MathFunctionsTest, hex_string) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - EXPECT_EQ(StringVal::null(), MathFunctions::hex_string(context, StringVal::null())); - - EXPECT_EQ(StringVal("30"), MathFunctions::hex_string(context, StringVal("0"))); - - EXPECT_EQ(StringVal("31"), MathFunctions::hex_string(context, StringVal("1"))); - - EXPECT_EQ(StringVal("313233"), MathFunctions::hex_string(context, StringVal("123"))); - - EXPECT_EQ(StringVal("41"), MathFunctions::hex_string(context, StringVal("A"))); - - EXPECT_EQ(StringVal("61"), MathFunctions::hex_string(context, StringVal("a"))); - - EXPECT_EQ(StringVal("E68891"), MathFunctions::hex_string(context, StringVal("我"))); - - EXPECT_EQ(StringVal("3F"), MathFunctions::hex_string(context, StringVal("?"))); - - delete context; -} - -TEST_F(MathFunctionsTest, unhex) { - MemPool mem_pool; - doris_udf::FunctionContext* context = - doris_udf::FunctionContext::create_test_context(&mem_pool); - - EXPECT_EQ(StringVal::null(), MathFunctions::unhex(context, StringVal::null())); - - EXPECT_EQ(StringVal("123"), MathFunctions::unhex(context, StringVal("313233"))); - - EXPECT_EQ(StringVal(""), MathFunctions::unhex(context, StringVal("@!#"))); - - EXPECT_EQ(StringVal(""), MathFunctions::unhex(context, StringVal("@@"))); - - EXPECT_EQ(StringVal("a"), MathFunctions::unhex(context, StringVal("61"))); - - EXPECT_EQ(StringVal("123"), MathFunctions::unhex(context, StringVal("313233"))); - - EXPECT_EQ(StringVal(""), MathFunctions::unhex(context, StringVal("我"))); - - EXPECT_EQ(StringVal("?"), MathFunctions::unhex(context, StringVal("EFBC9F"))); - - delete context; -} - -TEST_F(MathFunctionsTest, round_bankers) { - BigIntVal r0(0); - BigIntVal r1(-4); - BigIntVal r2(4); - DoubleVal r3(3.6); - DoubleVal r4(10.4); - DoubleVal r5(10.76); - - EXPECT_EQ(r0, MathFunctions::round_bankers(ctx, DoubleVal(0.4))); - EXPECT_EQ(r1, MathFunctions::round_bankers(ctx, DoubleVal(-3.5))); - EXPECT_EQ(r2, MathFunctions::round_bankers(ctx, DoubleVal(4.5))); - EXPECT_EQ(r3, MathFunctions::round_bankers(ctx, DoubleVal(3.55), IntVal(1))); - EXPECT_EQ(r3, MathFunctions::round_bankers(ctx, DoubleVal(3.65), IntVal(1))); - EXPECT_EQ(r4, MathFunctions::round_bankers(ctx, DoubleVal(10.35), IntVal(1))); - EXPECT_EQ(r5, MathFunctions::round_bankers(ctx, DoubleVal(10.755), IntVal(2))); -} - -TEST_F(MathFunctionsTest, round_up_to) { - DoubleVal r0(0); - DoubleVal r1(1); - DoubleVal r2(3); - DoubleVal r3(4); - DoubleVal r4(3.5); - DoubleVal r5(3.55); - - DoubleVal r6(222500); - - EXPECT_EQ(r0, MathFunctions::round_up_to(ctx, DoubleVal(0), IntVal(0))); - EXPECT_EQ(r1, MathFunctions::round_up_to(ctx, DoubleVal(0.5), IntVal(0))); - EXPECT_EQ(r1, MathFunctions::round_up_to(ctx, DoubleVal(0.51), IntVal(0))); - // not 2 - EXPECT_EQ(r2, MathFunctions::round_up_to(ctx, DoubleVal(2.5), IntVal(0))); - EXPECT_EQ(r3, MathFunctions::round_up_to(ctx, DoubleVal(3.5), IntVal(0))); - - EXPECT_EQ(r4, MathFunctions::round_up_to(ctx, DoubleVal(3.5451), IntVal(1))); - EXPECT_EQ(r5, MathFunctions::round_up_to(ctx, DoubleVal(3.5451), IntVal(2))); - - // not 3.54 - EXPECT_EQ(r5, MathFunctions::round_up_to(ctx, DoubleVal(3.5450), IntVal(2))); - - // not 222400 - EXPECT_EQ(r6, MathFunctions::round_up_to(ctx, DoubleVal(222450.00), IntVal(-2))); -} - -} // namespace doris diff --git a/be/test/exprs/string_functions_test.cpp b/be/test/exprs/string_functions_test.cpp deleted file mode 100644 index 89c58ecfb3..0000000000 --- a/be/test/exprs/string_functions_test.cpp +++ /dev/null @@ -1,819 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "exprs/string_functions.h" - -#include -#include - -#include -#include - -#include "exprs/anyval_util.h" -#include "runtime/large_int_value.h" -#include "testutil/function_utils.h" -#include "testutil/test_util.h" -#include "util/simd/vstring_function.h" - -namespace doris { - -class StringFunctionsTest : public testing::Test { -public: - StringFunctionsTest() = default; - - void SetUp() { - utils = new FunctionUtils(); - ctx = utils->get_fn_ctx(); - } - void TearDown() { delete utils; } - -private: - FunctionUtils* utils; - FunctionContext* ctx; -}; - -TEST_F(StringFunctionsTest, do_money_format_for_bigint_bench) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - StringVal expected = AnyValUtil::from_string(ctx, std::string("9,223,372,036,854,775,807.00")); - BigIntVal bigIntVal(9223372036854775807); - for (int i = 0; i < LOOP_LESS_OR_MORE(10, 10000000); i++) { - StringVal result = StringFunctions::money_format(context, bigIntVal); - EXPECT_EQ(expected, result); - } - delete context; -} - -TEST_F(StringFunctionsTest, do_money_format_for_decimalv2_bench) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - StringVal expected = AnyValUtil::from_string(ctx, std::string("9,223,372,085.87")); - DecimalV2Value dv1(std::string("9223372085.8678")); - DecimalV2Val decimalV2Val; - dv1.to_decimal_val(&decimalV2Val); - for (int i = 0; i < LOOP_LESS_OR_MORE(10, 10000000); i++) { - StringVal result = StringFunctions::money_format(context, decimalV2Val); - EXPECT_EQ(expected, result); - } - delete context; -} - -TEST_F(StringFunctionsTest, money_format_bigint) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - StringVal result = StringFunctions::money_format(context, doris_udf::BigIntVal(123456)); - StringVal expected = AnyValUtil::from_string(ctx, std::string("123,456.00")); - EXPECT_EQ(expected, result); - - result = StringFunctions::money_format(context, doris_udf::BigIntVal(-123456)); - expected = AnyValUtil::from_string(ctx, std::string("-123,456.00")); - EXPECT_EQ(expected, result); - - result = StringFunctions::money_format(context, doris_udf::BigIntVal(9223372036854775807)); - expected = AnyValUtil::from_string(ctx, std::string("9,223,372,036,854,775,807.00")); - EXPECT_EQ(expected, result); - delete context; -} - -TEST_F(StringFunctionsTest, money_format_large_int) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - __int128 value = MAX_INT128; - StringVal result = StringFunctions::money_format(context, doris_udf::LargeIntVal(value)); - StringVal expected = AnyValUtil::from_string_temp( - context, std::string("170,141,183,460,469,231,731,687,303,715,884,105,727.00")); - EXPECT_EQ(expected, result); - - value = MIN_INT128; - result = StringFunctions::money_format(context, doris_udf::LargeIntVal(value)); - expected = AnyValUtil::from_string_temp( - context, std::string("-170,141,183,460,469,231,731,687,303,715,884,105,728.00")); - EXPECT_EQ(expected, result); - delete context; -} - -TEST_F(StringFunctionsTest, money_format_double) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - StringVal result = StringFunctions::money_format(context, doris_udf::DoubleVal(1234.456)); - StringVal expected = AnyValUtil::from_string(ctx, std::string("1,234.46")); - EXPECT_EQ(expected, result); - - result = StringFunctions::money_format(context, doris_udf::DoubleVal(1234.45)); - expected = AnyValUtil::from_string(ctx, std::string("1,234.45")); - EXPECT_EQ(expected, result); - - result = StringFunctions::money_format(context, doris_udf::DoubleVal(1234.4)); - expected = AnyValUtil::from_string(ctx, std::string("1,234.40")); - EXPECT_EQ(expected, result); - - result = StringFunctions::money_format(context, doris_udf::DoubleVal(1234.454)); - expected = AnyValUtil::from_string(ctx, std::string("1,234.45")); - EXPECT_EQ(expected, result); - - result = StringFunctions::money_format(context, doris_udf::DoubleVal(-36854775807.039)); - expected = AnyValUtil::from_string(ctx, std::string("-36,854,775,807.04")); - EXPECT_EQ(expected, result); - - delete context; -} - -TEST_F(StringFunctionsTest, money_format_decimal_v2) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - DecimalV2Value dv1(std::string("3333333333.2222222222")); - DecimalV2Val value1; - dv1.to_decimal_val(&value1); - - StringVal result = StringFunctions::money_format(context, value1); - StringVal expected = AnyValUtil::from_string(ctx, std::string("3,333,333,333.22")); - EXPECT_EQ(expected, result); - - DecimalV2Value dv2(std::string("-740740740.71604938271975308642")); - DecimalV2Val value2; - dv2.to_decimal_val(&value2); - - result = StringFunctions::money_format(context, value2); - expected = AnyValUtil::from_string(ctx, std::string("-740,740,740.72")); - EXPECT_EQ(expected, result); - delete context; -} - -TEST_F(StringFunctionsTest, split_part) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("hello")), - StringFunctions::split_part(context, StringVal("hello word"), StringVal(" "), 1)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("word")), - StringFunctions::split_part(context, StringVal("hello word"), StringVal(" "), 2)); - - EXPECT_EQ(StringVal::null(), - StringFunctions::split_part(context, StringVal("hello word"), StringVal(" "), 3)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("")), - StringFunctions::split_part(context, StringVal("hello word"), StringVal("hello"), 1)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string(" word")), - StringFunctions::split_part(context, StringVal("hello word"), StringVal("hello"), 2)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("2019年9")), - StringFunctions::split_part(context, StringVal("2019年9月8日"), StringVal("月"), 1)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("")), - StringFunctions::split_part(context, StringVal("abcdabda"), StringVal("a"), 1)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("bcd")), - StringFunctions::split_part(context, StringVal("abcdabda"), StringVal("a"), 2)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("bd")), - StringFunctions::split_part(context, StringVal("abcdabda"), StringVal("a"), 3)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("")), - StringFunctions::split_part(context, StringVal("abcdabda"), StringVal("a"), 4)); - - EXPECT_EQ( - AnyValUtil::from_string(ctx, std::string("#123")), - StringFunctions::split_part(context, StringVal("abc###123###234"), StringVal("##"), 2)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("234")), - StringFunctions::split_part(context, StringVal("abc###123###234"), StringVal("##"), - -1)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("123#")), - StringFunctions::split_part(context, StringVal("abc###123###234"), StringVal("##"), - -2)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("abc#")), - StringFunctions::split_part(context, StringVal("abc###123###234"), StringVal("##"), - -3)); - - EXPECT_EQ(StringVal::null(), StringFunctions::split_part(context, StringVal("abc###123###234"), - StringVal("##"), -4)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("234")), - StringFunctions::split_part(context, StringVal("abc#123##234"), StringVal("#"), -1)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("")), - StringFunctions::split_part(context, StringVal("abc#123##234"), StringVal("#"), -2)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("123")), - StringFunctions::split_part(context, StringVal("abc#123##234"), StringVal("#"), -3)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("abc")), - StringFunctions::split_part(context, StringVal("abc#123##234"), StringVal("#"), -4)); - - EXPECT_EQ(StringVal::null(), - StringFunctions::split_part(context, StringVal("abc#123##234"), StringVal("#"), -5)); - - EXPECT_EQ(StringVal::null(), StringFunctions::split_part(context, StringVal("abc#123##234"), - StringVal("#"), IntVal::null())); - - EXPECT_EQ(StringVal::null(), StringFunctions::split_part(context, StringVal("abc#123##234"), - StringVal::null(), -1)); - - EXPECT_EQ( - AnyValUtil::from_string(ctx, std::string("")), - StringFunctions::split_part(context, StringVal("2019年9月-12月"), StringVal("月"), -1)); - EXPECT_EQ( - AnyValUtil::from_string(ctx, std::string("-12")), - StringFunctions::split_part(context, StringVal("2019年9月-12月"), StringVal("月"), -2)); - - EXPECT_EQ( - AnyValUtil::from_string(ctx, std::string("2019年9")), - StringFunctions::split_part(context, StringVal("2019年9月-12月"), StringVal("月"), -3)); - - EXPECT_EQ(StringVal::null(), StringFunctions::split_part(context, StringVal("2019年9月-12月"), - StringVal("月"), -4)); - delete context; -} - -TEST_F(StringFunctionsTest, ends_with) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - doris_udf::BooleanVal falseRet = doris_udf::BooleanVal(false); - doris_udf::BooleanVal trueRet = doris_udf::BooleanVal(true); - doris_udf::BooleanVal nullRet = doris_udf::BooleanVal::null(); - - EXPECT_EQ(trueRet, StringFunctions::ends_with(context, StringVal(""), StringVal(""))); - - EXPECT_EQ(trueRet, StringFunctions::ends_with(context, StringVal("hello"), StringVal(""))); - - EXPECT_EQ(falseRet, StringFunctions::ends_with(context, StringVal(""), StringVal("hello"))); - - EXPECT_EQ(trueRet, StringFunctions::ends_with(context, StringVal("hello"), StringVal("hello"))); - - EXPECT_EQ(trueRet, StringFunctions::ends_with(context, StringVal(" "), StringVal(" "))); - - EXPECT_EQ(falseRet, StringFunctions::ends_with(context, StringVal("hello"), StringVal(" "))); - - EXPECT_EQ(falseRet, StringFunctions::ends_with(context, StringVal(" "), StringVal("hello"))); - - EXPECT_EQ(falseRet, - StringFunctions::ends_with(context, StringVal("hello doris"), StringVal("hello"))); - - EXPECT_EQ(trueRet, - StringFunctions::ends_with(context, StringVal("hello doris"), StringVal("doris"))); - - EXPECT_EQ(trueRet, StringFunctions::ends_with(context, StringVal("hello doris"), - StringVal("hello doris"))); - - EXPECT_EQ(nullRet, StringFunctions::ends_with(context, StringVal("hello"), StringVal::null())); - - EXPECT_EQ(nullRet, StringFunctions::ends_with(context, StringVal::null(), StringVal("hello"))); - - EXPECT_EQ(nullRet, StringFunctions::ends_with(context, StringVal::null(), StringVal::null())); - delete context; -} - -TEST_F(StringFunctionsTest, starts_with) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - doris_udf::BooleanVal falseRet = doris_udf::BooleanVal(false); - doris_udf::BooleanVal trueRet = doris_udf::BooleanVal(true); - doris_udf::BooleanVal nullRet = doris_udf::BooleanVal::null(); - - EXPECT_EQ(trueRet, StringFunctions::starts_with(context, StringVal(""), StringVal(""))); - - EXPECT_EQ(trueRet, StringFunctions::starts_with(context, StringVal(" "), StringVal(" "))); - - EXPECT_EQ(trueRet, StringFunctions::starts_with(context, StringVal("hello"), StringVal(""))); - - EXPECT_EQ(falseRet, StringFunctions::starts_with(context, StringVal(""), StringVal("hello"))); - - EXPECT_EQ(trueRet, - StringFunctions::starts_with(context, StringVal("hello"), StringVal("hello"))); - - EXPECT_EQ(falseRet, StringFunctions::starts_with(context, StringVal("hello"), StringVal(" "))); - - EXPECT_EQ(falseRet, StringFunctions::starts_with(context, StringVal(" "), StringVal("world"))); - - EXPECT_EQ(trueRet, - StringFunctions::starts_with(context, StringVal("hello world"), StringVal("hello"))); - - EXPECT_EQ(falseRet, - StringFunctions::starts_with(context, StringVal("hello world"), StringVal("world"))); - - EXPECT_EQ(trueRet, StringFunctions::starts_with(context, StringVal("hello world"), - StringVal("hello world"))); - - EXPECT_EQ(nullRet, - StringFunctions::starts_with(context, StringVal("hello world"), StringVal::null())); - - EXPECT_EQ(nullRet, - StringFunctions::starts_with(context, StringVal::null(), StringVal("hello world"))); - - EXPECT_EQ(nullRet, StringFunctions::starts_with(context, StringVal::null(), StringVal::null())); - delete context; -} - -TEST_F(StringFunctionsTest, null_or_empty) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - doris_udf::BooleanVal falseRet = doris_udf::BooleanVal(false); - doris_udf::BooleanVal trueRet = doris_udf::BooleanVal(true); - - EXPECT_EQ(trueRet, StringFunctions::null_or_empty(context, StringVal(""))); - - EXPECT_EQ(falseRet, StringFunctions::null_or_empty(context, StringVal(" "))); - - EXPECT_EQ(falseRet, StringFunctions::null_or_empty(context, StringVal("hello"))); - - EXPECT_EQ(falseRet, StringFunctions::null_or_empty(context, StringVal("doris"))); - - EXPECT_EQ(falseRet, StringFunctions::null_or_empty(context, StringVal("111"))); - - EXPECT_EQ(falseRet, StringFunctions::null_or_empty(context, StringVal("."))); - - EXPECT_EQ(trueRet, StringFunctions::null_or_empty(context, StringVal::null())); - delete context; -} - -TEST_F(StringFunctionsTest, left) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("")), - StringFunctions::left(context, StringVal(""), 10)); - delete context; -} - -TEST_F(StringFunctionsTest, substring) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("")), - StringFunctions::substring(context, StringVal("hello word"), 0, 5)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("hello")), - StringFunctions::substring(context, StringVal("hello word"), 1, 5)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("word")), - StringFunctions::substring(context, StringVal("hello word"), 7, 4)); - - EXPECT_EQ(StringVal::null(), StringFunctions::substring(context, StringVal::null(), 1, 0)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("")), - StringFunctions::substring(context, StringVal("hello word"), 1, 0)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string(" word")), - StringFunctions::substring(context, StringVal("hello word"), -5, 5)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("hello word 你")), - StringFunctions::substring(context, StringVal("hello word 你好"), 1, 12)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("好")), - StringFunctions::substring(context, StringVal("hello word 你好"), 13, 1)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("")), - StringFunctions::substring(context, StringVal("hello word 你好"), 1, 0)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("rd 你好")), - StringFunctions::substring(context, StringVal("hello word 你好"), -5, 5)); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("h")), - StringFunctions::substring(context, StringVal("hello word 你好"), 1, 1)); - delete context; -} - -TEST_F(StringFunctionsTest, reverse) { - FunctionUtils fu; - doris_udf::FunctionContext* context = fu.get_fn_ctx(); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("olleh")), - StringFunctions::reverse(context, StringVal("hello"))); - EXPECT_EQ(StringVal::null(), StringFunctions::reverse(context, StringVal::null())); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("")), - StringFunctions::reverse(context, StringVal(""))); - - EXPECT_EQ(AnyValUtil::from_string(ctx, std::string("好你olleh")), - StringFunctions::reverse(context, StringVal("hello你好"))); -} - -TEST_F(StringFunctionsTest, length) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - EXPECT_EQ(IntVal(5), StringFunctions::length(context, StringVal("hello"))); - EXPECT_EQ(IntVal(5), StringFunctions::char_utf8_length(context, StringVal("hello"))); - EXPECT_EQ(IntVal::null(), StringFunctions::length(context, StringVal::null())); - EXPECT_EQ(IntVal::null(), StringFunctions::char_utf8_length(context, StringVal::null())); - - EXPECT_EQ(IntVal(0), StringFunctions::length(context, StringVal(""))); - EXPECT_EQ(IntVal(0), StringFunctions::char_utf8_length(context, StringVal(""))); - - EXPECT_EQ(IntVal(11), StringFunctions::length(context, StringVal("hello你好"))); - - EXPECT_EQ(IntVal(7), StringFunctions::char_utf8_length(context, StringVal("hello你好"))); - delete context; -} - -TEST_F(StringFunctionsTest, append_trailing_char_if_absent) { - EXPECT_EQ(StringVal("ac"), - StringFunctions::append_trailing_char_if_absent(ctx, StringVal("a"), StringVal("c"))); - - EXPECT_EQ(StringVal("c"), - StringFunctions::append_trailing_char_if_absent(ctx, StringVal("c"), StringVal("c"))); - - EXPECT_EQ(StringVal("123c"), StringFunctions::append_trailing_char_if_absent( - ctx, StringVal("123c"), StringVal("c"))); - - EXPECT_EQ(StringVal("c"), - StringFunctions::append_trailing_char_if_absent(ctx, StringVal(""), StringVal("c"))); - - EXPECT_EQ(StringVal::null(), StringFunctions::append_trailing_char_if_absent( - ctx, StringVal::null(), StringVal("c"))); - - EXPECT_EQ(StringVal::null(), StringFunctions::append_trailing_char_if_absent( - ctx, StringVal("a"), StringVal::null())); - - EXPECT_EQ(StringVal::null(), StringFunctions::append_trailing_char_if_absent( - ctx, StringVal("a"), StringVal("abc"))); -} - -TEST_F(StringFunctionsTest, instr) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - EXPECT_EQ(IntVal(4), StringFunctions::instr(context, StringVal("foobarbar"), StringVal("bar"))); - EXPECT_EQ(IntVal(0), StringFunctions::instr(context, StringVal("foobar"), StringVal("xbar"))); - EXPECT_EQ(IntVal(2), StringFunctions::instr(context, StringVal("123456234"), StringVal("234"))); - EXPECT_EQ(IntVal(0), StringFunctions::instr(context, StringVal("123456"), StringVal("567"))); - EXPECT_EQ(IntVal(2), StringFunctions::instr(context, StringVal("1.234"), StringVal(".234"))); - EXPECT_EQ(IntVal(1), StringFunctions::instr(context, StringVal("1.234"), StringVal(""))); - EXPECT_EQ(IntVal(0), StringFunctions::instr(context, StringVal(""), StringVal("123"))); - EXPECT_EQ(IntVal(1), StringFunctions::instr(context, StringVal(""), StringVal(""))); - EXPECT_EQ(IntVal(3), StringFunctions::instr(context, StringVal("你好世界"), StringVal("世界"))); - EXPECT_EQ(IntVal(0), StringFunctions::instr(context, StringVal("你好世界"), StringVal("您好"))); - EXPECT_EQ(IntVal(3), StringFunctions::instr(context, StringVal("你好abc"), StringVal("a"))); - EXPECT_EQ(IntVal(3), StringFunctions::instr(context, StringVal("你好abc"), StringVal("abc"))); - EXPECT_EQ(IntVal::null(), StringFunctions::instr(context, StringVal::null(), StringVal("2"))); - EXPECT_EQ(IntVal::null(), StringFunctions::instr(context, StringVal(""), StringVal::null())); - EXPECT_EQ(IntVal::null(), - StringFunctions::instr(context, StringVal::null(), StringVal::null())); - delete context; -} - -TEST_F(StringFunctionsTest, locate) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - EXPECT_EQ(IntVal(4), - StringFunctions::locate(context, StringVal("bar"), StringVal("foobarbar"))); - EXPECT_EQ(IntVal(0), StringFunctions::locate(context, StringVal("xbar"), StringVal("foobar"))); - EXPECT_EQ(IntVal(2), - StringFunctions::locate(context, StringVal("234"), StringVal("123456234"))); - EXPECT_EQ(IntVal(0), StringFunctions::locate(context, StringVal("567"), StringVal("123456"))); - EXPECT_EQ(IntVal(2), StringFunctions::locate(context, StringVal(".234"), StringVal("1.234"))); - EXPECT_EQ(IntVal(1), StringFunctions::locate(context, StringVal(""), StringVal("1.234"))); - EXPECT_EQ(IntVal(0), StringFunctions::locate(context, StringVal("123"), StringVal(""))); - EXPECT_EQ(IntVal(1), StringFunctions::locate(context, StringVal(""), StringVal(""))); - EXPECT_EQ(IntVal(3), - StringFunctions::locate(context, StringVal("世界"), StringVal("你好世界"))); - EXPECT_EQ(IntVal(0), - StringFunctions::locate(context, StringVal("您好"), StringVal("你好世界"))); - EXPECT_EQ(IntVal(3), StringFunctions::locate(context, StringVal("a"), StringVal("你好abc"))); - EXPECT_EQ(IntVal(3), StringFunctions::locate(context, StringVal("abc"), StringVal("你好abc"))); - EXPECT_EQ(IntVal::null(), StringFunctions::locate(context, StringVal::null(), StringVal("2"))); - EXPECT_EQ(IntVal::null(), StringFunctions::locate(context, StringVal(""), StringVal::null())); - EXPECT_EQ(IntVal::null(), - StringFunctions::locate(context, StringVal::null(), StringVal::null())); - delete context; -} - -TEST_F(StringFunctionsTest, locate_pos) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - EXPECT_EQ(IntVal(7), StringFunctions::locate_pos(context, StringVal("bar"), - StringVal("foobarbar"), IntVal(5))); - EXPECT_EQ(IntVal(0), StringFunctions::locate_pos(context, StringVal("xbar"), - StringVal("foobar"), IntVal(1))); - EXPECT_EQ(IntVal(2), - StringFunctions::locate_pos(context, StringVal(""), StringVal("foobar"), IntVal(2))); - EXPECT_EQ(IntVal(0), - StringFunctions::locate_pos(context, StringVal("foobar"), StringVal(""), IntVal(1))); - EXPECT_EQ(IntVal(0), - StringFunctions::locate_pos(context, StringVal(""), StringVal(""), IntVal(2))); - EXPECT_EQ(IntVal(0), - StringFunctions::locate_pos(context, StringVal("A"), StringVal("AAAAAA"), IntVal(0))); - EXPECT_EQ(IntVal(0), StringFunctions::locate_pos(context, StringVal("A"), StringVal("大A写的A"), - IntVal(0))); - EXPECT_EQ(IntVal(2), StringFunctions::locate_pos(context, StringVal("A"), StringVal("大A写的A"), - IntVal(1))); - EXPECT_EQ(IntVal(2), StringFunctions::locate_pos(context, StringVal("A"), StringVal("大A写的A"), - IntVal(2))); - EXPECT_EQ(IntVal(5), StringFunctions::locate_pos(context, StringVal("A"), StringVal("大A写的A"), - IntVal(3))); - EXPECT_EQ(IntVal(7), StringFunctions::locate_pos(context, StringVal("BaR"), - StringVal("foobarBaR"), IntVal(5))); - EXPECT_EQ(IntVal::null(), - StringFunctions::locate_pos(context, StringVal::null(), StringVal("2"), IntVal(1))); - EXPECT_EQ(IntVal::null(), - StringFunctions::locate_pos(context, StringVal(""), StringVal::null(), IntVal(4))); - EXPECT_EQ(IntVal::null(), StringFunctions::locate_pos(context, StringVal::null(), - StringVal::null(), IntVal(4))); - EXPECT_EQ(IntVal::null(), StringFunctions::locate_pos(context, StringVal::null(), - StringVal::null(), IntVal(-1))); - delete context; -} - -TEST_F(StringFunctionsTest, lpad) { - EXPECT_EQ(StringVal("???hi"), - StringFunctions::lpad(ctx, StringVal("hi"), IntVal(5), StringVal("?"))); - EXPECT_EQ(StringVal("g8%7IgY%AHx7luNtf8Kh"), - StringFunctions::lpad(ctx, StringVal("g8%7IgY%AHx7luNtf8Kh"), IntVal(20), - StringVal(""))); - EXPECT_EQ(StringVal("h"), - StringFunctions::lpad(ctx, StringVal("hi"), IntVal(1), StringVal("?"))); - EXPECT_EQ(StringVal("你"), - StringFunctions::lpad(ctx, StringVal("你好"), IntVal(1), StringVal("?"))); - EXPECT_EQ(StringVal("你"), - StringFunctions::lpad(ctx, StringVal("你"), IntVal(1), StringVal("?"))); - EXPECT_EQ(StringVal(""), - StringFunctions::lpad(ctx, StringVal("hi"), IntVal(0), StringVal("?"))); - EXPECT_EQ(StringVal::null(), - StringFunctions::lpad(ctx, StringVal("hi"), IntVal(-1), StringVal("?"))); - EXPECT_EQ(StringVal("h"), - StringFunctions::lpad(ctx, StringVal("hi"), IntVal(1), StringVal(""))); - EXPECT_EQ(StringVal::null(), - StringFunctions::lpad(ctx, StringVal("hi"), IntVal(5), StringVal(""))); - EXPECT_EQ(StringVal("abahi"), - StringFunctions::lpad(ctx, StringVal("hi"), IntVal(5), StringVal("ab"))); - EXPECT_EQ(StringVal("ababhi"), - StringFunctions::lpad(ctx, StringVal("hi"), IntVal(6), StringVal("ab"))); - EXPECT_EQ(StringVal("呵呵呵hi"), - StringFunctions::lpad(ctx, StringVal("hi"), IntVal(5), StringVal("呵呵"))); - EXPECT_EQ(StringVal("hih呵呵"), - StringFunctions::lpad(ctx, StringVal("呵呵"), IntVal(5), StringVal("hi"))); -} - -TEST_F(StringFunctionsTest, rpad) { - EXPECT_EQ(StringVal("hi???"), - StringFunctions::rpad(ctx, StringVal("hi"), IntVal(5), StringVal("?"))); - EXPECT_EQ(StringVal("g8%7IgY%AHx7luNtf8Kh"), - StringFunctions::rpad(ctx, StringVal("g8%7IgY%AHx7luNtf8Kh"), IntVal(20), - StringVal(""))); - EXPECT_EQ(StringVal("h"), - StringFunctions::rpad(ctx, StringVal("hi"), IntVal(1), StringVal("?"))); - EXPECT_EQ(StringVal("你"), - StringFunctions::rpad(ctx, StringVal("你好"), IntVal(1), StringVal("?"))); - EXPECT_EQ(StringVal("你"), - StringFunctions::rpad(ctx, StringVal("你"), IntVal(1), StringVal("?"))); - EXPECT_EQ(StringVal(""), - StringFunctions::rpad(ctx, StringVal("hi"), IntVal(0), StringVal("?"))); - EXPECT_EQ(StringVal::null(), - StringFunctions::rpad(ctx, StringVal("hi"), IntVal(-1), StringVal("?"))); - EXPECT_EQ(StringVal("h"), - StringFunctions::rpad(ctx, StringVal("hi"), IntVal(1), StringVal(""))); - EXPECT_EQ(StringVal::null(), - StringFunctions::rpad(ctx, StringVal("hi"), IntVal(5), StringVal(""))); - EXPECT_EQ(StringVal("hiaba"), - StringFunctions::rpad(ctx, StringVal("hi"), IntVal(5), StringVal("ab"))); - EXPECT_EQ(StringVal("hiabab"), - StringFunctions::rpad(ctx, StringVal("hi"), IntVal(6), StringVal("ab"))); - EXPECT_EQ(StringVal("hi呵呵呵"), - StringFunctions::rpad(ctx, StringVal("hi"), IntVal(5), StringVal("呵呵"))); - EXPECT_EQ(StringVal("呵呵hih"), - StringFunctions::rpad(ctx, StringVal("呵呵"), IntVal(5), StringVal("hi"))); -} - -TEST_F(StringFunctionsTest, replace) { - //exist substring - EXPECT_EQ(StringVal("http://www.baidu.com:8080"), - StringFunctions::replace(ctx, StringVal("http://www.baidu.com:9090"), - StringVal("9090"), StringVal("8080"))); - - //not exist substring - EXPECT_EQ(StringVal("http://www.baidu.com:9090"), - StringFunctions::replace(ctx, StringVal("http://www.baidu.com:9090"), - StringVal("9070"), StringVal("8080"))); - - //old substring is empty - EXPECT_EQ(StringVal("http://www.baidu.com:9090"), - StringFunctions::replace(ctx, StringVal("http://www.baidu.com:9090"), StringVal(""), - StringVal("8080"))); - - //new substring is empty - EXPECT_EQ(StringVal("http://www.baidu.com:"), - StringFunctions::replace(ctx, StringVal("http://www.baidu.com:9090"), - StringVal("9090"), StringVal(""))); - - //origin string is null - EXPECT_EQ(StringVal::null(), StringFunctions::replace(ctx, StringVal::null(), - StringVal("hello"), StringVal("8080"))); - - //old substring is null - EXPECT_EQ(StringVal::null(), - StringFunctions::replace(ctx, StringVal("http://www.baidu.com:9090"), - StringVal::null(), StringVal("8080"))); - - //new substring is null - EXPECT_EQ(StringVal::null(), - StringFunctions::replace(ctx, StringVal("http://www.baidu.com:9090"), - StringVal("hello"), StringVal::null())); - - //substring contains Chinese character - EXPECT_EQ(StringVal("http://华夏zhongguo:9090"), - StringFunctions::replace(ctx, StringVal("http://中国hello:9090"), - StringVal("中国hello"), StringVal("华夏zhongguo"))); - - //old substring is at the beginning of string - EXPECT_EQ(StringVal("ftp://www.baidu.com:9090"), - StringFunctions::replace(ctx, StringVal("http://www.baidu.com:9090"), - StringVal("http"), StringVal("ftp"))); -} - -TEST_F(StringFunctionsTest, parse_url) { - EXPECT_EQ(StringVal("facebook.com"), - StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), - StringVal("AUTHORITY"))); - EXPECT_EQ(StringVal("facebook.com"), - StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), - StringVal("authority"))); - - EXPECT_EQ(StringVal("/a/b/c.php"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), - StringVal("FILE"))); - EXPECT_EQ(StringVal("/a/b/c.php"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), - StringVal("file"))); - - EXPECT_EQ(StringVal("/a/b/c.php"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), - StringVal("PATH"))); - EXPECT_EQ(StringVal("/a/b/c.php"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), - StringVal("path"))); - - EXPECT_EQ(StringVal("www.baidu.com"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090"), - StringVal("HOST"))); - EXPECT_EQ(StringVal("www.baidu.com"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090"), - StringVal("host"))); - - EXPECT_EQ(StringVal("http"), - StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), - StringVal("PROTOCOL"))); - EXPECT_EQ(StringVal("http"), - StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), - StringVal("protocol"))); - - EXPECT_EQ(StringVal("a=b"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), - StringVal("QUERY"))); - EXPECT_EQ(StringVal("a=b"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), - StringVal("query"))); - - EXPECT_EQ(StringVal::null(), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), - StringVal("REF"))); - EXPECT_EQ(StringVal::null(), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), - StringVal("ref"))); - - EXPECT_EQ(StringVal::null(), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), - StringVal("USERINFO"))); - EXPECT_EQ(StringVal::null(), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), - StringVal("userinfo"))); - - EXPECT_EQ(StringVal("9090"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), - StringVal("PORT"))); - EXPECT_EQ(StringVal("9090"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c?a=b"), - StringVal("PORT"))); - EXPECT_EQ(StringVal::null(), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com?a=b"), - StringVal("PORT"))); - EXPECT_EQ(StringVal("9090"), - StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), - StringVal("port"))); -} - -TEST_F(StringFunctionsTest, bit_length) { - doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - - EXPECT_EQ(IntVal(40), StringFunctions::bit_length(context, StringVal("hello"))); - - EXPECT_EQ(IntVal::null(), StringFunctions::bit_length(context, StringVal::null())); - - EXPECT_EQ(IntVal(0), StringFunctions::bit_length(context, StringVal(""))); - - EXPECT_EQ(IntVal(88), StringFunctions::bit_length(context, StringVal("hello你好"))); - - delete context; -} - -TEST_F(StringFunctionsTest, lower) { - EXPECT_EQ(StringVal("hello"), StringFunctions::lower(ctx, StringVal("hello"))); - EXPECT_EQ(StringVal("hello"), StringFunctions::lower(ctx, StringVal("HELLO"))); - EXPECT_EQ(StringVal("hello123"), StringFunctions::lower(ctx, StringVal("HELLO123"))); - EXPECT_EQ(StringVal("hello, 123"), StringFunctions::lower(ctx, StringVal("HELLO, 123"))); - EXPECT_EQ(StringVal::null(), StringFunctions::lower(ctx, StringVal::null())); - EXPECT_EQ(StringVal(""), StringFunctions::lower(ctx, StringVal(""))); -} - -TEST_F(StringFunctionsTest, elt) { - StringVal str[] = {"hello", "world"}; - EXPECT_EQ(StringVal("hello"), StringFunctions::elt(ctx, 1, 2, str)); - EXPECT_EQ(StringVal("world"), StringFunctions::elt(ctx, 2, 2, str)); - EXPECT_EQ(StringVal::null(), StringFunctions::elt(ctx, 0, 2, str)); - EXPECT_EQ(StringVal::null(), StringFunctions::elt(ctx, 3, 2, str)); -} - -TEST_F(StringFunctionsTest, upper) { - // function test - EXPECT_EQ(StringVal("HELLO"), StringFunctions::upper(ctx, StringVal("HELLO"))); - EXPECT_EQ(StringVal("HELLO"), StringFunctions::upper(ctx, StringVal("hello"))); - EXPECT_EQ(StringVal("HELLO123"), StringFunctions::upper(ctx, StringVal("hello123"))); - EXPECT_EQ(StringVal("HELLO, 123"), StringFunctions::upper(ctx, StringVal("hello, 123"))); - EXPECT_EQ(StringVal::null(), StringFunctions::upper(ctx, StringVal::null())); - EXPECT_EQ(StringVal(""), StringFunctions::upper(ctx, StringVal(""))); -} - -TEST_F(StringFunctionsTest, ltrim) { - // no blank - StringVal src("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - StringVal res = simd::VStringFunctions::ltrim(src); - EXPECT_EQ(src, res); - // empty string - StringVal src1(""); - res = simd::VStringFunctions::ltrim(src1); - EXPECT_EQ(src1, res); - // null string - StringVal src2(StringVal::null()); - res = simd::VStringFunctions::ltrim(src2); - EXPECT_EQ(src2, res); - // less than 16 blanks - StringVal src3(" hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - res = simd::VStringFunctions::ltrim(src3); - EXPECT_EQ(src, res); - // more than 16 blanks - StringVal src4(" hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - res = simd::VStringFunctions::ltrim(src4); - EXPECT_EQ(src, res); - // all are blanks, less than 16 blanks - StringVal src5(" "); - res = simd::VStringFunctions::ltrim(src5); - EXPECT_EQ(StringVal(""), res); - // all are blanks, more than 16 blanks - StringVal src6(" "); - res = simd::VStringFunctions::ltrim(src6); - EXPECT_EQ(StringVal(""), res); - // src less than 16 length - StringVal src7(" 12345678910"); - res = simd::VStringFunctions::ltrim(src7); - EXPECT_EQ(StringVal("12345678910"), res); -} - -TEST_F(StringFunctionsTest, rtrim) { - // no blank - StringVal src("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - StringVal res = simd::VStringFunctions::rtrim(src); - EXPECT_EQ(src, res); - // empty string - StringVal src1(""); - res = simd::VStringFunctions::rtrim(src1); - EXPECT_EQ(src1, res); - // null string - StringVal src2(StringVal::null()); - res = simd::VStringFunctions::rtrim(src2); - EXPECT_EQ(src2, res); - // less than 16 blanks - StringVal src3("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa "); - res = simd::VStringFunctions::rtrim(src3); - EXPECT_EQ(src, res); - // more than 16 blanks - StringVal src4("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa "); - res = simd::VStringFunctions::rtrim(src4); - EXPECT_EQ(src, res); - // all are blanks, less than 16 blanks - StringVal src5(" "); - res = simd::VStringFunctions::rtrim(src5); - EXPECT_EQ(StringVal(""), res); - // all are blanks, more than 16 blanks - StringVal src6(" "); - res = simd::VStringFunctions::rtrim(src6); - EXPECT_EQ(StringVal(""), res); - // src less than 16 length - StringVal src7("12345678910 "); - res = simd::VStringFunctions::rtrim(src7); - EXPECT_EQ(StringVal("12345678910"), res); -} - -TEST_F(StringFunctionsTest, is_ascii) { - EXPECT_EQ(true, simd::VStringFunctions::is_ascii(StringVal("hello123"))); - EXPECT_EQ(true, simd::VStringFunctions::is_ascii( - StringVal("hello123fwrewerwerwerwrsfqrwerwefwfwrwfsfwe"))); - EXPECT_EQ(false, simd::VStringFunctions::is_ascii(StringVal("运维组123"))); - EXPECT_EQ(false, simd::VStringFunctions::is_ascii( - StringVal("hello123运维组fwrewerwerwerwrsfqrwerwefwfwrwfsfwe"))); - EXPECT_EQ(true, simd::VStringFunctions::is_ascii(StringVal::null())); - EXPECT_EQ(true, simd::VStringFunctions::is_ascii(StringVal(""))); -} -} // namespace doris diff --git a/be/test/runtime/array_test.cpp b/be/test/runtime/array_test.cpp deleted file mode 100644 index 87f219ade7..0000000000 --- a/be/test/runtime/array_test.cpp +++ /dev/null @@ -1,873 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "gen_cpp/olap_file.pb.h" -#include "gen_cpp/segment_v2.pb.h" -#include "io/fs/file_system.h" -#include "io/fs/file_writer.h" -#include "io/fs/local_file_system.h" -#include "olap/field.h" -#include "olap/rowset/segment_v2/column_reader.h" -#include "olap/rowset/segment_v2/column_writer.h" -#include "olap/tablet_schema.h" -#include "olap/types.h" -#include "runtime/collection_value.h" -#include "runtime/descriptors.h" -#include "runtime/mem_pool.h" -#include "runtime/primitive_type.h" -#include "runtime/raw_value.h" -#include "testutil/array_utils.h" -#include "testutil/desc_tbl_builder.h" -#include "util/file_utils.h" -#include "util/uid_util.h" -#include "vec/columns/column.h" -#include "vec/columns/column_array.h" -#include "vec/core/block.h" -#include "vec/data_types/data_type_factory.hpp" - -namespace doris { - -template -ColumnPB create_column_pb(const std::string& type, const Ts&... sub_column_types) { - ColumnPB column; - auto prefix = "NOT_NULL_"; - column.set_is_nullable(type.compare(0, strlen(prefix), prefix) != 0); - column.set_type(column.is_nullable() ? type : type.substr(strlen(prefix))); - column.set_aggregation("NONE"); - if (type == "ARRAY") { - column.set_length(OLAP_ARRAY_MAX_BYTES); - } - if constexpr (sizeof...(sub_column_types) > 0) { - auto sub_column = create_column_pb(sub_column_types...); - column.add_children_columns()->Swap(&sub_column); - } - return column; -} - -TypeInfoPtr get_type_info(const ColumnPB& column_pb) { - TabletColumn tablet_column; - tablet_column.init_from_pb(column_pb); - return get_type_info(&tablet_column); -} - -std::unique_ptr create_field(const ColumnPB& column_pb) { - TabletColumn column; - column.init_from_pb(column_pb); - return std::unique_ptr(FieldFactory::create(column)); -} - -TypeDescriptor get_scalar_type_desc(const TypeInfo* type_info) { - switch (type_info->type()) { - case OLAP_FIELD_TYPE_BOOL: - return TypeDescriptor(TYPE_BOOLEAN); - case OLAP_FIELD_TYPE_TINYINT: - return TypeDescriptor(TYPE_TINYINT); - case OLAP_FIELD_TYPE_SMALLINT: - return TypeDescriptor(TYPE_SMALLINT); - case OLAP_FIELD_TYPE_INT: - return TypeDescriptor(TYPE_INT); - case OLAP_FIELD_TYPE_BIGINT: - return TypeDescriptor(TYPE_BIGINT); - case OLAP_FIELD_TYPE_LARGEINT: - return TypeDescriptor(TYPE_LARGEINT); - case OLAP_FIELD_TYPE_FLOAT: - return TypeDescriptor(TYPE_FLOAT); - case OLAP_FIELD_TYPE_DOUBLE: - return TypeDescriptor(TYPE_DOUBLE); - case OLAP_FIELD_TYPE_CHAR: - return TypeDescriptor::create_char_type(TypeDescriptor::MAX_CHAR_LENGTH); - case OLAP_FIELD_TYPE_VARCHAR: - return TypeDescriptor::create_varchar_type(TypeDescriptor::MAX_VARCHAR_LENGTH); - case OLAP_FIELD_TYPE_STRING: - return TypeDescriptor::create_string_type(); - case OLAP_FIELD_TYPE_DATE: - return TypeDescriptor(TYPE_DATE); - case OLAP_FIELD_TYPE_DATETIME: - return TypeDescriptor(TYPE_DATETIME); - case OLAP_FIELD_TYPE_DECIMAL: - return TypeDescriptor(TYPE_DECIMALV2); - default: - DCHECK(false) << "Failed to get the scalar type descriptor."; - } -} - -const TupleDescriptor* get_tuple_descriptor(ObjectPool& object_pool, const TypeInfo* type_info) { - DescriptorTblBuilder builder(&object_pool); - auto& tuple_desc_builder = builder.declare_tuple(); - if (type_info->type() == OLAP_FIELD_TYPE_ARRAY) { - TypeDescriptor type_desc(TYPE_ARRAY); - type_desc.len = OLAP_ARRAY_MAX_BYTES; - const auto* ptype = dynamic_cast(type_info)->item_type_info(); - while (ptype->type() == OLAP_FIELD_TYPE_ARRAY) { - type_desc.children.push_back(TypeDescriptor(TYPE_ARRAY)); - ptype = dynamic_cast(ptype)->item_type_info(); - } - type_desc.children.push_back(get_scalar_type_desc(ptype)); - tuple_desc_builder << type_desc; - } else { - tuple_desc_builder << get_scalar_type_desc(type_info); - } - return builder.build()->get_tuple_descriptor(0); -} - -CollectionValue* parse(MemPool& mem_pool, FunctionContext& context, const std::string& text, - const ColumnPB& column_pb) { - auto collection_value = - reinterpret_cast(mem_pool.allocate(sizeof(CollectionValue))); - auto status = ArrayUtils::create_collection_value(collection_value, &context, text); - if (!status.ok()) { - return nullptr; - } - return collection_value; -} - -void validate(const Field* field, const CollectionValue* expect, const CollectionValue* actual) { - EXPECT_TRUE(field->type_info()->equal(expect, actual)); -} - -class ArrayTest : public ::testing::Test { -public: - ArrayTest() : _mem_pool(new MemPool()) {} - - template - void test(const ColumnPB& column_pb, const std::vector& literal_arrays) { - auto field = create_field(column_pb); - const auto* type_info = field->type_info(); - const auto* tuple_desc = get_tuple_descriptor(_object_pool, type_info); - EXPECT_EQ(tuple_desc->slots().size(), 1); - - FunctionContext context; - ArrayUtils::prepare_context(context, *_mem_pool, column_pb); - - std::vector arrays; - for (const auto& literal_array : literal_arrays) { - arrays.push_back(parse(*_mem_pool, context, literal_array, column_pb)); - } - - for (auto array : arrays) { - test_array(column_pb, field.get(), tuple_desc, array); - } - test_direct_copy_array(field.get(), arrays); - test_write_and_read_column(column_pb, field.get(), arrays); - } - -protected: - void SetUp() override { - if (FileUtils::check_exist(TEST_DIR)) { - EXPECT_TRUE(FileUtils::remove_all(TEST_DIR).ok()); - } - EXPECT_TRUE(FileUtils::create_dir(TEST_DIR).ok()); - } - - void TearDown() override { - if (FileUtils::check_exist(TEST_DIR)) { - EXPECT_TRUE(FileUtils::remove_all(TEST_DIR).ok()); - } - } - -private: - void test_copy_array(const TupleDescriptor* tuple_desc, const Field* field, - const CollectionValue* array) { - auto slot_desc = tuple_desc->slots().front(); - const auto& item_type_desc = slot_desc->type().children[0]; - auto total_size = tuple_desc->byte_size() + array->get_byte_size(item_type_desc); - - auto src = allocate_tuple(total_size); - EXPECT_NE(src, nullptr); - - RawValue::write(array, src, slot_desc, _mem_pool.get()); - auto src_cv = reinterpret_cast(src->get_slot(slot_desc->tuple_offset())); - validate(field, array, src_cv); - - auto dst = allocate_tuple(total_size); - EXPECT_NE(dst, nullptr); - - src->deep_copy(dst, *tuple_desc, _mem_pool.get()); - auto dst_cv = reinterpret_cast(dst->get_slot(slot_desc->tuple_offset())); - validate(field, src_cv, dst_cv); - - dst->init(total_size); - int64_t offset = 0; - char* serialized_data = reinterpret_cast(dst); - src->deep_copy(*tuple_desc, &serialized_data, &offset, true); - EXPECT_EQ(total_size, offset); - EXPECT_EQ(total_size, serialized_data - reinterpret_cast(dst)); - dst_cv = reinterpret_cast(dst->get_slot(slot_desc->tuple_offset())); - CollectionValue::deserialize_collection(dst_cv, reinterpret_cast(dst), - item_type_desc); - validate(field, src_cv, dst_cv); - } - - Tuple* allocate_tuple(size_t size) { - auto tuple = reinterpret_cast(_mem_pool->allocate(size)); - if (tuple) { - tuple->init(size); - } - return tuple; - } - - void test_direct_copy_array(const Field* field, - const std::vector& arrays) { - CollectionValue cell; - std::unique_ptr variable_ptr(new char[field->length()]); - field->allocate_memory(reinterpret_cast(&cell), variable_ptr.get()); - EXPECT_EQ(cell.null_signs(), reinterpret_cast(variable_ptr.get())); - for (auto array : arrays) { - field->type_info()->direct_copy(&cell, array); - EXPECT_EQ(cell.null_signs(), reinterpret_cast(variable_ptr.get())); - validate(field, array, &cell); - } - } - - template - void test_write_and_read_column(const ColumnPB& column_pb, const Field* field, - const std::vector& arrays) { - auto filename = generate_uuid_string(); - const std::string path = TEST_DIR + "/" + filename; - LOG(INFO) << "Test path: " << path; - - segment_v2::ColumnMetaPB meta; - init_column_meta(&meta, column_pb); - - TabletColumn tablet_column; - tablet_column.init_from_pb(column_pb); - Schema schema({tablet_column}, 0); - { - auto file_writer = creat_file_writer(path); - EXPECT_NE(file_writer, nullptr); - auto writer = create_column_writer(file_writer.get(), - meta, column_pb); - EXPECT_NE(writer, nullptr); - Status st; - for (auto array : arrays) { - st = writer->append(false, const_cast(array)); - EXPECT_TRUE(st.ok()); - } - EXPECT_TRUE(writer->finish().ok()); - EXPECT_TRUE(writer->write_data().ok()); - EXPECT_TRUE(writer->write_ordinal_index().ok()); - EXPECT_TRUE(writer->write_zone_map().ok()); - - EXPECT_TRUE(file_writer->close().ok()); - } - { - auto type_info = get_type_info(column_pb); - auto tuple_desc = get_tuple_descriptor(_object_pool, type_info.get()); - - auto reader = create_column_reader(path, meta, arrays.size()); - EXPECT_NE(reader, nullptr); - auto rblock = create_readable_block(path); - EXPECT_NE(rblock, nullptr); - OlapReaderStatistics stats; - std::unique_ptr iter( - new_iterator(rblock.get(), &stats, reader.get())); - EXPECT_NE(iter, nullptr); - auto st = iter->seek_to_first(); - EXPECT_TRUE(st.ok()) << st.to_string(); - - auto data_type = - vectorized::DataTypeFactory::instance().create_data_type(tablet_column); - auto column_ptr = data_type->create_column(); - size_t rows_read = 1024; - column_ptr->reserve(rows_read); - do { - bool has_null = false; - st = iter->next_batch(&rows_read, column_ptr, &has_null); - EXPECT_TRUE(st.ok()); - vectorized::Block vblock; - vblock.insert({const_cast(*column_ptr).get_ptr(), - data_type, ""}); - for (int i = 0; i < arrays.size(); ++i) { - auto tuple = vblock.deep_copy_tuple(*tuple_desc, _mem_pool.get(), i, 0, false); - auto actual = - tuple->get_collection_slot(tuple_desc->slots().front()->tuple_offset()); - validate(field, arrays[i], actual); - } - } while (rows_read >= 1024); - } - } - - template - void init_column_meta(segment_v2::ColumnMetaPB* meta, const ColumnPB& column_pb) { - int column_id = 0; - TabletColumn column; - column.init_from_pb(column_pb); - init_column_meta(meta, &column_id, column); - } - - template - void init_column_meta(segment_v2::ColumnMetaPB* meta, int* column_id, - const TabletColumn& column) { - meta->set_column_id(*column_id); - meta->set_unique_id((*column_id)++); - meta->set_type(column.type()); - meta->set_length(column.length()); - if (column.type() == OLAP_FIELD_TYPE_ARRAY) { - meta->set_encoding(array_encoding); - } else { - meta->set_encoding(item_encoding); - } - meta->set_compression(segment_v2::LZ4F); - meta->set_is_nullable(column.is_nullable()); - for (uint32_t i = 0; i < column.get_subtype_count(); ++i) { - init_column_meta(meta->add_children_columns(), column_id, - column.get_sub_column(i)); - } - } - - io::FileWriterPtr creat_file_writer(const std::string& path) { - io::FileWriterPtr file_writer; - io::global_local_filesystem()->create_file(path, &file_writer); - return file_writer; - } - - template - std::unique_ptr create_column_writer(io::FileWriter* file_writer, - segment_v2::ColumnMetaPB& meta, - const ColumnPB& column_pb) { - segment_v2::ColumnWriterOptions writer_opts = {.meta = &meta}; - TabletColumn column; - column.init_from_pb(column_pb); - std::unique_ptr writer; - auto st = segment_v2::ColumnWriter::create(writer_opts, &column, file_writer, &writer); - if (!st.ok()) { - return nullptr; - } - st = writer->init(); - return st.ok() ? std::move(writer) : nullptr; - } - - std::unique_ptr create_column_reader( - const std::string& path, const segment_v2::ColumnMetaPB& meta, size_t num_rows) { - segment_v2::ColumnReaderOptions reader_opts; - std::unique_ptr reader; - auto st = segment_v2::ColumnReader::create(reader_opts, meta, num_rows, - io::global_local_filesystem(), path, &reader); - return st.ok() ? std::move(reader) : nullptr; - } - - io::FileReaderSPtr create_readable_block(const std::string& path) { - io::FileReaderSPtr reader; - auto st = io::global_local_filesystem()->open_file(path, &reader, nullptr); - return st.ok() ? std::move(reader) : nullptr; - } - - segment_v2::ColumnIterator* new_iterator(io::FileReader* rblock, OlapReaderStatistics* stats, - segment_v2::ColumnReader* reader) { - segment_v2::ColumnIterator* iter = nullptr; - auto st = reader->new_iterator(&iter); - if (!st.ok()) { - return nullptr; - } - segment_v2::ColumnIteratorOptions iter_opts; - iter_opts.stats = stats; - iter_opts.file_reader = rblock; - st = iter->init(iter_opts); - return st.ok() ? iter : nullptr; - } - - template - void test_array(const ColumnPB& column_pb, const Field* field, - const TupleDescriptor* tuple_desc, const CollectionValue* array) { - EXPECT_NE(array, nullptr); - test_copy_array(tuple_desc, field, array); - test_direct_copy_array(field, {array}); - test_write_and_read_column(column_pb, field, {array}); - } - -private: - static constexpr size_t MAX_MEMORY_BYTES = 1024 * 1024; - static const std::string TEST_DIR; - std::unique_ptr _mem_pool; - ObjectPool _object_pool; -}; - -const std::string ArrayTest::TEST_DIR = "./ut_dir/array_test"; - -TEST_F(ArrayTest, TestBoolean) { - // depth 1 - auto column_pb = create_column_pb("ARRAY", "BOOLEAN"); - std::vector literal_arrays = { - "[]", - "[null]", - "[true, false, false]", - "[true, null, false]", - "[false, null, null]", - "[null, null, true]", - "[null, null, null]", - }; - test(column_pb, literal_arrays); - - // depth 2 - column_pb = create_column_pb("ARRAY", "ARRAY", "BOOLEAN"); - literal_arrays = { - "[]", - "[[]]", - "[[false, true, false], [true, false, true]]", - "[[false, true, false], null, [true, false, true]]", - "[[false, true, null], null, [true, null, false], null, [null, false, false]]", - }; - test(column_pb, literal_arrays); - - // depth 3 - column_pb = create_column_pb("ARRAY", "ARRAY", "ARRAY", "BOOLEAN"); - literal_arrays = { - "[]", - "[[]]", - "[[[]]]", - "[[[null]], [[false], [true, false]], [[false, true, false], null, null]]", - }; - test(column_pb, literal_arrays); -} - -TEST_F(ArrayTest, TestNotNullBoolean) { - // depth 1 - auto column_pb = create_column_pb("ARRAY", "NOT_NULL_BOOLEAN"); - std::vector literal_arrays = { - "[]", - "[true, false, false]", - }; - test(column_pb, literal_arrays); - - // depth 2 - column_pb = create_column_pb("ARRAY", "ARRAY", "NOT_NULL_BOOLEAN"); - literal_arrays = { - "[]", - "[[]]", - "[[false, true, false]]", - "[[false, true, false], [true, false, true]]", - }; - test(column_pb, literal_arrays); - - // depth 3 - column_pb = create_column_pb("ARRAY", "ARRAY", "ARRAY", "NOT_NULL_BOOLEAN"); - literal_arrays = { - "[]", - "[[]]", - "[[[]]]", - "[[[]], [[false], [true, false]], [[false, true, false]]]", - }; - test(column_pb, literal_arrays); -} - -void test_integer(const std::string& type, ArrayTest& test_suite) { - // depth 1 - auto column_pb = create_column_pb("ARRAY", type); - std::vector literal_arrays = { - "[]", - "[null]", - "[1, 2, 3]", - "[1, null, 3]", - "[1, null, null]", - "[null, null, 3]", - "[null, null, null]", - }; - test_suite.test(column_pb, - literal_arrays); - - // depth 2 - column_pb = create_column_pb("ARRAY", "ARRAY", type); - literal_arrays = { - "[]", - "[[]]", - "[[1, 2, 3], [4, 5, 6]]", - "[[1, 2, 3], null, [4, 5, 6]]", - "[[1, 2, null], null, [4, null, 6], null, [null, 8, 9]]", - }; - test_suite.test(column_pb, - literal_arrays); - - // depth 3 - column_pb = create_column_pb("ARRAY", "ARRAY", "ARRAY", type); - literal_arrays = { - "[]", - "[[]]", - "[[[]]]", - "[[[null]], [[1], [2, 3]], [[4, 5, 6], null, null]]", - }; - test_suite.test(column_pb, - literal_arrays); -} - -TEST_F(ArrayTest, TestInteger) { - test_integer("TINYINT", *this); - test_integer("SMALLINT", *this); - test_integer("INT", *this); - test_integer("BIGINT", *this); - test_integer("LARGEINT", *this); -} - -void test_not_null_integer(const std::string& type, ArrayTest& test_suite) { - // depth 1 - auto column_pb = create_column_pb("ARRAY", type); - std::vector literal_arrays = { - "[]", - "[1, 2, 3]", - }; - test_suite.test(column_pb, - literal_arrays); - - // depth 2 - column_pb = create_column_pb("ARRAY", "ARRAY", type); - literal_arrays = { - "[]", - "[[]]", - "[[1, 2, 3]]", - "[[1, 2, 3], [4, 5, 6]]", - }; - test_suite.test(column_pb, - literal_arrays); - - // depth 3 - column_pb = create_column_pb("ARRAY", "ARRAY", "ARRAY", type); - literal_arrays = { - "[]", "[[]]", "[[[]]]", "[[[1, 2, 3]]]", "[[[]], [[1], [2, 3]], [[4, 5, 6]]]", - }; - test_suite.test(column_pb, - literal_arrays); -} - -TEST_F(ArrayTest, TestNotNullInteger) { - test_not_null_integer("NOT_NULL_TINYINT", *this); - test_not_null_integer("NOT_NULL_SMALLINT", *this); - test_not_null_integer("NOT_NULL_INT", *this); - test_not_null_integer("NOT_NULL_BIGINT", *this); - test_not_null_integer("NOT_NULL_LARGEINT", *this); -} - -void test_float(const std::string& type, ArrayTest& test_suite) { - // depth 1 - auto column_pb = create_column_pb("ARRAY", type); - std::vector literal_arrays = { - "[]", - "[null]", - "[1.5, 2.5, 3.5]", - "[1.5, null, 3.5]", - "[1.5, null, null]", - "[null, null, 3.5]", - "[null, null, null]", - }; - test_suite.test(column_pb, - literal_arrays); - // depth 2 - column_pb = create_column_pb("ARRAY", "ARRAY", type); - literal_arrays = { - "[]", - "[[]]", - "[[1.5, 2.5, 3.5], [4.5, 5.5, 6.5]]", - "[[1.5, 2.5, 3.5], null, [4.5, 5.5, 6.5]]", - "[[1.5, 2.5, null], null, [4.5, null, 6.5], null, [null, 8.5, 9.5]]", - }; - test_suite.test(column_pb, - literal_arrays); - - // depth 3 - column_pb = create_column_pb("ARRAY", "ARRAY", "ARRAY", type); - literal_arrays = { - "[]", - "[[]]", - "[[[]]]", - "[[[null]], [[1.5], [2.5, 3.5]], [[4.5, 5.5, 6.5], null, null]]", - }; - test_suite.test(column_pb, - literal_arrays); -} - -TEST_F(ArrayTest, TestFloat) { - test_float("FLOAT", *this); - test_float("DOUBLE", *this); -} - -void test_not_null_float(const std::string& type, ArrayTest& test_suite) { - // depth 1 - auto column_pb = create_column_pb("ARRAY", type); - std::vector literal_arrays = { - "[]", - "[1.5, 2.5, 3.5]", - }; - test_suite.test(column_pb, - literal_arrays); - // depth 2 - column_pb = create_column_pb("ARRAY", "ARRAY", type); - literal_arrays = { - "[]", - "[[]]", - "[[1.5, 2.5, 3.5]]", - "[[1.5, 2.5, 3.5], [4.5, 5.5, 6.5]]", - }; - test_suite.test(column_pb, - literal_arrays); - - // depth 3 - column_pb = create_column_pb("ARRAY", "ARRAY", "ARRAY", type); - literal_arrays = { - "[]", "[[]]", "[[[]]]", "[[[1.5]]]", "[[[]], [[1.5], [2.5, 3.5]], [[4.5, 5.5, 6.5]]]", - }; - test_suite.test(column_pb, - literal_arrays); -} - -TEST_F(ArrayTest, TestNotNullFloat) { - test_not_null_float("NOT_NULL_FLOAT", *this); - test_not_null_float("NOT_NULL_DOUBLE", *this); -} - -void test_string(const std::string& type, ArrayTest& test_suite) { - // depth 1 - auto column_pb = create_column_pb("ARRAY", type); - std::vector literal_arrays = { - "[]", - "[null]", - "[\"a\", \"b\", \"c\"]", - "[null, \"b\", \"c\"]", - "[\"a\", null, \"c\"]", - "[\"a\", \"b\", null]", - "[null, \"b\", null]", - "[null, null, null]", - }; - test_suite.test(column_pb, - literal_arrays); - - // more depths - column_pb = create_column_pb("ARRAY", "ARRAY", "ARRAY", type); - literal_arrays = { - "[]", - "[[]]", - "[[[]]]", - "[null, [null], [[null]]]", - "[[[\"a\", null, \"c\"], [\"d\", \"e\", \"f\"]], null, [[\"g\"]]]", - }; - test_suite.test(column_pb, - literal_arrays); -} - -TEST_F(ArrayTest, TestString) { - test_string("CHAR", *this); - test_string("VARCHAR", *this); - test_string("STRING", *this); -} - -void test_not_null_string(const std::string& type, ArrayTest& test_suite) { - // depth 1 - auto column_pb = create_column_pb("ARRAY", type); - std::vector literal_arrays = { - "[]", - "[\"a\", \"b\", \"c\"]", - }; - test_suite.test(column_pb, - literal_arrays); - - // more depths - column_pb = create_column_pb("ARRAY", "ARRAY", "ARRAY", type); - literal_arrays = { - "[]", - "[[]]", - "[[[]]]", - "[[[\"a\", \"b\", \"c\"]]]", - "[[[\"a\", \"c\"], [\"d\", \"e\", \"f\"]], [[\"g\"]]]", - }; - test_suite.test(column_pb, - literal_arrays); -} - -TEST_F(ArrayTest, TestNotNullString) { - test_not_null_string("NOT_NULL_CHAR", *this); - test_not_null_string("NOT_NULL_VARCHAR", *this); - test_not_null_string("NOT_NULL_STRING", *this); -} - -void test_datetime(const std::string& type, ArrayTest& test_suite) { - auto column_pb = create_column_pb("ARRAY", type); - std::vector literal_arrays; - if (type == "DATE") { - literal_arrays = { - "[]", - "[null]", - "[\"2022-04-01\", \"2022-04-02\", \"2022-04-03\"]", - "[\"2022-04-01\", null, \"2022-04-03\"]", - "[\"2022-04-01\", null, null]", - "[null, null, \"2022-04-03\"]", - "[null, null, null]", - }; - } else { - literal_arrays = { - "[]", - "[null]", - "[\"2022-04-01 19:30:40\", \"2022-04-02 19:30:40 \", \"2022-04-03 19:30:40\"]", - "[\"2022-04-01 19:30:40\", null, \"2022-04-03 19:30:40\"]", - "[\"2022-04-01 19:30:40\", null, null]", - "[null, null, \"2022-04-03 19:30:40\"]", - "[null, null, null]", - }; - } - test_suite.test(column_pb, - literal_arrays); - // depth 2 - column_pb = create_column_pb("ARRAY", "ARRAY", type); - if (type == "DATE") { - literal_arrays = { - "[]", - "[[]]", - "[[\"2022-04-01\", \"2022-04-02\", \"2022-04-03\"], [\"2022-04-04\", " - "\"2022-04-05\", " - "\"2022-04-06\"]]", - "[[\"2022-04-01\", \"2022-04-02\", \"2022-04-03\"], null, [\"2022-04-04\", " - "\"2022-04-05\", \"2022-04-06\"]]", - "[[\"2022-04-01\", \"2022-04-02\", null], null, [\"2022-04-04\", null, " - "\"2022-04-06\"], null, [null, \"2022-04-08\", \"2022-04-09\"]]", - }; - } else { - literal_arrays = { - "[]", - "[[]]", - "[[\"2022-04-01 19:30:40\", \"2022-04-02 19:30:40\", \"2022-04-03 19:30:40\"], " - "[\"2022-04-04 19:30:40\", " - "\"2022-04-05\", " - "\"2022-04-06\"]]", - "[[\"2022-04-01 19:30:40\", \"2022-04-02 19:30:40\", \"2022-04-03 19:30:40\"], " - "null, [\"2022-04-04 19:30:40\", " - "\"2022-04-05\", \"2022-04-06\"]]", - "[[\"2022-04-01 19:30:40\", \"2022-04-02 19:30:40\", null], null, [\"2022-04-04 " - "19:30:40\", null, " - "\"2022-04-06 19:30:40\"], null, [null, \"2022-04-08 19:30:40\", \"2022-04-09 " - "19:30:40\"]]", - }; - } - test_suite.test(column_pb, - literal_arrays); - - // depth 3 - column_pb = create_column_pb("ARRAY", "ARRAY", "ARRAY", type); - if (type == "DATE") { - literal_arrays = { - "[]", - "[[]]", - "[[[]]]", - "[[[null]], [[\"2022-04-01\"], [\"2022-04-02\", \"2022-04-03\"]], " - "[[\"2022-04-04\", " - "\"2022-04-05\", \"2022-04-06\"], null, null]]", - }; - } else { - literal_arrays = { - "[]", - "[[]]", - "[[[]]]", - "[[[null]], [[\"2022-04-01 19:30:40\"], [\"2022-04-02 19:30:40\", \"2022-04-03 " - "19:30:40\"]], " - "[[\"2022-04-04 19:30:40\", " - "\"2022-04-05 19:30:40\", \"2022-04-06 19:30:40\"], null, null]]", - }; - } - test_suite.test(column_pb, - literal_arrays); -} - -TEST_F(ArrayTest, TestDateTime) { - test_datetime("DATE", *this); - test_datetime("DATETIME", *this); -} - -void test_not_null_datetime(const std::string& type, ArrayTest& test_suite) { - auto column_pb = create_column_pb("ARRAY", type); - std::vector literal_arrays; - if (type == "DATE") { - literal_arrays = { - "[]", - "[\"2022-04-01\", \"2022-04-02\", \"2022-04-03\"]", - }; - } else { - literal_arrays = { - "[]", - "[\"2022-04-01 19:30:40\", \"2022-04-02 19:30:40 \", \"2022-04-03 19:30:40\"]", - }; - } - test_suite.test(column_pb, - literal_arrays); - // depth 2 - column_pb = create_column_pb("ARRAY", "ARRAY", type); - if (type == "DATE") { - literal_arrays = { - "[]", - "[[]]", - "[[\"2022-04-01\", \"2022-04-02\", \"2022-04-03\"], [\"2022-04-04\", " - "\"2022-04-05\", " - "\"2022-04-06\"]]", - }; - } else { - literal_arrays = { - "[]", - "[[]]", - "[[\"2022-04-01 19:30:40\", \"2022-04-02 19:30:40\", \"2022-04-03 19:30:40\"], " - "[\"2022-04-04 19:30:40\", " - "\"2022-04-05\", " - "\"2022-04-06\"]]", - }; - } - test_suite.test(column_pb, - literal_arrays); - - // depth 3 - column_pb = create_column_pb("ARRAY", "ARRAY", "ARRAY", type); - if (type == "DATE") { - literal_arrays = { - "[]", - "[[]]", - "[[[]]]", - "[[[\"2022-04-01\"]]]", - "[[[]], [[\"2022-04-01\"], [\"2022-04-02\", \"2022-04-03\"]], " - "[[\"2022-04-04\", " - "\"2022-04-05\", \"2022-04-06\"]]]", - }; - } else { - literal_arrays = { - "[]", - "[[]]", - "[[[]]]", - "[[[\"2022-04-01 19:30:40\"]]]", - "[[[]], [[\"2022-04-01 19:30:40\"], [\"2022-04-02 19:30:40\", \"2022-04-03 " - "19:30:40\"]], " - "[[\"2022-04-04 19:30:40\", " - "\"2022-04-05 19:30:40\", \"2022-04-06 19:30:40\"]]]", - }; - } - test_suite.test(column_pb, - literal_arrays); -} - -TEST_F(ArrayTest, TestNotNullDateTime) { - test_not_null_datetime("NOT_NULL_DATE", *this); - test_not_null_datetime("NOT_NULL_DATETIME", *this); -} - -TEST_F(ArrayTest, TestDecimal) { - test_integer("DECIMAL", *this); - test_not_null_integer("NOT_NULL_DECIMAL", *this); - test_float("DECIMAL", *this); - test_not_null_float("NOT_NULL_DECIMAL", *this); -} - -} // namespace doris diff --git a/be/test/runtime/collection_value_test.cpp b/be/test/runtime/collection_value_test.cpp deleted file mode 100644 index ce2fc7adec..0000000000 --- a/be/test/runtime/collection_value_test.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "runtime/collection_value.h" - -#include - -#include "common/object_pool.h" -#include "string" -#include "util/bitmap.h" - -#define private public - -namespace doris { - -TEST(CollectionValueTest, init) { - { - CollectionValue cv; - - ObjectPool pool; - EXPECT_TRUE(CollectionValue::init_collection(&pool, 10, TYPE_INT, &cv).ok()); - - EXPECT_EQ(10, cv.size()); - - for (int j = 0; j < 10; ++j) { - EXPECT_FALSE(*(cv._null_signs + j)); - } - - EXPECT_FALSE(CollectionValue::init_collection(&pool, 10, TYPE_INT, nullptr).ok()); - - CollectionValue cv_null; - bzero(&cv_null, sizeof(cv_null)); - EXPECT_TRUE(CollectionValue::init_collection(&pool, 0, TYPE_INT, &cv_null).ok()); - EXPECT_EQ(0, cv_null.size()); - } - - { - CollectionValue cv; - ObjectPool pool; - EXPECT_TRUE(CollectionValue::init_collection(&pool, 10, TYPE_INT, &cv).ok()); - } -} - -TEST(CollectionValueTest, set) { - CollectionValue cv; - ObjectPool pool; - EXPECT_TRUE(CollectionValue::init_collection(&pool, 10, TYPE_INT, &cv).ok()); - - // normal - { - auto iterator = cv.iterator(TYPE_INT); - IntVal v0 = IntVal::null(); - iterator.set(&v0); - for (int j = 1; j < cv.size(); ++j) { - IntVal i(j + 10); - iterator.seek(j); - iterator.set(&i); - } - } - - { - auto iter = cv.iterator(TYPE_INT); - IntVal v0; - iter.get(&v0); - EXPECT_TRUE(v0.is_null); - EXPECT_TRUE(iter.is_null()); - iter.next(); - for (int k = 1; k < cv.size(); ++k, iter.next()) { - IntVal v; - iter.get(&v); - EXPECT_EQ(k + 10, v.val); - } - } - - // over size - { - IntVal intv(20); - auto iterator = cv.iterator(TYPE_INT); - EXPECT_FALSE(iterator.seek(10)); - } -} -} // namespace doris diff --git a/be/test/testutil/array_utils.cpp b/be/test/testutil/array_utils.cpp index 834f8c6157..8fcb7061b5 100644 --- a/be/test/testutil/array_utils.cpp +++ b/be/test/testutil/array_utils.cpp @@ -25,7 +25,6 @@ #include "runtime/mem_pool.h" #include "runtime/memory/mem_tracker.h" #include "udf/udf_internal.h" -#include "util/array_parser.h" namespace doris { @@ -38,19 +37,6 @@ void ArrayUtils::prepare_context(FunctionContext& context, MemPool& mem_pool, context.impl()->_pool = new FreePool(&mem_pool); } -Status ArrayUtils::create_collection_value(CollectionValue* collection_value, - FunctionContext* context, - const std::string& json_string) { - CollectionVal collection_val; - auto status = ArrayParser::parse(collection_val, context, StringVal(json_string.c_str())); - if (!status.ok()) { - return status; - } - new (collection_value) CollectionValue(collection_val.data, collection_val.length, - collection_val.has_null, collection_val.null_signs); - return Status::OK(); -} - TypeDesc ArrayUtils::create_function_type_desc(const ColumnPB& column_pb) { TypeDesc type_desc; type_desc.len = column_pb.length(); diff --git a/be/test/testutil/array_utils.h b/be/test/testutil/array_utils.h index 85cc0434d5..f6e79a0804 100644 --- a/be/test/testutil/array_utils.h +++ b/be/test/testutil/array_utils.h @@ -33,8 +33,6 @@ public: using TypeDesc = FunctionContext::TypeDesc; static void prepare_context(FunctionContext& context, MemPool& mem_pool, const ColumnPB& column_pb); - static Status create_collection_value(CollectionValue* collection_value, - FunctionContext* context, const std::string& json_string); private: static TypeDesc create_function_type_desc(const ColumnPB& column_pb); diff --git a/be/test/util/array_parser_test.cpp b/be/test/util/array_parser_test.cpp deleted file mode 100644 index 1b0f93823b..0000000000 --- a/be/test/util/array_parser_test.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -#include -#include - -#include "olap/tablet_schema.h" -#include "olap/types.h" -#include "testutil/array_utils.h" -#include "vec/common/string_ref.h" - -namespace doris { - -template -ColumnPB create_column_pb(const std::string& type, const Ts&... sub_column_types) { - ColumnPB column; - column.set_type(type); - column.set_aggregation("NONE"); - column.set_is_nullable(true); - if (type == "ARRAY") { - column.set_length(OLAP_ARRAY_MAX_BYTES); - } - if constexpr (sizeof...(sub_column_types) > 0) { - auto sub_column = create_column_pb(sub_column_types...); - column.add_children_columns()->Swap(&sub_column); - } - return column; -} - -static TypeInfoPtr get_type_info(const ColumnPB& column_pb) { - TabletColumn tablet_column; - tablet_column.init_from_pb(column_pb); - return get_type_info(&tablet_column); -} - -static void test_array_parser(const ColumnPB& column_pb, const std::string& json, - const CollectionValue& expect) { - MemPool mem_pool; - FunctionContext context; - ArrayUtils::prepare_context(context, mem_pool, column_pb); - CollectionValue actual; - auto status = ArrayUtils::create_collection_value(&actual, &context, json); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(get_type_info(column_pb)->equal(&expect, &actual)); -} - -TEST(ArrayParserTest, TestParseIntArray) { - auto column_pb = create_column_pb("ARRAY", "INT"); - test_array_parser(column_pb, "[]", CollectionValue(0)); - - int32_t data[] = {1, 2, 3}; - int num_items = sizeof(data) / sizeof(data[0]); - CollectionValue value(data, num_items, false, nullptr); - test_array_parser(column_pb, "[1, 2, 3]", value); - - bool null_signs[] = {false, true, false}; - value.set_has_null(true); - value.set_null_signs(null_signs); - test_array_parser(column_pb, "[1, null, 3]", value); -} - -TEST(ArrayParserTest, TestParseVarcharArray) { - auto column_pb = create_column_pb("ARRAY", "VARCHAR"); - test_array_parser(column_pb, "[]", CollectionValue(0)); - - char data[] = {'a', 'b', 'c'}; - int num_items = sizeof(data) / sizeof(data[0]); - StringRef string_values[] = { - {&data[0], 1}, - {&data[1], 1}, - {&data[2], 1}, - }; - CollectionValue value(string_values, num_items, false, nullptr); - test_array_parser(column_pb, "[\"a\", \"b\", \"c\"]", value); - - bool null_signs[] = {false, true, false}; - value.set_has_null(true); - value.set_null_signs(null_signs); - test_array_parser(column_pb, "[\"a\", null, \"c\"]", value); -} - -TEST(ArrayParserTest, TestNestedArray) { - auto column_pb = create_column_pb("ARRAY", "ARRAY", "INT"); - test_array_parser(column_pb, "[]", CollectionValue(0)); - - CollectionValue empty_array(0); - test_array_parser(column_pb, "[[]]", {&empty_array, 1, false, nullptr}); - - int data[] = {1, 0, 3}; - uint32_t num_items = sizeof(data) / sizeof(data[0]); - bool null_signs[] = {false, true, false}; - CollectionValue array = {data, num_items, true, null_signs}; - - CollectionValue array_data[] = {empty_array, array, empty_array, array}; - uint32_t num_arrays = sizeof(array_data) / sizeof(array_data[0]); - test_array_parser(column_pb, "[[], [1, null, 3], [], [1, null, 3]]", - {array_data, num_arrays, false, nullptr}); - bool array_null_signs[] = {false, true, true, false}; - test_array_parser(column_pb, "[[], null, null, [1, null, 3]]", - {array_data, num_arrays, true, array_null_signs}); -} - -TEST(ArrayParserTest, TestLargeIntArray) { - auto column_pb = create_column_pb("ARRAY", "LARGEINT"); - test_array_parser(column_pb, "[]", CollectionValue(0)); - - __int128_t data[] = {(1L << 31) - 1, (1LU << 63) - 1, (1LU << 63) | ((1LU << 63) - 1)}; - int num_items = sizeof(data) / sizeof(data[0]); - CollectionValue value(data, num_items, false, nullptr); - test_array_parser(column_pb, "[2147483647, 9223372036854775807, 18446744073709551615]", value); - - bool null_signs[] = {false, true, false}; - value.set_has_null(true); - value.set_null_signs(null_signs); - test_array_parser(column_pb, "[2147483647, null, 18446744073709551615]", value); - - data[1] = static_cast<__int128_t>(1) << 66; - null_signs[1] = false; - test_array_parser(column_pb, - "[\"2147483647\", \"73786976294838206464\", \"18446744073709551615\"]", - value); -} - -TEST(ArrayParserTest, TestDecimalArray) { - auto column_pb = create_column_pb("ARRAY", "DECIMAL"); - test_array_parser(column_pb, "[]", CollectionValue(0)); - - std::string literals[] = {"2147483647", "9223372036854775807"}; - uint32_t num_items = sizeof(literals) / sizeof(literals[0]); - decimal12_t data[num_items]; - for (int i = 0; i < num_items; ++i) { - auto decimal_value = DecimalV2Value(literals[i]); - data[i].integer = decimal_value.int_value(); - data[i].fraction = decimal_value.frac_value(); - } - CollectionValue value(data, num_items, false, nullptr); - test_array_parser(column_pb, "[2147483647, 9223372036854775807]", value); - - bool null_signs[] = {false, true}; - value.set_has_null(true); - value.set_null_signs(null_signs); - test_array_parser(column_pb, "[2147483647, null]", value); - - null_signs[1] = false; - test_array_parser(column_pb, "[\"2147483647\", \"9223372036854775807\"]", value); - - literals[0] = "2147483647.5"; - literals[1] = "34359738368.5"; - for (int i = 0; i < num_items; ++i) { - auto decimal_value = DecimalV2Value(literals[i]); - data[i].integer = decimal_value.int_value(); - data[i].fraction = decimal_value.frac_value(); - } - value = {data, num_items, false, nullptr}; - test_array_parser(column_pb, "[2147483647.5, \"34359738368.5\"]", value); -} - -TEST(ArrayParserTest, TestFreePool) { - auto column_pb = create_column_pb("ARRAY", "DECIMAL"); - MemPool mem_pool; - FunctionContext context; - ArrayUtils::prepare_context(context, mem_pool, column_pb); - int alignment = 1; - for (int i = 1; i <= 4; ++i) { - alignment <<= 1; - auto* p = context.aligned_allocate(alignment, alignment); - EXPECT_TRUE(reinterpret_cast(p) % alignment == 0); - p = context.aligned_allocate(alignment, alignment); - EXPECT_TRUE(reinterpret_cast(p) % alignment == 0); - } -} - -} // namespace doris