From a36193dfab1a0124d260871396f50ef2b09e4b86 Mon Sep 17 00:00:00 2001 From: HangyuanLiu <460660596@qq.com> Date: Wed, 15 Jan 2020 07:40:30 +0800 Subject: [PATCH] Support decimal and timestamp type in orc load (#2759) --- be/src/exec/orc_scanner.cpp | 40 +++- be/test/exec/orc_scanner_test.cpp | 179 ++++++++++++++++++ .../orc_scanner/decimal_and_timestamp.orc | Bin 0 -> 693 bytes .../load-data/load-manual.md | 2 +- .../Data Manipulation/BROKER LOAD.md | 2 +- .../sql-statements/Data Manipulation/LOAD.md | 2 +- .../load-data/load-manual_EN.md | 2 +- .../Data Manipulation/BROKER LOAD_EN.md | 2 +- .../Data Manipulation/LOAD_EN.md | 2 +- 9 files changed, 222 insertions(+), 9 deletions(-) create mode 100644 be/test/exec/test_data/orc_scanner/decimal_and_timestamp.orc diff --git a/be/src/exec/orc_scanner.cpp b/be/src/exec/orc_scanner.cpp index 127af6b02d..ae0215169a 100644 --- a/be/src/exec/orc_scanner.cpp +++ b/be/src/exec/orc_scanner.cpp @@ -233,9 +233,43 @@ Status ORCScanner::get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof) { str_slot->len = wbytes; break; } - //TODO (lhy) : support more orc type - case orc::TIMESTAMP: - case orc::DECIMAL: + case orc::DECIMAL: { + int precision = ((orc::Decimal64VectorBatch*) cvb)->precision; + int scale = ((orc::Decimal64VectorBatch*) cvb)->scale; + + //Decimal64VectorBatch handles decimal columns with precision no greater than 18. + //Decimal128VectorBatch handles the others. + std::string decimal_str; + if (precision <= 18) { + decimal_str = std::to_string(((orc::Decimal64VectorBatch*) cvb)->values[_current_line_of_group]); + } else { + decimal_str = ((orc::Decimal128VectorBatch*) cvb)->values[_current_line_of_group].toString(); + } + //Orc api will fill in 0 at the end, so size must greater than scale + std::string v = decimal_str.substr(0, decimal_str.size() - scale) + "." + + decimal_str.substr(decimal_str.size() - scale); + str_slot->ptr = reinterpret_cast(tuple_pool->allocate(v.size())); + memcpy(str_slot->ptr, v.c_str(), v.size()); + str_slot->len = v.size(); + break; + } + case orc::TIMESTAMP: { + int64_t timestamp = ((orc::TimestampVectorBatch*) cvb)->data[_current_line_of_group]; + std::string timezone = _state->timezone(); + DateTimeValue dtv; + if (!dtv.from_unixtime(timestamp, timezone)) { + std::stringstream str_error; + str_error << "Parse timestamp (" + std::to_string(timestamp) + ") error"; + LOG(WARNING) << str_error.str(); + return Status::InternalError(str_error.str()); + } + char* buf_end = dtv.to_string((char*) tmp_buf); + wbytes = buf_end - (char*) tmp_buf -1; + str_slot->ptr = reinterpret_cast(tuple_pool->allocate(wbytes)); + memcpy(str_slot->ptr, tmp_buf, wbytes); + str_slot->len = wbytes; + break; + } default: { std::stringstream str_error; str_error << "The field name(" << slot_desc->col_name() << ") type not support. "; diff --git a/be/test/exec/orc_scanner_test.cpp b/be/test/exec/orc_scanner_test.cpp index 9b6ce2cd49..8d2cd5aba8 100644 --- a/be/test/exec/orc_scanner_test.cpp +++ b/be/test/exec/orc_scanner_test.cpp @@ -29,6 +29,7 @@ #include "exec/local_file_reader.h" #include "exec/orc_scanner.h" #include "exprs/cast_functions.h" +#include "exprs/decimal_operators.h" #include "runtime/descriptors.h" #include "runtime/runtime_state.h" #include "runtime/row_batch.h" @@ -48,6 +49,7 @@ public: static void SetUpTestCase() { UserFunctionCache::instance()->init("./be/test/runtime/test_data/user_function_cache/normal"); CastFunctions::init(); + DecimalOperators::init(); } protected: @@ -523,6 +525,183 @@ TEST_F(OrcScannerTest, normal2) { scanner.close(); } +TEST_F(OrcScannerTest, normal3) { + TBrokerScanRangeParams params; + TTypeDesc varchar_type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::VARCHAR); + scalar_type.__set_len(65535); + node.__set_scalar_type(scalar_type); + varchar_type.types.push_back(node); + } + + TTypeDesc decimal_type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::DECIMAL); + node.__set_scalar_type(scalar_type); + decimal_type.types.push_back(node); + } + + TTypeDesc datetime_type; + { + TTypeNode node; + node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::DATETIME); + node.__set_scalar_type(scalar_type); + datetime_type.types.push_back(node); + } + + { + for (int i = 0; i < 5; ++i) { + TExprNode cast_expr; + cast_expr.node_type = TExprNodeType::CAST_EXPR; + cast_expr.type = decimal_type; + cast_expr.__set_opcode(TExprOpcode::CAST); + cast_expr.__set_num_children(1); + cast_expr.__set_output_scale(-1); + cast_expr.__isset.fn = true; + cast_expr.fn.name.function_name = "casttodecimal"; + cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; + cast_expr.fn.arg_types.push_back(varchar_type); + cast_expr.fn.ret_type = decimal_type; + cast_expr.fn.has_var_args = false; + cast_expr.fn.__set_signature("cast_to_decimal_val(VARCHAR(*))"); + cast_expr.fn.__isset.scalar_fn = true; + cast_expr.fn.scalar_fn.symbol = "doris::DecimalOperators::cast_to_decimal_val"; + + TExprNode slot_ref; + slot_ref.node_type = TExprNodeType::SLOT_REF; + slot_ref.type = varchar_type; + slot_ref.num_children = 0; + slot_ref.__isset.slot_ref = true; + slot_ref.slot_ref.slot_id = i; + slot_ref.slot_ref.tuple_id = 0; + + TExpr expr; + expr.nodes.push_back(cast_expr); + expr.nodes.push_back(slot_ref); + + params.expr_of_dest_slot.emplace(6 + i, expr); + params.src_slot_ids.push_back(i); + } + + { + TExprNode cast_expr; + cast_expr.node_type = TExprNodeType::CAST_EXPR; + cast_expr.type = datetime_type; + cast_expr.__set_opcode(TExprOpcode::CAST); + cast_expr.__set_num_children(1); + cast_expr.__set_output_scale(-1); + cast_expr.__isset.fn = true; + cast_expr.fn.name.function_name = "casttodatetime"; + cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; + cast_expr.fn.arg_types.push_back(varchar_type); + cast_expr.fn.ret_type = datetime_type; + cast_expr.fn.has_var_args = false; + cast_expr.fn.__set_signature("cast_to_datetime_val(VARCHAR(*))"); + cast_expr.fn.__isset.scalar_fn = true; + cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_datetime_val"; + + TExprNode slot_ref; + slot_ref.node_type = TExprNodeType::SLOT_REF; + slot_ref.type = varchar_type; + slot_ref.num_children = 0; + slot_ref.__isset.slot_ref = true; + slot_ref.slot_ref.slot_id = 5; + slot_ref.slot_ref.tuple_id = 0; + + TExpr expr; + expr.nodes.push_back(cast_expr); + expr.nodes.push_back(slot_ref); + + params.expr_of_dest_slot.emplace(11, expr); + params.src_slot_ids.push_back(5); + } + + } + params.__set_src_tuple_id(0); + params.__set_dest_tuple_id(1); + + //init_desc_table + TDescriptorTable t_desc_table; + + // table descriptors + TTableDescriptor t_table_desc; + + t_table_desc.id = 0; + t_table_desc.tableType = TTableType::BROKER_TABLE; + t_table_desc.numCols = 0; + t_table_desc.numClusteringCols = 0; + t_desc_table.tableDescriptors.push_back(t_table_desc); + t_desc_table.__isset.tableDescriptors = true; + + TDescriptorTableBuilder dtb; + TTupleDescriptorBuilder src_tuple_builder; + src_tuple_builder.add_slot( + TSlotDescriptorBuilder().string_type(65535).nullable(true).column_name("col1").column_pos(1).build()); + src_tuple_builder.add_slot( + TSlotDescriptorBuilder().string_type(65535).nullable(true).column_name("col2").column_pos(2).build()); + src_tuple_builder.add_slot( + TSlotDescriptorBuilder().string_type(65535).nullable(true).column_name("col3").column_pos(3).build()); + src_tuple_builder.add_slot( + TSlotDescriptorBuilder().string_type(65535).nullable(true).column_name("col4").column_pos(4).build()); + src_tuple_builder.add_slot( + TSlotDescriptorBuilder().string_type(65535).nullable(true).column_name("col5").column_pos(5).build()); + src_tuple_builder.add_slot( + TSlotDescriptorBuilder().string_type(65535).nullable(true).column_name("col6").column_pos(6).build()); + src_tuple_builder.build(&dtb); + + TTupleDescriptorBuilder dest_tuple_builder; + dest_tuple_builder.add_slot( + TSlotDescriptorBuilder().decimal_type(10,9).column_name("col1").column_pos(1).build()); + dest_tuple_builder.add_slot( + TSlotDescriptorBuilder().decimal_type(7,5).column_name("col2").column_pos(2).build()); + dest_tuple_builder.add_slot( + TSlotDescriptorBuilder().decimal_type(10,9).column_name("col3").column_pos(3).build()); + dest_tuple_builder.add_slot( + TSlotDescriptorBuilder().decimal_type(10,5).column_name("col4").column_pos(4).build()); + dest_tuple_builder.add_slot( + TSlotDescriptorBuilder().decimal_type(10,5).column_name("col5").column_pos(5).build()); + dest_tuple_builder.add_slot( + TSlotDescriptorBuilder().type(TYPE_DATETIME).column_name("col5").column_pos(6).build()); + + dest_tuple_builder.build(&dtb); + t_desc_table = dtb.desc_tbl(); + + DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); + _runtime_state.set_desc_tbl(_desc_tbl); + + std::vector ranges; + TBrokerRangeDesc rangeDesc; + rangeDesc.start_offset = 0; + rangeDesc.size = -1; + rangeDesc.format_type = TFileFormatType::FORMAT_ORC; + rangeDesc.splittable = false; + + rangeDesc.path = "./be/test/exec/test_data/orc_scanner/decimal_and_timestamp.orc"; + rangeDesc.file_type = TFileType::FILE_LOCAL; + ranges.push_back(rangeDesc); + + ORCScanner scanner(&_runtime_state, _profile, params, ranges, _addresses, &_counter); + ASSERT_TRUE(scanner.open().ok()); + MemTracker tracker; + MemPool tuple_pool(&tracker); + + Tuple *tuple = (Tuple *) tuple_pool.allocate(_desc_tbl->get_tuple_descriptor(1)->byte_size()); + bool eof = false; + ASSERT_TRUE(scanner.get_next(tuple, &tuple_pool, &eof).ok()); + ASSERT_EQ(Tuple::to_string(tuple, *_desc_tbl->get_tuple_descriptor(1)), + "(1.123456789 1.12 1.1234500000 1.12345 1.12345 2020-01-14 22:12:19)"); + scanner.close(); +} + } // end namespace doris int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/be/test/exec/test_data/orc_scanner/decimal_and_timestamp.orc b/be/test/exec/test_data/orc_scanner/decimal_and_timestamp.orc new file mode 100644 index 0000000000000000000000000000000000000000..cccedf3e71961e8e1f9318046b9ea0cea78e6165 GIT binary patch literal 693 zcmeYdau#G@;9?VE;b074aARP2Z1gxOm4Q()md{POR4>oj)>%1OIM}$D+gmZEPsB*Q zfsw(Cf#Gq`6R;{yJ~w4WJu#3fMFUM%HD(D0I|hcwmmYy-UHROUGxf?qvdO~1iW1IT z$2bxgsG`Dzf#LC_$4SWy%!-11Zu)ke{FkrE{}x)yu#icDft!J0)txQtIoKH({1}8- z85kZgvNJHaF*vX@Fl@Tb3uZ6C#pY#TC}R-%RMW=Jz~I2ZaDss$_lzM|g8@%#;%u{p zVJwAQO?-Uo<{dPyX!5&u^yu$q!?co$nm+=QG=jnlC&$U^YkgQ!HScP|B2Swxzjuo} zlrotYn?+QM`l{FO699~eGCkbcO5-(^w2?x z0|%C#KbW!P+|k?>=@}_GNm*%miJC2ECOSQ1F>v`%=*eixQMj?8rS-rPkr^zf3X3(F zpDx*XV_{TF0*|Hy^NNo4uC5~%5_^BB*!e`)zwzF})WX#DnVBJuk>PQI*o%Z|kDeun z?Kq