449 lines
15 KiB
C++
449 lines
15 KiB
C++
// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
|
|
|
|
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include "exec/csv_scan_node.h"

#include <cstdlib>
#include <vector>

#include <boost/scoped_ptr.hpp>
#include <gtest/gtest.h>

#include "gen_cpp/PlanNodes_types.h"
#include "gen_cpp/Types_types.h"
#include "runtime/row_batch.h"
#include "runtime/runtime_state.h"
#include "runtime/tuple_row.h"
#include "util/debug_util.h"
#include "util/disk_info.h"
#include "util/cpu_info.h"
#include "util/logging.h"
|
|
|
|
namespace palo {
|
|
|
|
class CsvScanNodeTest : public testing::Test {
|
|
public:
|
|
CsvScanNodeTest(){}
|
|
~CsvScanNodeTest(){}
|
|
|
|
protected:
|
|
virtual void SetUp() {
|
|
config::periodic_counter_update_period_ms = 500;
|
|
config::storage_root_path = "./data";
|
|
_env.reset(new ExecEnv());
|
|
|
|
system("mkdir -p ./test_run/output/");
|
|
system("pwd");
|
|
system("cp -r ./be/test/exec/test_data/csv_scan_node ./test_run/.");
|
|
init();
|
|
}
|
|
virtual void TearDown() {
|
|
_obj_pool.clear();
|
|
_env.reset();
|
|
// system("rm -rf ./test_run");
|
|
}
|
|
|
|
void init();
|
|
void init_desc_tbl();
|
|
void init_runtime_state();
|
|
|
|
private:
|
|
ObjectPool _obj_pool;
|
|
TDescriptorTable _t_desc_table;
|
|
DescriptorTbl* _desc_tbl;
|
|
TPlanNode _tnode;
|
|
boost::scoped_ptr<ExecEnv> _env;
|
|
RuntimeState* _state;
|
|
}; // end class CsvScanNodeTest
|
|
|
|
void CsvScanNodeTest::init() {
|
|
_env->init_for_tests();
|
|
init_desc_tbl();
|
|
init_runtime_state();
|
|
}
|
|
|
|
void CsvScanNodeTest::init_runtime_state() {
|
|
_state = _obj_pool.add(
|
|
new RuntimeState(TUniqueId(), TQueryOptions(), "", _env.get()));
|
|
_state->set_desc_tbl(_desc_tbl);
|
|
_state->_load_dir = "./test_run/output/";
|
|
_state->init_mem_trackers(TUniqueId());
|
|
}
|
|
|
|
void CsvScanNodeTest::init_desc_tbl() {
|
|
// TTableDescriptor
|
|
TTableDescriptor t_table_desc;
|
|
t_table_desc.id = 0;
|
|
t_table_desc.tableType = TTableType::OLAP_TABLE;
|
|
t_table_desc.numCols = 0;
|
|
t_table_desc.numClusteringCols = 0;
|
|
t_table_desc.olapTable.tableName = "test";
|
|
t_table_desc.tableName = "test_table_name";
|
|
t_table_desc.dbName = "test_db_name";
|
|
t_table_desc.__isset.olapTable = true;
|
|
|
|
_t_desc_table.tableDescriptors.push_back(t_table_desc);
|
|
_t_desc_table.__isset.tableDescriptors = true;
|
|
|
|
// TSlotDescriptor
|
|
std::vector<TSlotDescriptor> slot_descs;
|
|
int offset = 1;
|
|
int i = 0;
|
|
// int_column
|
|
{
|
|
TSlotDescriptor t_slot_desc;
|
|
t_slot_desc.__set_id(i);
|
|
t_slot_desc.__set_slotType(gen_type_desc(TPrimitiveType::INT));
|
|
t_slot_desc.__set_columnPos(i);
|
|
t_slot_desc.__set_byteOffset(offset);
|
|
t_slot_desc.__set_nullIndicatorByte(0);
|
|
t_slot_desc.__set_nullIndicatorBit(-1);
|
|
t_slot_desc.__set_slotIdx(i);
|
|
t_slot_desc.__set_isMaterialized(true);
|
|
t_slot_desc.__set_colName("int_column");
|
|
|
|
slot_descs.push_back(t_slot_desc);
|
|
offset += sizeof(int32_t);
|
|
}
|
|
++i;
|
|
// decimal_column
|
|
{
|
|
TSlotDescriptor t_slot_desc;
|
|
t_slot_desc.__set_id(i);
|
|
TTypeDesc ttype = gen_type_desc(TPrimitiveType::DECIMAL);
|
|
ttype.types[0].scalar_type.__set_precision(10);
|
|
ttype.types[0].scalar_type.__set_scale(5);
|
|
t_slot_desc.__set_slotType(ttype);
|
|
t_slot_desc.__set_columnPos(i);
|
|
t_slot_desc.__set_byteOffset(offset);
|
|
t_slot_desc.__set_nullIndicatorByte(0);
|
|
t_slot_desc.__set_nullIndicatorBit(-1);
|
|
t_slot_desc.__set_slotIdx(i);
|
|
t_slot_desc.__set_isMaterialized(true);
|
|
t_slot_desc.__set_colName("decimal_column");
|
|
|
|
slot_descs.push_back(t_slot_desc);
|
|
offset += sizeof(DecimalValue);
|
|
}
|
|
++i;
|
|
// date_column
|
|
{
|
|
TSlotDescriptor t_slot_desc;
|
|
t_slot_desc.__set_id(i);
|
|
t_slot_desc.__set_slotType(gen_type_desc(TPrimitiveType::DATE));
|
|
t_slot_desc.__set_columnPos(i);
|
|
t_slot_desc.__set_byteOffset(offset);
|
|
t_slot_desc.__set_nullIndicatorByte(0);
|
|
t_slot_desc.__set_nullIndicatorBit(-1);
|
|
t_slot_desc.__set_slotIdx(i);
|
|
t_slot_desc.__set_isMaterialized(true);
|
|
t_slot_desc.__set_colName("date_column");
|
|
|
|
slot_descs.push_back(t_slot_desc);
|
|
offset += sizeof(DateTimeValue);
|
|
}
|
|
++i;
|
|
// fix_len_string_column
|
|
{
|
|
TSlotDescriptor t_slot_desc;
|
|
t_slot_desc.__set_id(i);
|
|
TTypeDesc ttype = gen_type_desc(TPrimitiveType::CHAR);
|
|
ttype.types[0].scalar_type.__set_len(5);
|
|
t_slot_desc.__set_slotType(ttype);
|
|
t_slot_desc.__set_columnPos(i);
|
|
t_slot_desc.__set_byteOffset(offset);
|
|
t_slot_desc.__set_nullIndicatorByte(0);
|
|
t_slot_desc.__set_nullIndicatorBit(-1);
|
|
t_slot_desc.__set_slotIdx(i);
|
|
t_slot_desc.__set_isMaterialized(true);
|
|
t_slot_desc.__set_colName("fix_len_string_column");
|
|
|
|
slot_descs.push_back(t_slot_desc);
|
|
offset += sizeof(StringValue);
|
|
}
|
|
_t_desc_table.__set_slotDescriptors(slot_descs);
|
|
|
|
// TTupleDescriptor
|
|
TTupleDescriptor t_tuple_desc;
|
|
t_tuple_desc.id = 0;
|
|
t_tuple_desc.byteSize = offset;
|
|
t_tuple_desc.numNullBytes = 1;
|
|
t_tuple_desc.tableId = 0;
|
|
t_tuple_desc.__isset.tableId = true;
|
|
_t_desc_table.tupleDescriptors.push_back(t_tuple_desc);
|
|
|
|
DescriptorTbl::create(&_obj_pool, _t_desc_table, &_desc_tbl);
|
|
|
|
// node
|
|
_tnode.node_id = 0;
|
|
_tnode.node_type = TPlanNodeType::CSV_SCAN_NODE;
|
|
_tnode.num_children = 0;
|
|
_tnode.limit = -1;
|
|
_tnode.row_tuples.push_back(0);
|
|
_tnode.nullable_tuples.push_back(false);
|
|
_tnode.csv_scan_node.tuple_id = 0;
|
|
|
|
_tnode.csv_scan_node.__set_column_separator(",");
|
|
_tnode.csv_scan_node.__set_line_delimiter("\n");
|
|
|
|
// column_type_mapping
|
|
std::map<std::string, TColumnType> column_type_map;
|
|
{
|
|
TColumnType column_type;
|
|
column_type.__set_type(TPrimitiveType::INT);
|
|
column_type_map["int_column"] = column_type;
|
|
}
|
|
{
|
|
TColumnType column_type;
|
|
column_type.__set_type(TPrimitiveType::DECIMAL);
|
|
column_type.__set_precision(10);
|
|
column_type.__set_scale(5);
|
|
column_type_map["decimal_column"] = column_type;
|
|
}
|
|
{
|
|
TColumnType column_type;
|
|
column_type.__set_type(TPrimitiveType::DATE);
|
|
column_type_map["date_column"] = column_type;
|
|
}
|
|
{
|
|
TColumnType column_type;
|
|
column_type.__set_type(TPrimitiveType::BIGINT);
|
|
column_type.__set_len(5);
|
|
column_type_map["fix_len_string_column"] = column_type;
|
|
}
|
|
_tnode.csv_scan_node.__set_column_type_mapping(column_type_map);
|
|
|
|
std::vector<std::string> columns;
|
|
columns.push_back("int_column");
|
|
columns.push_back("date_column");
|
|
columns.push_back("decimal_column");
|
|
columns.push_back("fix_len_string_column");
|
|
_tnode.csv_scan_node.__set_columns(columns);
|
|
|
|
_tnode.csv_scan_node.__isset.unspecified_columns = true;
|
|
_tnode.csv_scan_node.__isset.default_values = true;
|
|
_tnode.csv_scan_node.max_filter_ratio = 0.5;
|
|
_tnode.__isset.csv_scan_node = true;
|
|
|
|
}
|
|
|
|
// Scans the well-formed "normal_use" file. Every batch returned by get_next()
// is expected to hold exactly 6 rows, and the first row must print as
// "[(1 -12345.67891 2015-04-20 abc\0\0)]" (35 bytes, embedded NULs included).
TEST_F(CsvScanNodeTest, NormalUse) {
    std::vector<std::string> file_paths;
    file_paths.push_back("./test_run/csv_scan_node/normal_use");
    _tnode.csv_scan_node.__set_file_paths(file_paths);

    CsvScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl);
    Status status = scan_node.prepare(_state);
    ASSERT_TRUE(status.ok());

    status = scan_node.open(_state);
    ASSERT_TRUE(status.ok());

    // Stack-allocated instead of `new MemTracker(-1)`, which was never freed.
    // Declared before row_batch so it is destroyed after the batch using it.
    // NOTE(review): relies on RowBatch not taking ownership of its tracker.
    MemTracker mem_tracker(-1);
    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), &mem_tracker);
    bool eos = false;

    while (!eos) {
        status = scan_node.get_next(_state, &row_batch, &eos);
        ASSERT_TRUE(status.ok());
        // int num = std::min(row_batch.num_rows(), 10);
        int num = row_batch.num_rows();
        std::cout << "num: " << num << std::endl;
        ASSERT_EQ(num, 6);

        for (int i = 0; i < num; ++i) {
            TupleRow* row = row_batch.get_row(i);
            // LOG(WARNING) << "input row[" << i << "]: " << print_row(row, scan_node._row_descriptor);
            std::cout << "input row: " << print_row(row, scan_node._row_descriptor) << std::endl;

            if (i == 0) {
                ASSERT_EQ(
                        std::string("[(1 -12345.67891 2015-04-20 abc\0\0)]", 35),
                        print_row(row, scan_node._row_descriptor));
            }
        }
    }

    ASSERT_TRUE(scan_node.close(_state).ok());
}
|
|
|
|
// Scans the "continuous_delim" file. Every batch returned by get_next() is
// expected to hold exactly 1 row, whose string column prints as five NUL
// bytes: "[(1 -12345.67891 2015-04-20 \0\0\0\0\0)]" (35 bytes).
TEST_F(CsvScanNodeTest, continuousDelim) {
    std::vector<std::string> file_paths;
    file_paths.push_back("./test_run/csv_scan_node/continuous_delim");
    _tnode.csv_scan_node.__set_file_paths(file_paths);

    CsvScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl);
    Status status = scan_node.prepare(_state);
    ASSERT_TRUE(status.ok());

    status = scan_node.open(_state);
    ASSERT_TRUE(status.ok());

    // Stack-allocated instead of `new MemTracker(-1)`, which was never freed.
    // Declared before row_batch so it is destroyed after the batch using it.
    // NOTE(review): relies on RowBatch not taking ownership of its tracker.
    MemTracker mem_tracker(-1);
    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), &mem_tracker);
    bool eos = false;

    while (!eos) {
        status = scan_node.get_next(_state, &row_batch, &eos);
        ASSERT_TRUE(status.ok());
        // int num = std::min(row_batch.num_rows(), 10);
        int num = row_batch.num_rows();
        std::cout << "num: " << num << std::endl;
        ASSERT_EQ(num, 1);

        for (int i = 0; i < num; ++i) {
            TupleRow* row = row_batch.get_row(i);
            // LOG(WARNING) << "input row[" << i << "]: " << print_row(row, scan_node._row_descriptor);
            std::cout << "input row: " << print_row(row, scan_node._row_descriptor) << std::endl;

            if (i == 0) {
                ASSERT_EQ(
                        std::string("[(1 -12345.67891 2015-04-20 \0\0\0\0\0)]", 35),
                        print_row(row, scan_node._row_descriptor));
            }
        }
    }

    ASSERT_TRUE(scan_node.close(_state).ok());
}
|
|
|
|
// Scans the "wrong_decimal_format" file. prepare/open/get_next must all
// succeed while yielding zero rows, and close() must then report an error.
TEST_F(CsvScanNodeTest, wrong_decimal_format_test) {
    std::vector<std::string> file_paths;
    file_paths.push_back("./test_run/csv_scan_node/wrong_decimal_format");
    _tnode.csv_scan_node.__set_file_paths(file_paths);

    CsvScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl);
    Status status = scan_node.prepare(_state);
    ASSERT_TRUE(status.ok());

    status = scan_node.open(_state);
    ASSERT_TRUE(status.ok());

    // Stack-allocated instead of `new MemTracker(-1)`, which was never freed.
    // Declared before row_batch so it is destroyed after the batch using it.
    // NOTE(review): relies on RowBatch not taking ownership of its tracker.
    MemTracker mem_tracker(-1);
    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), &mem_tracker);
    bool eos = false;

    while (!eos) {
        status = scan_node.get_next(_state, &row_batch, &eos);
        ASSERT_TRUE(status.ok());
        // int num = std::min(row_batch.num_rows(), 10);
        int num = row_batch.num_rows();
        std::cout << "num: " << num << std::endl;
        ASSERT_EQ(0, num);
    }

    // Failed because reach max_filter_ratio
    ASSERT_FALSE(scan_node.close(_state).ok());
}
|
|
|
|
// Scans the "fill_string_len" file. The first row must print as
// "[(1 12345.67891 2015-04-20 ab\0\0\0)]" (34 bytes) and its string slot must
// report a length of 5, the declared CHAR length.
TEST_F(CsvScanNodeTest, fill_fix_len_stringi_test) {
    std::vector<std::string> file_paths;
    file_paths.push_back("./test_run/csv_scan_node/fill_string_len");
    _tnode.csv_scan_node.__set_file_paths(file_paths);

    CsvScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl);
    Status status = scan_node.prepare(_state);
    ASSERT_TRUE(status.ok());

    status = scan_node.open(_state);
    ASSERT_TRUE(status.ok());

    // Stack-allocated instead of `new MemTracker(-1)`, which was never freed.
    // Declared before row_batch so it is destroyed after the batch using it.
    // NOTE(review): relies on RowBatch not taking ownership of its tracker.
    MemTracker mem_tracker(-1);
    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), &mem_tracker);
    bool eos = false;

    while (!eos) {
        status = scan_node.get_next(_state, &row_batch, &eos);
        ASSERT_TRUE(status.ok());
        // int num = std::min(row_batch.num_rows(), 10);
        int num = row_batch.num_rows();
        std::cout << "num: " << num << std::endl;
        ASSERT_TRUE(num > 0);

        // 1,2015-04-20,12345.67891,abcdefg
        for (int i = 0; i < num; ++i) {
            TupleRow* row = row_batch.get_row(i);
            LOG(WARNING) << "input row[" << i << "]: " << print_row(row, scan_node._row_descriptor);
            std::cout << "input row: " << print_row(row, scan_node._row_descriptor) << std::endl;

            if (i == 0) {
                ASSERT_EQ(std::string(
                                  "[(1 12345.67891 2015-04-20 ab\0\0\0)]", 34),
                          print_row(row, scan_node._row_descriptor));
                Tuple* tuple = row->get_tuple(0);
                // Slot 3 is fix_len_string_column.
                StringValue* str_slot
                        = tuple->get_string_slot(_t_desc_table.slotDescriptors[3].byteOffset);
                std::cout << "str_slot len: " << str_slot->len << std::endl;
                ASSERT_EQ(5, str_slot->len);
            }
        }
    }

    ASSERT_TRUE(scan_node.close(_state).ok());
}
|
|
|
|
// Scans the "wrong_fix_len_string" file. prepare/open/get_next must all
// succeed while yielding zero rows, and close() must then report an error.
TEST_F(CsvScanNodeTest, wrong_fix_len_string_format_test) {
    std::vector<std::string> file_paths;
    file_paths.push_back("./test_run/csv_scan_node/wrong_fix_len_string");
    _tnode.csv_scan_node.__set_file_paths(file_paths);

    CsvScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl);
    Status status = scan_node.prepare(_state);
    ASSERT_TRUE(status.ok());

    status = scan_node.open(_state);
    ASSERT_TRUE(status.ok());

    // Stack-allocated instead of `new MemTracker(-1)`, which was never freed.
    // Declared before row_batch so it is destroyed after the batch using it.
    // NOTE(review): relies on RowBatch not taking ownership of its tracker.
    MemTracker mem_tracker(-1);
    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), &mem_tracker);
    bool eos = false;

    while (!eos) {
        status = scan_node.get_next(_state, &row_batch, &eos);
        ASSERT_TRUE(status.ok());
        // int num = std::min(row_batch.num_rows(), 10);
        int num = row_batch.num_rows();
        std::cout << "num: " << num << std::endl;
        ASSERT_EQ(0, num);
    }

    // Failed because reach max_filter_ratio
    ASSERT_FALSE(scan_node.close(_state).ok());
}
|
|
|
|
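// Test cases still to be added:
// 1. String import
// 2. Columns with default values that are left unspecified
// 3. Columns present in the file but not in the table (the load command skips them)
// 4. max_filter_ratio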
|
|
|
|
} // end namespace palo
|
|
|
|
int main(int argc, char** argv) {
|
|
// std::string conffile = std::string(getenv("PALO_HOME")) + "/conf/be.conf";
|
|
// if (!palo::config::init(conffile.c_str(), false)) {
|
|
// fprintf(stderr, "error read config file. \n");
|
|
// return -1;
|
|
// }
|
|
palo::config::read_size = 8388608;
|
|
palo::config::min_buffer_size = 1024;
|
|
|
|
palo::init_glog("be-test");
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
palo::CpuInfo::init();
|
|
palo::DiskInfo::init();
|
|
|
|
return RUN_ALL_TESTS();
|
|
}
|