diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 7619191a2e..55e9b1af29 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -305,6 +305,19 @@ Status OrcReader::get_parsed_schema(std::vector* col_names, return Status::OK(); } +Status OrcReader::get_schema_col_name_attribute(std::vector* col_names, + std::vector* col_attributes, + std::string attribute) { + RETURN_IF_ERROR(_create_file_reader()); + auto& root_type = _is_acid ? _remove_acid(_reader->getType()) : _reader->getType(); + for (int i = 0; i < root_type.getSubtypeCount(); ++i) { + col_names->emplace_back(get_field_name_lower_case(&root_type, i)); + col_attributes->emplace_back( + std::stol(root_type.getSubtype(i)->getAttributeValue(attribute))); + } + return Status::OK(); +} + Status OrcReader::_init_read_columns() { auto& root_type = _reader->getType(); std::vector orc_cols; @@ -722,7 +735,11 @@ Status OrcReader::set_fill_columns( std::unordered_map> predicate_columns; std::function visit_slot = [&](VExpr* expr) { if (VSlotRef* slot_ref = typeid_cast(expr)) { - auto& expr_name = slot_ref->expr_name(); + auto expr_name = slot_ref->expr_name(); + auto iter = _table_col_to_file_col.find(expr_name); + if (iter != _table_col_to_file_col.end()) { + expr_name = iter->second; + } predicate_columns.emplace(expr_name, std::make_pair(slot_ref->column_id(), slot_ref->slot_id())); if (slot_ref->column_id() == 0) { @@ -1587,6 +1604,7 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { *read_rows = 0; return Status::OK(); } + _execute_filter_position_delete_rowids(*_filter); RETURN_IF_CATCH_EXCEPTION(Block::filter_block_internal(block, columns_to_filter, *_filter)); if (!_not_single_slot_filter_conjuncts.empty()) { @@ -1694,6 +1712,10 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { for (auto& conjunct : _non_dict_filter_conjuncts) { filter_conjuncts.emplace_back(conjunct); } + //include missing_columns != missing_columns ; missing_column is null; missing_column != file_columns etc... + for (auto& [missing_col, conjunct] : _lazy_read_ctx.predicate_missing_columns) { + filter_conjuncts.emplace_back(conjunct); + } std::vector filters; if (_delete_rows_filter_ptr) { filters.push_back(_delete_rows_filter_ptr.get()); @@ -1710,6 +1732,7 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { static_cast(_convert_dict_cols_to_string_cols(block, &batch_vec)); return Status::OK(); } + _execute_filter_position_delete_rowids(result_filter); RETURN_IF_CATCH_EXCEPTION( Block::filter_block_internal(block, columns_to_filter, result_filter)); if (!_not_single_slot_filter_conjuncts.empty()) { @@ -1841,6 +1864,10 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s for (auto& conjunct : _non_dict_filter_conjuncts) { filter_conjuncts.emplace_back(conjunct); } + //include missing_columns != missing_columns ; missing_column is null; missing_column != file_columns etc... + for (auto& [missing_col, conjunct] : _lazy_read_ctx.predicate_missing_columns) { + filter_conjuncts.emplace_back(conjunct); + } std::vector filters; if (_delete_rows_filter_ptr) { filters.push_back(_delete_rows_filter_ptr.get()); @@ -1919,6 +1946,9 @@ bool OrcReader::_can_filter_by_dict(int slot_id) { break; } } + if (slot == nullptr) { + return false; + } if (!slot->type().is_string_type()) { return false; } @@ -2369,5 +2399,19 @@ void ORCFileInputStream::_collect_profile_before_close() { _file_reader->collect_profile_before_close(); } } +void OrcReader::_execute_filter_position_delete_rowids(IColumn::Filter& filter) { + if (_position_delete_ordered_rowids == nullptr) { + return; + } + auto start = _row_reader->getRowNumber(); + auto nums = _batch->numElements; + auto l = std::lower_bound(_position_delete_ordered_rowids->begin(), + _position_delete_ordered_rowids->end(), start); + auto r = std::upper_bound(_position_delete_ordered_rowids->begin(), + _position_delete_ordered_rowids->end(), start + nums - 1); + for (; l < r; l++) { + filter[*l - start] = 0; + } +} } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index 0b07f147c4..f5bb7004ca 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -177,6 +177,19 @@ public: Status get_parsed_schema(std::vector* col_names, std::vector* col_types) override; + Status get_schema_col_name_attribute(std::vector* col_names, + std::vector* col_attributes, + std::string attribute); + void set_table_col_to_file_col( + std::unordered_map table_col_to_file_col) { + _table_col_to_file_col = table_col_to_file_col; + } + + void set_position_delete_rowids(vector* delete_rows) { + _position_delete_ordered_rowids = delete_rows; + } + void _execute_filter_position_delete_rowids(IColumn::Filter& filter); + void set_delete_rows(const TransactionalHiveReader::AcidRowIDSet* delete_rows) { _delete_rows = delete_rows; } @@ -246,6 +259,8 @@ private: OrcReader* _orc_reader = nullptr; }; + //class RowFilter : public orc::RowReader + // Create inner orc file, // return EOF if file is empty // return EROOR if encounter error. @@ -586,6 +601,10 @@ private: // resolve schema change std::unordered_map> _converters; + //for iceberg table , when table column name != file column name + std::unordered_map _table_col_to_file_col; + //support iceberg position delete . + std::vector* _position_delete_ordered_rowids = nullptr; }; class ORCFileInputStream : public orc::InputStream, public ProfileCollector { diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp b/be/src/vec/exec/format/table/iceberg_reader.cpp index 8c05b8c2a0..7e5a5bf6db 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.cpp +++ b/be/src/vec/exec/format/table/iceberg_reader.cpp @@ -49,12 +49,10 @@ #include "vec/common/string_ref.h" #include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" -#include "vec/data_types/data_type.h" #include "vec/data_types/data_type_factory.hpp" #include "vec/exec/format/format_common.h" #include "vec/exec/format/generic_reader.h" -#include "vec/exec/format/parquet/parquet_common.h" -#include "vec/exec/format/parquet/vparquet_reader.h" +#include "vec/exec/format/orc/vorc_reader.h" #include "vec/exec/format/table/table_format_reader.h" namespace cctz { @@ -74,17 +72,6 @@ class VExprContext; } // namespace doris namespace doris::vectorized { - -using DeleteRows = std::vector; -using DeleteFile = phmap::parallel_flat_hash_map< - std::string, std::unique_ptr, std::hash, - std::equal_to, - std::allocator>>, 8, std::mutex>; - -const int64_t MIN_SUPPORT_DELETE_FILES_VERSION = 2; -const std::string ICEBERG_ROW_POS = "pos"; -const std::string ICEBERG_FILE_PATH = "file_path"; - IcebergTableReader::IcebergTableReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, RuntimeState* state, const TFileScanRangeParams& params, @@ -110,33 +97,6 @@ IcebergTableReader::IcebergTableReader(std::unique_ptr file_forma ADD_CHILD_TIMER(_profile, "DeleteRowsSortTime", iceberg_profile); } -Status IcebergTableReader::init_reader( - const std::vector& file_col_names, - const std::unordered_map& col_id_name_map, - std::unordered_map* colname_to_value_range, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts) { - ParquetReader* parquet_reader = static_cast(_file_format_reader.get()); - _col_id_name_map = col_id_name_map; - _file_col_names = file_col_names; - _colname_to_value_range = colname_to_value_range; - auto parquet_meta_kv = parquet_reader->get_metadata_key_values(); - static_cast(_gen_col_name_maps(parquet_meta_kv)); - _gen_file_col_names(); - _gen_new_colname_to_value_range(); - parquet_reader->set_table_to_file_col_map(_table_col_to_file_col); - parquet_reader->iceberg_sanitize(_all_required_col_names); - Status status = parquet_reader->init_reader( - _all_required_col_names, _not_in_file_col_names, &_new_colname_to_value_range, - conjuncts, tuple_descriptor, row_descriptor, colname_to_slot_id, - not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); - - return status; -} - Status IcebergTableReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { // already get rows from be if (_push_down_agg_type == TPushAggOp::type::COUNT && _remaining_push_down_count > 0) { @@ -220,18 +180,17 @@ Status IcebergTableReader::init_row_filters(const TFileRangeDesc& range) { } if (delete_file_type == POSITION_DELETE) { - RETURN_IF_ERROR(_position_delete(files)); + RETURN_IF_ERROR(_position_delete_base(files)); + } else if (delete_file_type == EQUALITY_DELETE) { + // todo: equality delete + // If it is a count operation and it has equality delete file kind, + // the push down operation of the count for this split needs to be canceled. + return Status::NotSupported("NOT SUPPORT EQUALITY_DELETE!"); } - - // todo: equality delete - // If it is a count operation and it has equality delete file kind, - // the push down operation of the count for this split needs to be canceled. - COUNTER_UPDATE(_iceberg_profile.num_delete_files, files.size()); return Status::OK(); } - -Status IcebergTableReader::_position_delete( +Status IcebergTableReader::_position_delete_base( const std::vector& delete_files) { std::string data_file_path = _range.path; // the path in _range is remove the namenode prefix, @@ -243,133 +202,31 @@ Status IcebergTableReader::_position_delete( } } - // position delete - ParquetReader* parquet_reader = (ParquetReader*)(_file_format_reader.get()); - RowRange whole_range = parquet_reader->get_whole_range(); - bool init_schema = false; - std::vector delete_file_col_names; - std::vector delete_file_col_types; std::vector delete_rows_array; int64_t num_delete_rows = 0; std::vector erase_data; for (auto& delete_file : delete_files) { - if (whole_range.last_row <= delete_file.position_lower_bound || - whole_range.first_row > delete_file.position_upper_bound) { - continue; - } - SCOPED_TIMER(_iceberg_profile.delete_files_read_time); Status create_status = Status::OK(); - DeleteFile* delete_file_cache = _kv_cache->get< - DeleteFile>(_delet_file_cache_key(delete_file.path), [&]() -> DeleteFile* { - TFileRangeDesc delete_range; - // must use __set() method to make sure __isset is true - delete_range.__set_fs_name(_range.fs_name); - delete_range.path = delete_file.path; - delete_range.start_offset = 0; - delete_range.size = -1; - delete_range.file_size = -1; - ParquetReader delete_reader(_profile, _params, delete_range, 102400, - const_cast(&_state->timezone_obj()), - _io_ctx, _state); - if (!init_schema) { - static_cast(delete_reader.get_parsed_schema(&delete_file_col_names, - &delete_file_col_types)); - init_schema = true; - } - create_status = delete_reader.open(); - if (!create_status.ok()) { - return nullptr; - } - create_status = delete_reader.init_reader(delete_file_col_names, _not_in_file_col_names, - nullptr, {}, nullptr, nullptr, nullptr, - nullptr, nullptr, false); - if (!create_status.ok()) { - return nullptr; - } + auto* delete_file_cache = _kv_cache->get( + _delet_file_cache_key(delete_file.path), [&]() -> DeleteFile* { + auto* position_delete = new DeleteFile; + TFileRangeDesc delete_file_range; + // must use __set() method to make sure __isset is true + delete_file_range.__set_fs_name(_range.fs_name); + delete_file_range.path = delete_file.path; + delete_file_range.start_offset = 0; + delete_file_range.size = -1; + delete_file_range.file_size = -1; + //read position delete file base on delete_file_range , generate DeleteFile , add DeleteFile to kv_cache + create_status = _read_position_delete_file(&delete_file_range, position_delete); - std::unordered_map> - partition_columns; - std::unordered_map missing_columns; - static_cast(delete_reader.set_fill_columns(partition_columns, missing_columns)); + if (!create_status) { + return nullptr; + } - bool dictionary_coded = true; - const tparquet::FileMetaData* meta_data = delete_reader.get_meta_data(); - for (int i = 0; i < delete_file_col_names.size(); ++i) { - if (delete_file_col_names[i] == ICEBERG_FILE_PATH) { - for (int j = 0; j < meta_data->row_groups.size(); ++j) { - auto& column_chunk = meta_data->row_groups[j].columns[i]; - if (!(column_chunk.__isset.meta_data && - column_chunk.meta_data.__isset.dictionary_page_offset)) { - dictionary_coded = false; - break; - } - } - break; - } - } - - DeleteFile* position_delete = new DeleteFile; - bool eof = false; - while (!eof) { - Block block = Block(); - for (int i = 0; i < delete_file_col_names.size(); ++i) { - DataTypePtr data_type = DataTypeFactory::instance().create_data_type( - delete_file_col_types[i], false); - if (delete_file_col_names[i] == ICEBERG_FILE_PATH && dictionary_coded) { - // the dictionary data in ColumnDictI32 is referenced by StringValue, it does keep - // the dictionary data in its life circle, so the upper caller should keep the - // dictionary data alive after ColumnDictI32. - MutableColumnPtr dict_column = ColumnDictI32::create(); - block.insert(ColumnWithTypeAndName(std::move(dict_column), data_type, - delete_file_col_names[i])); - } else { - MutableColumnPtr data_column = data_type->create_column(); - block.insert(ColumnWithTypeAndName(std::move(data_column), data_type, - delete_file_col_names[i])); - } - } - eof = false; - size_t read_rows = 0; - create_status = delete_reader.get_next_block(&block, &read_rows, &eof); - if (!create_status.ok()) { - return nullptr; - } - if (read_rows > 0) { - ColumnPtr path_column = block.get_by_name(ICEBERG_FILE_PATH).column; - DCHECK_EQ(path_column->size(), read_rows); - ColumnPtr pos_column = block.get_by_name(ICEBERG_ROW_POS).column; - using ColumnType = typename PrimitiveTypeTraits::ColumnType; - const int64_t* src_data = - assert_cast(*pos_column).get_data().data(); - IcebergTableReader::PositionDeleteRange range; - if (dictionary_coded) { - range = _get_range(assert_cast(*path_column)); - } else { - range = _get_range(assert_cast(*path_column)); - } - for (int i = 0; i < range.range.size(); ++i) { - std::string key = range.data_file_path[i]; - auto iter = position_delete->find(key); - DeleteRows* delete_rows; - if (iter == position_delete->end()) { - delete_rows = new DeleteRows; - std::unique_ptr delete_rows_ptr(delete_rows); - (*position_delete)[key] = std::move(delete_rows_ptr); - } else { - delete_rows = iter->second.get(); - } - const int64_t* cpy_start = src_data + range.range[i].first; - const int64_t cpy_count = range.range[i].second - range.range[i].first; - int64_t origin_size = delete_rows->size(); - delete_rows->resize(origin_size + cpy_count); - int64_t* dest_position = &(*delete_rows)[origin_size]; - memcpy(dest_position, cpy_start, cpy_count * sizeof(int64_t)); - } - } - } - return position_delete; - }); + return position_delete; + }); if (create_status.is()) { continue; } else if (!create_status.ok()) { @@ -382,10 +239,7 @@ Status IcebergTableReader::_position_delete( if (row_ids->size() > 0) { delete_rows_array.emplace_back(row_ids); num_delete_rows += row_ids->size(); - if (row_ids->front() >= whole_range.first_row && - row_ids->back() < whole_range.last_row) { - erase_data.emplace_back(delete_file_cache); - } + erase_data.emplace_back(delete_file_cache); } }; delete_file_map.if_contains(data_file_path, get_value); @@ -393,7 +247,7 @@ Status IcebergTableReader::_position_delete( if (num_delete_rows > 0) { SCOPED_TIMER(_iceberg_profile.delete_rows_sort_time); _sort_delete_rows(delete_rows_array, num_delete_rows); - parquet_reader->set_delete_rows(&_delete_rows); + this->set_delete_rows(); COUNTER_UPDATE(_iceberg_profile.num_delete_rows, num_delete_rows); } // the deleted rows are copy out, we can erase them. @@ -428,9 +282,9 @@ IcebergTableReader::PositionDeleteRange IcebergTableReader::_get_range( int index = 0; while (index < read_rows) { StringRef data_path = file_path_column.get_data_at(index); - int left = index; - int right = read_rows - 1; - while (left < right) { + int left = index - 1; + int right = read_rows; + while (left + 1 != right) { int mid = left + (right - left) / 2; if (file_path_column.get_data_at(mid) > data_path) { right = mid; @@ -451,23 +305,23 @@ void IcebergTableReader::_sort_delete_rows(std::vector*>& d return; } if (delete_rows_array.size() == 1) { - _delete_rows.resize(num_delete_rows); - memcpy(&_delete_rows[0], &((*delete_rows_array.front())[0]), + _iceberg_delete_rows.resize(num_delete_rows); + memcpy(&_iceberg_delete_rows[0], &((*delete_rows_array.front())[0]), sizeof(int64_t) * num_delete_rows); return; } if (delete_rows_array.size() == 2) { - _delete_rows.resize(num_delete_rows); + _iceberg_delete_rows.resize(num_delete_rows); std::merge(delete_rows_array.front()->begin(), delete_rows_array.front()->end(), delete_rows_array.back()->begin(), delete_rows_array.back()->end(), - _delete_rows.begin()); + _iceberg_delete_rows.begin()); return; } using vec_pair = std::pair::iterator, std::vector::iterator>; - _delete_rows.resize(num_delete_rows); - auto row_id_iter = _delete_rows.begin(); - auto iter_end = _delete_rows.end(); + _iceberg_delete_rows.resize(num_delete_rows); + auto row_id_iter = _iceberg_delete_rows.begin(); + auto iter_end = _iceberg_delete_rows.end(); std::vector rows_array; for (auto rows : delete_rows_array) { if (rows->size() > 0) { @@ -493,58 +347,6 @@ void IcebergTableReader::_sort_delete_rows(std::vector*>& d } } -/* - * To support schema evolution, Iceberg write the column id to column name map to - * parquet file key_value_metadata. - * This function is to compare the table schema from FE (_col_id_name_map) with - * the schema in key_value_metadata for the current parquet file and generate two maps - * for future use: - * 1. table column name to parquet column name. - * 2. parquet column name to table column name. - * For example, parquet file has a column 'col1', - * after this file was written, iceberg changed the column name to 'col1_new'. - * The two maps would contain: - * 1. col1_new -> col1 - * 2. col1 -> col1_new - */ -Status IcebergTableReader::_gen_col_name_maps(std::vector parquet_meta_kv) { - for (int i = 0; i < parquet_meta_kv.size(); ++i) { - tparquet::KeyValue kv = parquet_meta_kv[i]; - if (kv.key == "iceberg.schema") { - _has_iceberg_schema = true; - std::string schema = kv.value; - rapidjson::Document json; - json.Parse(schema.c_str()); - - if (json.HasMember("fields")) { - rapidjson::Value& fields = json["fields"]; - if (fields.IsArray()) { - for (int j = 0; j < fields.Size(); j++) { - rapidjson::Value& e = fields[j]; - rapidjson::Value& id = e["id"]; - rapidjson::Value& name = e["name"]; - std::string name_string = name.GetString(); - transform(name_string.begin(), name_string.end(), name_string.begin(), - ::tolower); - auto iter = _col_id_name_map.find(id.GetInt()); - if (iter != _col_id_name_map.end()) { - _table_col_to_file_col.emplace(iter->second, name_string); - _file_col_to_table_col.emplace(name_string, iter->second); - if (name_string != iter->second) { - _has_schema_change = true; - } - } else { - _has_schema_change = true; - } - } - } - } - break; - } - } - return Status::OK(); -} - /* * Generate _all_required_col_names and _not_in_file_col_names. * @@ -599,4 +401,244 @@ void IcebergTableReader::_gen_new_colname_to_value_range() { } } +void IcebergTableReader::_gen_position_delete_file_range(Block& block, DeleteFile* position_delete, + size_t read_rows, + bool file_path_column_dictionary_coded) { + ColumnPtr path_column = block.get_by_name(ICEBERG_FILE_PATH).column; + DCHECK_EQ(path_column->size(), read_rows); + ColumnPtr pos_column = block.get_by_name(ICEBERG_ROW_POS).column; + using ColumnType = typename PrimitiveTypeTraits::ColumnType; + const int64_t* src_data = assert_cast(*pos_column).get_data().data(); + IcebergTableReader::PositionDeleteRange range; + if (file_path_column_dictionary_coded) { + range = _get_range(assert_cast(*path_column)); + } else { + range = _get_range(assert_cast(*path_column)); + } + for (int i = 0; i < range.range.size(); ++i) { + std::string key = range.data_file_path[i]; + auto iter = position_delete->find(key); + DeleteRows* delete_rows; + if (iter == position_delete->end()) { + delete_rows = new DeleteRows; + std::unique_ptr delete_rows_ptr(delete_rows); + (*position_delete)[key] = std::move(delete_rows_ptr); + } else { + delete_rows = iter->second.get(); + } + const int64_t* cpy_start = src_data + range.range[i].first; + const int64_t cpy_count = range.range[i].second - range.range[i].first; + int64_t origin_size = delete_rows->size(); + delete_rows->resize(origin_size + cpy_count); + int64_t* dest_position = &(*delete_rows)[origin_size]; + memcpy(dest_position, cpy_start, cpy_count * sizeof(int64_t)); + } +} + +Status IcebergParquetReader::init_reader( + const std::vector& file_col_names, + const std::unordered_map& col_id_name_map, + std::unordered_map* colname_to_value_range, + const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, + const RowDescriptor* row_descriptor, + const std::unordered_map* colname_to_slot_id, + const VExprContextSPtrs* not_single_slot_filter_conjuncts, + const std::unordered_map* slot_id_to_filter_conjuncts) { + _file_format = Fileformat::PARQUET; + ParquetReader* parquet_reader = static_cast(_file_format_reader.get()); + _col_id_name_map = col_id_name_map; + _file_col_names = file_col_names; + _colname_to_value_range = colname_to_value_range; + auto parquet_meta_kv = parquet_reader->get_metadata_key_values(); + static_cast(_gen_col_name_maps(parquet_meta_kv)); + _gen_file_col_names(); + _gen_new_colname_to_value_range(); + parquet_reader->set_table_to_file_col_map(_table_col_to_file_col); + parquet_reader->iceberg_sanitize(_all_required_col_names); + Status status = parquet_reader->init_reader( + _all_required_col_names, _not_in_file_col_names, &_new_colname_to_value_range, + conjuncts, tuple_descriptor, row_descriptor, colname_to_slot_id, + not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); + + return status; +} + +Status IcebergParquetReader ::_read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) { + ParquetReader parquet_delete_reader( + _profile, _params, *delete_range, READ_DELETE_FILE_BATCH_SIZE, + const_cast(&_state->timezone_obj()), _io_ctx, _state); + + RETURN_IF_ERROR(parquet_delete_reader.open()); + RETURN_IF_ERROR(parquet_delete_reader.init_reader(delete_file_col_names, {}, nullptr, {}, + nullptr, nullptr, nullptr, nullptr, nullptr, + false)); + + std::unordered_map> + partition_columns; + std::unordered_map missing_columns; + static_cast(parquet_delete_reader.set_fill_columns(partition_columns, missing_columns)); + + const tparquet::FileMetaData* meta_data = parquet_delete_reader.get_meta_data(); + bool dictionary_coded = true; + for (int j = 0; j < meta_data->row_groups.size(); ++j) { + auto& column_chunk = meta_data->row_groups[j].columns[ICEBERG_FILE_PATH_INDEX]; + if (!(column_chunk.__isset.meta_data && + column_chunk.meta_data.__isset.dictionary_page_offset)) { + dictionary_coded = false; + break; + } + } + DataTypePtr data_type_file_path {new DataTypeString}; + DataTypePtr data_type_pos {new DataTypeInt64}; + bool eof = false; + while (!eof) { + Block block = {dictionary_coded + ? ColumnWithTypeAndName {ColumnDictI32::create(), + data_type_file_path, ICEBERG_FILE_PATH} + : ColumnWithTypeAndName {data_type_file_path, ICEBERG_FILE_PATH}, + + {data_type_pos, ICEBERG_ROW_POS}}; + size_t read_rows = 0; + RETURN_IF_ERROR(parquet_delete_reader.get_next_block(&block, &read_rows, &eof)); + + if (read_rows <= 0) { + break; + } + _gen_position_delete_file_range(block, position_delete, read_rows, dictionary_coded); + } + return Status::OK(); +}; + +Status IcebergOrcReader::init_reader( + const std::vector& file_col_names, + const std::unordered_map& col_id_name_map, + std::unordered_map* colname_to_value_range, + const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, + const RowDescriptor* row_descriptor, + const std::unordered_map* colname_to_slot_id, + const VExprContextSPtrs* not_single_slot_filter_conjuncts, + const std::unordered_map* slot_id_to_filter_conjuncts) { + _file_format = Fileformat::ORC; + auto* orc_reader = static_cast(_file_format_reader.get()); + _col_id_name_map = col_id_name_map; + _file_col_names = file_col_names; + _colname_to_value_range = colname_to_value_range; + + RETURN_IF_ERROR(_gen_col_name_maps(orc_reader)); + _gen_file_col_names(); + _gen_new_colname_to_value_range(); + orc_reader->set_table_col_to_file_col(_table_col_to_file_col); + Status status = + orc_reader->init_reader(&_all_required_col_names, &_new_colname_to_value_range, + conjuncts, false, tuple_descriptor, row_descriptor, + not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts); + return status; +} + +Status IcebergOrcReader::_read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) { + OrcReader orc_delete_reader(_profile, _state, _params, *delete_range, + READ_DELETE_FILE_BATCH_SIZE, _state->timezone(), _io_ctx); + std::unordered_map colname_to_value_range; + Status init_status = orc_delete_reader.init_reader( + &delete_file_col_names, &colname_to_value_range, {}, false, {}, {}, nullptr, nullptr); + + std::unordered_map> + partition_columns; + std::unordered_map missing_columns; + static_cast(orc_delete_reader.set_fill_columns(partition_columns, missing_columns)); + + bool eof = false; + DataTypePtr data_type_file_path {new DataTypeString}; + DataTypePtr data_type_pos {new DataTypeInt64}; + while (!eof) { + Block block = {{data_type_file_path, ICEBERG_FILE_PATH}, {data_type_pos, ICEBERG_ROW_POS}}; + + size_t read_rows = 0; + RETURN_IF_ERROR(orc_delete_reader.get_next_block(&block, &read_rows, &eof)); + + _gen_position_delete_file_range(block, position_delete, read_rows, false); + } + return Status::OK(); +} + +/* + * To support schema evolution, Iceberg write the column id to column name map to + * parquet file key_value_metadata. + * This function is to compare the table schema from FE (_col_id_name_map) with + * the schema in key_value_metadata for the current parquet file and generate two maps + * for future use: + * 1. table column name to parquet column name. + * 2. parquet column name to table column name. + * For example, parquet file has a column 'col1', + * after this file was written, iceberg changed the column name to 'col1_new'. + * The two maps would contain: + * 1. col1_new -> col1 + * 2. col1 -> col1_new + */ +Status IcebergParquetReader::_gen_col_name_maps(std::vector parquet_meta_kv) { + for (int i = 0; i < parquet_meta_kv.size(); ++i) { + tparquet::KeyValue kv = parquet_meta_kv[i]; + if (kv.key == "iceberg.schema") { + _has_iceberg_schema = true; + std::string schema = kv.value; + rapidjson::Document json; + json.Parse(schema.c_str()); + + if (json.HasMember("fields")) { + rapidjson::Value& fields = json["fields"]; + if (fields.IsArray()) { + for (int j = 0; j < fields.Size(); j++) { + rapidjson::Value& e = fields[j]; + rapidjson::Value& id = e["id"]; + rapidjson::Value& name = e["name"]; + std::string name_string = name.GetString(); + transform(name_string.begin(), name_string.end(), name_string.begin(), + ::tolower); + auto iter = _col_id_name_map.find(id.GetInt()); + if (iter != _col_id_name_map.end()) { + _table_col_to_file_col.emplace(iter->second, name_string); + _file_col_to_table_col.emplace(name_string, iter->second); + if (name_string != iter->second) { + _has_schema_change = true; + } + } else { + _has_schema_change = true; + } + } + } + } + break; + } + } + return Status::OK(); +} + +Status IcebergOrcReader::_gen_col_name_maps(OrcReader* orc_reader) { + std::vector col_names; + std::vector col_ids; + RETURN_IF_ERROR( + orc_reader->get_schema_col_name_attribute(&col_names, &col_ids, ICEBERG_ORC_ATTRIBUTE)); + _has_iceberg_schema = true; + _table_col_to_file_col.clear(); + _file_col_to_table_col.clear(); + for (size_t i = 0; i < col_ids.size(); i++) { + auto col_id = col_ids[i]; + auto& file_col_name = col_names[i]; + + if (_col_id_name_map.find(col_id) == _col_id_name_map.end()) { + _has_schema_change = true; + continue; + } + auto& table_col_name = _col_id_name_map[col_id]; + _table_col_to_file_col.emplace(table_col_name, file_col_name); + _file_col_to_table_col.emplace(file_col_name, table_col_name); + if (table_col_name != file_col_name) { + _has_schema_change = true; + } + } + return Status::OK(); +} + } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/table/iceberg_reader.h b/be/src/vec/exec/format/table/iceberg_reader.h index 04be8d53f2..50c8d31bed 100644 --- a/be/src/vec/exec/format/table/iceberg_reader.h +++ b/be/src/vec/exec/format/table/iceberg_reader.h @@ -28,10 +28,16 @@ #include "common/status.h" #include "exec/olap_common.h" +#include "runtime/define_primitive_type.h" +#include "runtime/primitive_type.h" +#include "runtime/runtime_state.h" +#include "runtime/types.h" #include "table_format_reader.h" #include "util/runtime_profile.h" #include "vec/columns/column_dictionary.h" - +#include "vec/exec/format/orc/vorc_reader.h" +#include "vec/exec/format/parquet/vparquet_reader.h" +#include "vec/exprs/vslot_ref.h" namespace tparquet { class KeyValue; } // namespace tparquet @@ -60,8 +66,6 @@ class ShardedKVCache; class VExprContext; class IcebergTableReader : public TableFormatReader { - ENABLE_FACTORY_CREATOR(IcebergTableReader); - public: struct PositionDeleteRange { std::vector data_file_path; @@ -74,42 +78,40 @@ public: int64_t push_down_count); ~IcebergTableReader() override = default; - Status init_row_filters(const TFileRangeDesc& range) override; + Status init_row_filters(const TFileRangeDesc& range) final; - Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + Status get_next_block(Block* block, size_t* read_rows, bool* eof) final; Status set_fill_columns( const std::unordered_map>& partition_columns, - const std::unordered_map& missing_columns) override; + const std::unordered_map& missing_columns) final; - bool fill_all_columns() const override; + bool fill_all_columns() const final; Status get_columns(std::unordered_map* name_to_type, - std::unordered_set* missing_cols) override; + std::unordered_set* missing_cols) final; - Status init_reader( - const std::vector& file_col_names, - const std::unordered_map& col_id_name_map, - std::unordered_map* colname_to_value_range, - const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, - const RowDescriptor* row_descriptor, - const std::unordered_map* colname_to_slot_id, - const VExprContextSPtrs* not_single_slot_filter_conjuncts, - const std::unordered_map* slot_id_to_filter_conjuncts); + Status _position_delete_base(const std::vector& delete_files); enum { DATA, POSITION_DELETE, EQUALITY_DELETE }; + enum Fileformat { NONE, PARQUET, ORC, AVRO }; -private: + virtual void set_delete_rows() = 0; + +protected: struct IcebergProfile { RuntimeProfile::Counter* num_delete_files; RuntimeProfile::Counter* num_delete_rows; RuntimeProfile::Counter* delete_files_read_time; RuntimeProfile::Counter* delete_rows_sort_time; }; - - Status _position_delete(const std::vector& delete_files); - + using DeleteRows = std::vector; + using DeleteFile = phmap::parallel_flat_hash_map< + std::string, std::unique_ptr, std::hash, + std::equal_to, + std::allocator>>, 8, + std::mutex>; /** * https://iceberg.apache.org/spec/#position-delete-files * The rows in the delete file must be sorted by file_path then position to optimize filtering rows while scanning. @@ -123,8 +125,8 @@ private: PositionDeleteRange _get_range(const ColumnString& file_path_column); - Status _gen_col_name_maps(std::vector parquet_meta_kv); void _gen_file_col_names(); + void _gen_new_colname_to_value_range(); static std::string _delet_file_cache_key(const std::string& path) { return "delete_" + path; } @@ -135,7 +137,7 @@ private: // owned by scan node ShardedKVCache* _kv_cache; IcebergProfile _iceberg_profile; - std::vector _delete_rows; + std::vector _iceberg_delete_rows; // col names from _file_slot_descs std::vector _file_col_names; // file column name to table column name map. For iceberg schema evolution. @@ -143,13 +145,13 @@ private: // table column name to file column name map. For iceberg schema evolution. std::unordered_map _table_col_to_file_col; std::unordered_map* _colname_to_value_range; - // copy from _colname_to_value_range with new column name that is in parquet file, to support schema evolution. + // copy from _colname_to_value_range with new column name that is in parquet/orc file, to support schema evolution. std::unordered_map _new_colname_to_value_range; // column id to name map. Collect from FE slot descriptor. std::unordered_map _col_id_name_map; - // col names in the parquet file + // col names in the parquet,orc file std::vector _all_required_col_names; - // col names in table but not in parquet file + // col names in table but not in parquet,orc file std::vector _not_in_file_col_names; io::IOContext* _io_ctx; @@ -157,6 +159,88 @@ private: bool _has_iceberg_schema = false; int64_t _remaining_push_down_count; + Fileformat _file_format = Fileformat::NONE; + + const int64_t MIN_SUPPORT_DELETE_FILES_VERSION = 2; + const std::string ICEBERG_ROW_POS = "pos"; + const std::string ICEBERG_FILE_PATH = "file_path"; + const std::vector delete_file_col_names {ICEBERG_ROW_POS, ICEBERG_FILE_PATH}; + const std::vector delete_file_col_types {{TYPE_STRING}, {TYPE_BIGINT}}; + const int ICEBERG_FILE_PATH_INDEX = 0; + const int ICEBERG_FILE_POS_INDEX = 1; + const int READ_DELETE_FILE_BATCH_SIZE = 102400; + + //Read position_delete_file TFileRangeDesc, generate DeleteFile + virtual Status _read_position_delete_file(const TFileRangeDesc*, DeleteFile*) = 0; + + void _gen_position_delete_file_range(Block& block, DeleteFile* const position_delete, + size_t read_rows, bool file_path_column_dictionary_coded); }; + +class IcebergParquetReader final : public IcebergTableReader { +public: + ENABLE_FACTORY_CREATOR(IcebergParquetReader); + + IcebergParquetReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, + RuntimeState* state, const TFileScanRangeParams& params, + const TFileRangeDesc& range, ShardedKVCache* kv_cache, + io::IOContext* io_ctx, int64_t push_down_count) + : IcebergTableReader(std::move(file_format_reader), profile, state, params, range, + kv_cache, io_ctx, push_down_count) {} + Status init_reader( + const std::vector& file_col_names, + const std::unordered_map& col_id_name_map, + std::unordered_map* colname_to_value_range, + const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, + const RowDescriptor* row_descriptor, + const std::unordered_map* colname_to_slot_id, + const VExprContextSPtrs* not_single_slot_filter_conjuncts, + const std::unordered_map* slot_id_to_filter_conjuncts); + + Status _read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) override; + + void set_delete_rows() override { + auto* parquet_reader = (ParquetReader*)(_file_format_reader.get()); + parquet_reader->set_delete_rows(&_iceberg_delete_rows); + } + + Status _gen_col_name_maps(std::vector parquet_meta_kv); +}; +class IcebergOrcReader final : public IcebergTableReader { +public: + ENABLE_FACTORY_CREATOR(IcebergOrcReader); + + Status _read_position_delete_file(const TFileRangeDesc* delete_range, + DeleteFile* position_delete) override; + + IcebergOrcReader(std::unique_ptr file_format_reader, RuntimeProfile* profile, + RuntimeState* state, const TFileScanRangeParams& params, + const TFileRangeDesc& range, ShardedKVCache* kv_cache, io::IOContext* io_ctx, + int64_t push_down_count) + : IcebergTableReader(std::move(file_format_reader), profile, state, params, range, + kv_cache, io_ctx, push_down_count) {} + + void set_delete_rows() override { + auto* orc_reader = (OrcReader*)_file_format_reader.get(); + orc_reader->set_position_delete_rowids(&_iceberg_delete_rows); + } + + Status init_reader( + const std::vector& file_col_names, + const std::unordered_map& col_id_name_map, + std::unordered_map* colname_to_value_range, + const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, + const RowDescriptor* row_descriptor, + const std::unordered_map* colname_to_slot_id, + const VExprContextSPtrs* not_single_slot_filter_conjuncts, + const std::unordered_map* slot_id_to_filter_conjuncts); + + Status _gen_col_name_maps(OrcReader* orc_reader); + +private: + const std::string ICEBERG_ORC_ATTRIBUTE = "iceberg.id"; +}; + } // namespace vectorized } // namespace doris diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index fb3eaec078..c4bb8e20a1 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -820,15 +820,16 @@ Status VFileScanner::_get_next_reader() { } if (range.__isset.table_format_params && range.table_format_params.table_format_type == "iceberg") { - std::unique_ptr iceberg_reader = - IcebergTableReader::create_unique(std::move(parquet_reader), _profile, - _state, *_params, range, _kv_cache, - _io_ctx.get(), _get_push_down_count()); + std::unique_ptr iceberg_reader = + IcebergParquetReader::create_unique(std::move(parquet_reader), _profile, + _state, *_params, range, _kv_cache, + _io_ctx.get(), _get_push_down_count()); init_status = iceberg_reader->init_reader( _file_col_names, _col_id_name_map, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); + RETURN_IF_ERROR(iceberg_reader->init_row_filters(range)); _cur_reader = std::move(iceberg_reader); } else { @@ -870,6 +871,20 @@ Status VFileScanner::_get_next_reader() { &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts); RETURN_IF_ERROR(tran_orc_reader->init_row_filters(range)); _cur_reader = std::move(tran_orc_reader); + } else if (range.__isset.table_format_params && + range.table_format_params.table_format_type == "iceberg") { + std::unique_ptr iceberg_reader = IcebergOrcReader::create_unique( + std::move(orc_reader), _profile, _state, *_params, range, _kv_cache, + _io_ctx.get(), _get_push_down_count()); + + init_status = iceberg_reader->init_reader( + _file_col_names, _col_id_name_map, _colname_to_value_range, + _push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(), + _col_name_to_slot_id, &_not_single_slot_filter_conjuncts, + &_slot_id_to_filter_conjuncts); + + RETURN_IF_ERROR(iceberg_reader->init_row_filters(range)); + _cur_reader = std::move(iceberg_reader); } else { init_status = orc_reader->init_reader( &_file_col_names, _colname_to_value_range, _push_down_conjuncts, false, diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java index 70c384a0c4..05519f8459 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java @@ -37,7 +37,10 @@ import org.apache.doris.analysis.Subquery; import org.apache.doris.catalog.ArrayType; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.MapType; import org.apache.doris.catalog.ScalarType; +import org.apache.doris.catalog.StructField; +import org.apache.doris.catalog.StructType; import org.apache.doris.catalog.Type; import org.apache.doris.common.UserException; import org.apache.doris.common.util.TimeUtils; @@ -65,6 +68,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.stream.Collectors; /** * Iceberg utils @@ -503,8 +507,17 @@ public class IcebergUtils { Types.ListType list = (Types.ListType) type; return ArrayType.create(icebergTypeToDorisType(list.elementType()), true); case MAP: + Types.MapType map = (Types.MapType) type; + return new MapType( + icebergTypeToDorisType(map.keyType()), + icebergTypeToDorisType(map.valueType()) + ); case STRUCT: - return Type.UNSUPPORTED; + Types.StructType struct = (Types.StructType) type; + ArrayList nestedTypes = struct.fields().stream().map( + x -> new StructField(x.name(), icebergTypeToDorisType(x.type())) + ).collect(Collectors.toCollection(ArrayList::new)); + return new StructType(nestedTypes); default: throw new IllegalArgumentException("Cannot transform unknown type: " + type); } diff --git a/regression-test/data/external_table_p2/iceberg/iceberg_complex_type.out b/regression-test/data/external_table_p2/iceberg/iceberg_complex_type.out new file mode 100644 index 0000000000..2250381e6f --- /dev/null +++ b/regression-test/data/external_table_p2/iceberg/iceberg_complex_type.out @@ -0,0 +1,165 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !parquet_v1_1 -- +id INT Yes true \N +col2 ARRAY>>>> Yes true \N +col3 MAP,MAP>> Yes true \N +col4 STRUCT,y:ARRAY,z:MAP> Yes true \N +col5 MAP>>>>>>> Yes true \N +col6 STRUCT,yy:ARRAY>,zz:STRUCT>>> Yes true \N + +-- !parquet_v1_2 -- +1 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +2 [[[[[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +3 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]], [[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +4 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:222}}} {"x": [2], "y": [2, 2, 22935, 99, 59, 955], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +5 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2, 239, 39293259, 2223, 23, 59, 29, 9353]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +6 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}, {2:2}, {2:2}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} + +-- !parquet_v1_3 -- +6 + +-- !parquet_v1_4 -- +1 +1 +2 +1 +1 +1 + +-- !parquet_v1_5 -- +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] + +-- !parquet_v1_6 -- + +-- !parquet_v1_7 -- +6 1 +5 1 + +-- !parquet_v2_1 -- +id INT Yes true \N +col2 ARRAY>>>> Yes true \N +col3 MAP,MAP>> Yes true \N +col4 STRUCT,y:ARRAY,z:MAP> Yes true \N +col5 MAP>>>>>>> Yes true \N +col6 STRUCT,yy:ARRAY>,zz:STRUCT>>> Yes true \N + +-- !parquet_v2_2 -- +1 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +2 [[[[[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +3 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]], [[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +4 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:222}}} {"x": [2], "y": [2, 2, 22935, 99, 59, 955], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +5 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2, 239, 39293259, 2223, 23, 59, 29, 9353]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +6 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}, {2:2}, {2:2}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} + +-- !parquet_v2_3 -- +6 + +-- !parquet_v2_4 -- +1 +1 +2 +1 +1 +1 + +-- !parquet_v2_5 -- +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] + +-- !parquet_v2_6 -- + +-- !parquet_v2_7 -- +6 1 +5 1 + +-- !orc_v1_1 -- +id INT Yes true \N +col2 ARRAY>>>> Yes true \N +col3 MAP,MAP>> Yes true \N +col4 STRUCT,y:ARRAY,z:MAP> Yes true \N +col5 MAP>>>>>>> Yes true \N +col6 STRUCT,yy:ARRAY>,zz:STRUCT>>> Yes true \N + +-- !orc_v1_2 -- +1 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +2 [[[[[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +3 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]], [[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +4 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:222}}} {"x": [2], "y": [2, 2, 22935, 99, 59, 955], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +5 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2, 239, 39293259, 2223, 23, 59, 29, 9353]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +6 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}, {2:2}, {2:2}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} + +-- !orc_v1_3 -- +6 + +-- !orc_v1_4 -- +1 +1 +2 +1 +1 +1 + +-- !orc_v1_5 -- +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] + +-- !orc_v1_6 -- + +-- !orc_v1_7 -- +6 1 +5 1 + +-- !orc_v2_1 -- +id INT Yes true \N +col2 ARRAY>>>> Yes true \N +col3 MAP,MAP>> Yes true \N +col4 STRUCT,y:ARRAY,z:MAP> Yes true \N +col5 MAP>>>>>>> Yes true \N +col6 STRUCT,yy:ARRAY>,zz:STRUCT>>> Yes true \N + +-- !orc_v2_2 -- +1 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +2 [[[[[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +3 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]], [[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]], [[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +4 [[[[[2, 2, 3, 9]], [[2, 2, 3, 9], [2, 2, 3, 9], [2, 2, 3, 9]]]]] {[2]:{2:{2:222}}} {"x": [2], "y": [2, 2, 22935, 99, 59, 955], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +5 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2, 239, 39293259, 2223, 23, 59, 29, 9353]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} +6 [[[[[2, 2, 3, 9]]]]] {[2]:{2:{2:2}}} {"x": [2], "y": [2], "z": {1:"example"}} {2:{2:{2:{2:{2:{2:{"x": 2, "y": [2]}}}}}}} {"xx": [2, 2, 3, 9], "yy": [{2:2}, {2:2}, {2:2}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}, {2:4}], "zz": {"xxx": {"xxxx": {"xxxxx": 10123.33}}}} + +-- !orc_v2_3 -- +6 + +-- !orc_v2_4 -- +1 +1 +2 +1 +1 +1 + +-- !orc_v2_5 -- +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] +[[2]] + +-- !orc_v2_6 -- + +-- !orc_v2_7 -- +6 1 +5 1 + diff --git a/regression-test/data/external_table_p2/iceberg/iceberg_position_delete.out b/regression-test/data/external_table_p2/iceberg/iceberg_position_delete.out new file mode 100644 index 0000000000..2d61ebaaf5 --- /dev/null +++ b/regression-test/data/external_table_p2/iceberg/iceberg_position_delete.out @@ -0,0 +1,196 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !gen_data_1 -- + +-- !gen_data_2 -- + +-- !gen_data_3 -- + +-- !gen_data_4 -- + +-- !gen_data_5 -- + +-- !gen_data_6 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !gen_data_7 -- +2 +2 +2 + +-- !gen_data_8 -- +2 512 +3 512 +4 512 +6 512 +7 512 +8 512 +9 512 +11 512 +12 512 +13 512 +14 512 + +-- !gen_data_9 -- + +-- !gen_data_10 -- + +-- !gen_data_11 -- + +-- !gen_data_12 -- + +-- !gen_data_13 -- + +-- !gen_data_14 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !gen_data_15 -- +7 12345xxx +7 12345xxx +7 12345xxx + +-- !gen_data_16 -- + +-- !gen_data_17 -- + +-- !gen_data_18 -- + +-- !gen_data_19 -- +5632 + +-- !gen_data_20 -- +5632 + +-- !orc_1 -- + +-- !orc_2 -- + +-- !orc_3 -- + +-- !orc_4 -- + +-- !orc_5 -- + +-- !orc_6 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !orc_7 -- +2 +2 +2 + +-- !orc_8 -- +2 512 +3 512 +4 512 +6 512 +7 512 +8 512 +9 512 +11 512 +12 512 +13 512 +14 512 + +-- !orc_9 -- + +-- !orc_10 -- + +-- !orc_11 -- + +-- !orc_12 -- + +-- !orc_13 -- + +-- !orc_14 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !orc_15 -- +7 12345xxx +7 12345xxx +7 12345xxx + +-- !orc_16 -- + +-- !orc_17 -- + +-- !orc_18 -- + +-- !orc_19 -- +5632 + +-- !orc_20 -- +5632 + +-- !parquet_1 -- + +-- !parquet_2 -- + +-- !parquet_3 -- + +-- !parquet_4 -- + +-- !parquet_5 -- + +-- !parquet_6 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !parquet_7 -- +2 +2 +2 + +-- !parquet_8 -- +2 512 +3 512 +4 512 +6 512 +7 512 +8 512 +9 512 +11 512 +12 512 +13 512 +14 512 + +-- !parquet_9 -- + +-- !parquet_10 -- + +-- !parquet_11 -- + +-- !parquet_12 -- + +-- !parquet_13 -- + +-- !parquet_14 -- +2 select xxxxxxxxx +2 select xxxxxxxxx +2 select xxxxxxxxx + +-- !parquet_15 -- +7 12345xxx +7 12345xxx +7 12345xxx + +-- !parquet_16 -- + +-- !parquet_17 -- + +-- !parquet_18 -- + +-- !parquet_19 -- +5632 + +-- !parquet_20 -- +5632 + diff --git a/regression-test/data/external_table_p2/iceberg/iceberg_schema_change.out b/regression-test/data/external_table_p2/iceberg/iceberg_schema_change.out new file mode 100644 index 0000000000..4faf8c695e --- /dev/null +++ b/regression-test/data/external_table_p2/iceberg/iceberg_schema_change.out @@ -0,0 +1,305 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !parquet_v1_1 -- +rename_col8 BIGINT Yes true \N +rename_col9 DOUBLE Yes true \N +rename_col10 DECIMAL(20, 5) Yes true \N +id INT Yes true \N +rename_col1 ARRAY Yes true \N +rename_col2 ARRAY Yes true \N +rename_col3 ARRAY Yes true \N +rename_col4 MAP Yes true \N +rename_col5 MAP Yes true \N +rename_col6 MAP Yes true \N +rename_col7 STRUCT Yes true \N +col_add INT Yes true \N +col_add2 INT Yes true \N + +-- !parquet_v1_2 -- +1 1.2000000476837158 1.12345 1 [1, 2, 3] [1.100000023841858, 2.200000047683716, 3.299999952316284] [1.1234, 2.2345, 3.3456] {1:10, 2:20} {1:1.100000023841858, 2:2.200000047683716} {1:1.12345, 2:2.23456} {"add": null, "x": 1, "y": 1.100000023841858} \N \N +1 1.2000000476837158 1.12345 2 [4, 5, 6] [4.400000095367432, 5.5, 6.599999904632568] [4.4567, 5.5678, 6.6789] {3:30, 4:40} {3:3.299999952316284, 4:4.400000095367432} {3:3.34567, 4:4.45678} {"add": null, "x": 2, "y": 2.200000047683716} \N \N +1 1.2000000476837158 1.12345 3 [7, 8, 9] [7.699999809265137, 8.800000190734863, 9.899999618530273] [7.7890, 8.8901, 9.9012] {5:50, 6:60} {5:5.5, 6:6.599999904632568} {5:5.56789, 6:6.67890} {"add": null, "x": 3, "y": 3.299999952316284} \N \N +1 1.2000000476837158 1.12345 4 [10, 11, 12] [10.100000381469727, 11.109999656677246, 12.119999885559082] [10.1011, 11.1112, 12.1213] {7:70, 8:80} {7:7.699999809265137, 8:8.800000190734863} {7:7.78901, 8:8.89012} {"add": null, "x": 4, "y": 4.400000095367432} \N \N +1 1.2000000476837158 1.12345 5 [13, 14, 15] [13.130000114440918, 14.140000343322754, 15.149999618530273] [13.1314, 14.1415, 15.1516] {9:90, 10:100} {9:9.899999618530273, 10:10.100000381469727} {9:9.89012, 10:10.10123} {"add": null, "x": 5, "y": 5.5} \N \N +21447483648 1.7976931348623157E308 1234567890.12345 6 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 1 2 +2144748345648 1.7976931348623157E308 1234567890.23456 7 [2144748345648, 214742435483649, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 2 3 +21447483648 1.7976931348623157E308 1234567890.12345 8 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 3 4 +2144748345648 1.7976931348623157E308 1234567890.23456 9 [2144748345648, 214742435483649, 214742435483650, 214742435483650, 214742435483650, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 4 5 + +-- !parquet_v1_3 -- +9 + +-- !parquet_v1_4 -- +6 +7 +8 +9 + +-- !parquet_v1_5 -- +1 +2 +3 +4 + +-- !parquet_v1_6 -- +6 +7 +8 +9 + +-- !parquet_v1_7 -- +{"add": null, "x": 1, "y": 1.100000023841858} +{"add": null, "x": 2, "y": 2.200000047683716} +{"add": null, "x": 3, "y": 3.299999952316284} +{"add": null, "x": 4, "y": 4.400000095367432} +{"add": null, "x": 5, "y": 5.5} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} + +-- !parquet_v1_8 -- +3 +4 +5 + +-- !parquet_v1_9 -- +9 1 +8 1 +7 1 +6 1 +5 0 +4 0 +3 0 +2 0 +1 0 + +-- !parquet_v1_10 -- + +-- !parquet_v2_1 -- +rename_col8 BIGINT Yes true \N +rename_col9 DOUBLE Yes true \N +rename_col10 DECIMAL(20, 5) Yes true \N +id INT Yes true \N +rename_col1 ARRAY Yes true \N +rename_col2 ARRAY Yes true \N +rename_col3 ARRAY Yes true \N +rename_col4 MAP Yes true \N +rename_col5 MAP Yes true \N +rename_col6 MAP Yes true \N +rename_col7 STRUCT Yes true \N +col_add INT Yes true \N +col_add2 INT Yes true \N + +-- !parquet_v2_2 -- +1 1.2000000476837158 1.12345 1 [1, 2, 3] [1.100000023841858, 2.200000047683716, 3.299999952316284] [1.1234, 2.2345, 3.3456] {1:10, 2:20} {1:1.100000023841858, 2:2.200000047683716} {1:1.12345, 2:2.23456} {"add": null, "x": 1, "y": 1.100000023841858} \N \N +1 1.2000000476837158 1.12345 2 [4, 5, 6] [4.400000095367432, 5.5, 6.599999904632568] [4.4567, 5.5678, 6.6789] {3:30, 4:40} {3:3.299999952316284, 4:4.400000095367432} {3:3.34567, 4:4.45678} {"add": null, "x": 2, "y": 2.200000047683716} \N \N +1 1.2000000476837158 1.12345 3 [7, 8, 9] [7.699999809265137, 8.800000190734863, 9.899999618530273] [7.7890, 8.8901, 9.9012] {5:50, 6:60} {5:5.5, 6:6.599999904632568} {5:5.56789, 6:6.67890} {"add": null, "x": 3, "y": 3.299999952316284} \N \N +1 1.2000000476837158 1.12345 4 [10, 11, 12] [10.100000381469727, 11.109999656677246, 12.119999885559082] [10.1011, 11.1112, 12.1213] {7:70, 8:80} {7:7.699999809265137, 8:8.800000190734863} {7:7.78901, 8:8.89012} {"add": null, "x": 4, "y": 4.400000095367432} \N \N +1 1.2000000476837158 1.12345 5 [13, 14, 15] [13.130000114440918, 14.140000343322754, 15.149999618530273] [13.1314, 14.1415, 15.1516] {9:90, 10:100} {9:9.899999618530273, 10:10.100000381469727} {9:9.89012, 10:10.10123} {"add": null, "x": 5, "y": 5.5} \N \N +21447483648 1.7976931348623157E308 1234567890.12345 6 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 1 2 +2144748345648 1.7976931348623157E308 1234567890.23456 7 [2144748345648, 214742435483649, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 2 3 +21447483648 1.7976931348623157E308 1234567890.12345 8 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 3 4 +2144748345648 1.7976931348623157E308 1234567890.23456 9 [2144748345648, 214742435483649, 214742435483650, 214742435483650, 214742435483650, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 4 5 + +-- !parquet_v2_3 -- +9 + +-- !parquet_v2_4 -- +6 +7 +8 +9 + +-- !parquet_v2_5 -- +1 +2 +3 +4 + +-- !parquet_v2_6 -- +6 +7 +8 +9 + +-- !parquet_v2_7 -- +{"add": null, "x": 1, "y": 1.100000023841858} +{"add": null, "x": 2, "y": 2.200000047683716} +{"add": null, "x": 3, "y": 3.299999952316284} +{"add": null, "x": 4, "y": 4.400000095367432} +{"add": null, "x": 5, "y": 5.5} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} + +-- !parquet_v2_8 -- +3 +4 +5 + +-- !parquet_v2_9 -- +9 1 +8 1 +7 1 +6 1 +5 0 +4 0 +3 0 +2 0 +1 0 + +-- !parquet_v2_10 -- + +-- !orc_v1_1 -- +rename_col8 BIGINT Yes true \N +rename_col9 DOUBLE Yes true \N +rename_col10 DECIMAL(20, 5) Yes true \N +id INT Yes true \N +rename_col1 ARRAY Yes true \N +rename_col2 ARRAY Yes true \N +rename_col3 ARRAY Yes true \N +rename_col4 MAP Yes true \N +rename_col5 MAP Yes true \N +rename_col6 MAP Yes true \N +rename_col7 STRUCT Yes true \N +col_add INT Yes true \N +col_add2 INT Yes true \N + +-- !orc_v1_2 -- +1 1.2000000476837158 1.12345 1 [1, 2, 3] [1.100000023841858, 2.200000047683716, 3.299999952316284] [1.1234, 2.2345, 3.3456] {1:10, 2:20} {1:1.100000023841858, 2:2.200000047683716} {1:1.12345, 2:2.23456} {"add": null, "x": 1, "y": 1.100000023841858} \N \N +1 1.2000000476837158 1.12345 2 [4, 5, 6] [4.400000095367432, 5.5, 6.599999904632568] [4.4567, 5.5678, 6.6789] {3:30, 4:40} {3:3.299999952316284, 4:4.400000095367432} {3:3.34567, 4:4.45678} {"add": null, "x": 2, "y": 2.200000047683716} \N \N +1 1.2000000476837158 1.12345 3 [7, 8, 9] [7.699999809265137, 8.800000190734863, 9.899999618530273] [7.7890, 8.8901, 9.9012] {5:50, 6:60} {5:5.5, 6:6.599999904632568} {5:5.56789, 6:6.67890} {"add": null, "x": 3, "y": 3.299999952316284} \N \N +1 1.2000000476837158 1.12345 4 [10, 11, 12] [10.100000381469727, 11.109999656677246, 12.119999885559082] [10.1011, 11.1112, 12.1213] {7:70, 8:80} {7:7.699999809265137, 8:8.800000190734863} {7:7.78901, 8:8.89012} {"add": null, "x": 4, "y": 4.400000095367432} \N \N +1 1.2000000476837158 1.12345 5 [13, 14, 15] [13.130000114440918, 14.140000343322754, 15.149999618530273] [13.1314, 14.1415, 15.1516] {9:90, 10:100} {9:9.899999618530273, 10:10.100000381469727} {9:9.89012, 10:10.10123} {"add": null, "x": 5, "y": 5.5} \N \N +21447483648 1.7976931348623157E308 1234567890.12345 6 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 1 2 +2144748345648 1.7976931348623157E308 1234567890.23456 7 [2144748345648, 214742435483649, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 2 3 +21447483648 1.7976931348623157E308 1234567890.12345 8 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 3 4 +2144748345648 1.7976931348623157E308 1234567890.23456 9 [2144748345648, 214742435483649, 214742435483650, 214742435483650, 214742435483650, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 4 5 + +-- !orc_v1_3 -- +9 + +-- !orc_v1_4 -- +6 +7 +8 +9 + +-- !orc_v1_5 -- +1 +2 +3 +4 + +-- !orc_v1_6 -- +6 +7 +8 +9 + +-- !orc_v1_7 -- +{"add": null, "x": 1, "y": 1.100000023841858} +{"add": null, "x": 2, "y": 2.200000047683716} +{"add": null, "x": 3, "y": 3.299999952316284} +{"add": null, "x": 4, "y": 4.400000095367432} +{"add": null, "x": 5, "y": 5.5} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} + +-- !orc_v1_8 -- +3 +4 +5 + +-- !orc_v1_9 -- +9 1 +8 1 +7 1 +6 1 +5 0 +4 0 +3 0 +2 0 +1 0 + +-- !orc_v1_10 -- + +-- !orc_v2_1 -- +rename_col8 BIGINT Yes true \N +rename_col9 DOUBLE Yes true \N +rename_col10 DECIMAL(20, 5) Yes true \N +id INT Yes true \N +rename_col1 ARRAY Yes true \N +rename_col2 ARRAY Yes true \N +rename_col3 ARRAY Yes true \N +rename_col4 MAP Yes true \N +rename_col5 MAP Yes true \N +rename_col6 MAP Yes true \N +rename_col7 STRUCT Yes true \N +col_add INT Yes true \N +col_add2 INT Yes true \N + +-- !orc_v2_2 -- +1 1.2000000476837158 1.12345 1 [1, 2, 3] [1.100000023841858, 2.200000047683716, 3.299999952316284] [1.1234, 2.2345, 3.3456] {1:10, 2:20} {1:1.100000023841858, 2:2.200000047683716} {1:1.12345, 2:2.23456} {"add": null, "x": 1, "y": 1.100000023841858} \N \N +1 1.2000000476837158 1.12345 2 [4, 5, 6] [4.400000095367432, 5.5, 6.599999904632568] [4.4567, 5.5678, 6.6789] {3:30, 4:40} {3:3.299999952316284, 4:4.400000095367432} {3:3.34567, 4:4.45678} {"add": null, "x": 2, "y": 2.200000047683716} \N \N +1 1.2000000476837158 1.12345 3 [7, 8, 9] [7.699999809265137, 8.800000190734863, 9.899999618530273] [7.7890, 8.8901, 9.9012] {5:50, 6:60} {5:5.5, 6:6.599999904632568} {5:5.56789, 6:6.67890} {"add": null, "x": 3, "y": 3.299999952316284} \N \N +1 1.2000000476837158 1.12345 4 [10, 11, 12] [10.100000381469727, 11.109999656677246, 12.119999885559082] [10.1011, 11.1112, 12.1213] {7:70, 8:80} {7:7.699999809265137, 8:8.800000190734863} {7:7.78901, 8:8.89012} {"add": null, "x": 4, "y": 4.400000095367432} \N \N +1 1.2000000476837158 1.12345 5 [13, 14, 15] [13.130000114440918, 14.140000343322754, 15.149999618530273] [13.1314, 14.1415, 15.1516] {9:90, 10:100} {9:9.899999618530273, 10:10.100000381469727} {9:9.89012, 10:10.10123} {"add": null, "x": 5, "y": 5.5} \N \N +21447483648 1.7976931348623157E308 1234567890.12345 6 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 1 2 +2144748345648 1.7976931348623157E308 1234567890.23456 7 [2144748345648, 214742435483649, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 2 3 +21447483648 1.7976931348623157E308 1234567890.12345 8 [21447483648, 21474483649, 21474483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [1234567890.1235, 1234567890.2346, 1234567890.3457] {214748348:2147483648, 24748649:214743383649} {214748648:1.7976931348623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} 3 4 +2144748345648 1.7976931348623157E308 1234567890.23456 9 [2144748345648, 214742435483649, 214742435483650, 214742435483650, 214742435483650, 214742435483650] [1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308] [12345673890.1235, 12345367890.2346, 12344567890.3457] {214748348:2147483632148, 24748649:213144743383649} {214748648:1.717693623157e+308, 27483649:1.7976931348623157e+308} {214743648:1234567890.12345, 21474649:1234567890.23456} {"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} 4 5 + +-- !orc_v2_3 -- +9 + +-- !orc_v2_4 -- +6 +7 +8 +9 + +-- !orc_v2_5 -- +1 +2 +3 +4 + +-- !orc_v2_6 -- +6 +7 +8 +9 + +-- !orc_v2_7 -- +{"add": null, "x": 1, "y": 1.100000023841858} +{"add": null, "x": 2, "y": 2.200000047683716} +{"add": null, "x": 3, "y": 3.299999952316284} +{"add": null, "x": 4, "y": 4.400000095367432} +{"add": null, "x": 5, "y": 5.5} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214748223648, "y": 1.7976931346232156e+308} +{"add": 1234567890.12345, "x": 214743338223648, "y": 1.7976931346232156e+308} + +-- !orc_v2_8 -- +3 +4 +5 + +-- !orc_v2_9 -- +9 1 +8 1 +7 1 +6 1 +5 0 +4 0 +3 0 +2 0 +1 0 + +-- !orc_v2_10 -- + diff --git a/regression-test/suites/external_table_p2/iceberg/iceberg_complex_type.groovy b/regression-test/suites/external_table_p2/iceberg/iceberg_complex_type.groovy new file mode 100644 index 0000000000..f465a9afe3 --- /dev/null +++ b/regression-test/suites/external_table_p2/iceberg/iceberg_complex_type.groovy @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("iceberg_complex_type", "p2,external,iceberg,external_remote,external_remote_iceberg") { + + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + + String catalog_name = "test_external_iceberg_complex_type" + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHdfsPort = context.config.otherConfigs.get("extHdfsPort") + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='iceberg', + 'iceberg.catalog.type'='hadoop', + 'warehouse' = 'hdfs://${extHiveHmsHost}:${extHdfsPort}/usr/hive/warehouse/hadoop_catalog' + ); + """ + + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + sql """ use multi_catalog;""" + + + + qt_parquet_v1_1 """ desc complex_parquet_v1 ;""" + qt_parquet_v1_2 """ select * from complex_parquet_v1 order by id; """ + qt_parquet_v1_3 """ select count(*) from complex_parquet_v1 ;""" + qt_parquet_v1_4 """ select array_size(col2) from complex_parquet_v1 where col2 is not null order by id ; """ + qt_parquet_v1_5 """ select map_keys(col3) from complex_parquet_v1 order by id; """ + qt_parquet_v1_6 """ select struct_element(col4, 1) from complex_parquet_v1 where id >=7 order by id; """ + qt_parquet_v1_7 """ select id,count(col2) from complex_parquet_v1 group by id order by id desc limit 2; """ + + + qt_parquet_v2_1 """ desc complex_parquet_v2 ;""" + qt_parquet_v2_2 """ select * from complex_parquet_v2 order by id; """ + qt_parquet_v2_3 """ select count(*) from complex_parquet_v2 ;""" + qt_parquet_v2_4 """ select array_size(col2) from complex_parquet_v2 where col2 is not null order by id ; """ + qt_parquet_v2_5 """ select map_keys(col3) from complex_parquet_v2 order by id; """ + qt_parquet_v2_6 """ select struct_element(col4, 1) from complex_parquet_v2 where id >=7 order by id; """ + qt_parquet_v2_7 """ select id,count(col2) from complex_parquet_v2 group by id order by id desc limit 2; """ + + + qt_orc_v1_1 """ desc complex_orc_v1 ;""" + qt_orc_v1_2 """ select * from complex_orc_v1 order by id; """ + qt_orc_v1_3 """ select count(*) from complex_orc_v1 ;""" + qt_orc_v1_4 """ select array_size(col2) from complex_orc_v1 where col2 is not null order by id ; """ + qt_orc_v1_5 """ select map_keys(col3) from complex_orc_v1 order by id; """ + qt_orc_v1_6 """ select struct_element(col4, 1) from complex_orc_v1 where id >=7 order by id; """ + qt_orc_v1_7 """ select id,count(col2) from complex_orc_v1 group by id order by id desc limit 2; """ + + + qt_orc_v2_1 """ desc complex_orc_v2 ;""" + qt_orc_v2_2 """ select * from complex_orc_v2 order by id; """ + qt_orc_v2_3 """ select count(*) from complex_orc_v2 ;""" + qt_orc_v2_4 """ select array_size(col2) from complex_orc_v2 where col2 is not null order by id ; """ + qt_orc_v2_5 """ select map_keys(col3) from complex_orc_v2 order by id; """ + qt_orc_v2_6 """ select struct_element(col4, 1) from complex_orc_v2 where id >=7 order by id; """ + qt_orc_v2_7 """ select id,count(col2) from complex_orc_v2 group by id order by id desc limit 2; """ + + + + + } +} + +/* +schema : + id int + col2 array>>>> + col3 map,map>> + col4 struct,y:array,z:map> + col5 map>>>>>>> + col6 struct,yy:array>,zz:struct>>> + +*/ \ No newline at end of file diff --git a/regression-test/suites/external_table_p2/iceberg/iceberg_position_delete.groovy b/regression-test/suites/external_table_p2/iceberg/iceberg_position_delete.groovy new file mode 100644 index 0000000000..4cb497c307 --- /dev/null +++ b/regression-test/suites/external_table_p2/iceberg/iceberg_position_delete.groovy @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("iceberg_position_delete", "p2,external,iceberg,external_remote,external_remote_iceberg") { + + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + + String catalog_name = "test_external_iceberg_position_delete" + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHdfsPort = context.config.otherConfigs.get("extHdfsPort") + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='iceberg', + 'iceberg.catalog.type'='hadoop', + 'warehouse' = 'hdfs://${extHiveHmsHost}:${extHdfsPort}/usr/hive/warehouse/hadoop_catalog' + ); + """ + + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + sql """ use multi_catalog;""" + + qt_gen_data_1 """ select * from iceberg_position_gen_data where name = 'xyzxxxxxx' and id != 9;""" + qt_gen_data_2 """ select * from iceberg_position_gen_data where id = 1; """ + qt_gen_data_3 """ select * from iceberg_position_gen_data where id = 5; """ + qt_gen_data_4 """ select * from iceberg_position_gen_data where id = 10; """ + qt_gen_data_5 """ select * from iceberg_position_gen_data where id = 15; """ + qt_gen_data_6 """ select * from iceberg_position_gen_data where id = 2 limit 3;""" + qt_gen_data_7 """ select id from iceberg_position_gen_data where id = 2 limit 3;""" + qt_gen_data_8 """ select id,count(name) from iceberg_position_gen_data where id != 1 group by id order by id ;""" + qt_gen_data_9 """ select id from iceberg_position_gen_data where id = 1; """ + qt_gen_data_10 """ select name from iceberg_position_gen_data where id = 5; """ + qt_gen_data_11 """ select id from iceberg_position_gen_data where id = 10; """ + qt_gen_data_12 """ select name from iceberg_position_gen_data where id = 15;""" + qt_gen_data_13 """ select * from iceberg_position_gen_data where id = 15 and name = 'select xxxxxxxxx';""" + qt_gen_data_14 """ select * from iceberg_position_gen_data where id = 2 and name = 'select xxxxxxxxx' limit 3;""" + qt_gen_data_15 """ select * from iceberg_position_gen_data where id = 7 and name = '12345xxx' limit 3;""" + qt_gen_data_16 """ select * from iceberg_position_gen_data where name = 'hello world' ;""" + qt_gen_data_17 """ select name from iceberg_position_gen_data where name = 'hello world' ;""" + qt_gen_data_18 """ select id from iceberg_position_gen_data where name = 'hello world' ;""" + qt_gen_data_19 """ select count(*) from iceberg_position_gen_data where name != 'final entryxxxxxx' ;""" + qt_gen_data_20 """ select count(*) from iceberg_position_gen_data; """ + + + qt_orc_1 """ select * from iceberg_position_orc where name = 'xyzxxxxxx' and id != 9;""" + qt_orc_2 """ select * from iceberg_position_orc where id = 1; """ + qt_orc_3 """ select * from iceberg_position_orc where id = 5; """ + qt_orc_4 """ select * from iceberg_position_orc where id = 10; """ + qt_orc_5 """ select * from iceberg_position_orc where id = 15; """ + qt_orc_6 """ select * from iceberg_position_orc where id = 2 limit 3;""" + qt_orc_7 """ select id from iceberg_position_orc where id = 2 limit 3;""" + qt_orc_8 """ select id,count(name) from iceberg_position_orc where id != 1 group by id order by id ;""" + qt_orc_9 """ select id from iceberg_position_orc where id = 1; """ + qt_orc_10 """ select name from iceberg_position_orc where id = 5; """ + qt_orc_11 """ select id from iceberg_position_orc where id = 10; """ + qt_orc_12 """ select name from iceberg_position_orc where id = 15;""" + qt_orc_13 """ select * from iceberg_position_orc where id = 15 and name = 'select xxxxxxxxx';""" + qt_orc_14 """ select * from iceberg_position_orc where id = 2 and name = 'select xxxxxxxxx' limit 3;""" + qt_orc_15 """ select * from iceberg_position_orc where id = 7 and name = '12345xxx' limit 3;""" + qt_orc_16 """ select * from iceberg_position_orc where name = 'hello world' ;""" + qt_orc_17 """ select name from iceberg_position_orc where name = 'hello world' ;""" + qt_orc_18 """ select id from iceberg_position_orc where name = 'hello world' ;""" + qt_orc_19 """ select count(*) from iceberg_position_orc where name != 'final entryxxxxxx' ;""" + qt_orc_20 """ select count(*) from iceberg_position_orc; """ + + qt_parquet_1 """ select * from iceberg_position_parquet where name = 'xyzxxxxxx' and id != 9;""" + qt_parquet_2 """ select * from iceberg_position_parquet where id = 1; """ + qt_parquet_3 """ select * from iceberg_position_parquet where id = 5; """ + qt_parquet_4 """ select * from iceberg_position_parquet where id = 10; """ + qt_parquet_5 """ select * from iceberg_position_parquet where id = 15; """ + qt_parquet_6 """ select * from iceberg_position_parquet where id = 2 limit 3;""" + qt_parquet_7 """ select id from iceberg_position_parquet where id = 2 limit 3;""" + qt_parquet_8 """ select id,count(name) from iceberg_position_parquet where id != 1 group by id order by id ;""" + qt_parquet_9 """ select id from iceberg_position_parquet where id = 1; """ + qt_parquet_10 """ select name from iceberg_position_parquet where id = 5; """ + qt_parquet_11 """ select id from iceberg_position_parquet where id = 10; """ + qt_parquet_12 """ select name from iceberg_position_parquet where id = 15;""" + qt_parquet_13 """ select * from iceberg_position_parquet where id = 15 and name = 'select xxxxxxxxx';""" + qt_parquet_14 """ select * from iceberg_position_parquet where id = 2 and name = 'select xxxxxxxxx' limit 3;""" + qt_parquet_15 """ select * from iceberg_position_parquet where id = 7 and name = '12345xxx' limit 3;""" + qt_parquet_16 """ select * from iceberg_position_parquet where name = 'hello world' ;""" + qt_parquet_17 """ select name from iceberg_position_parquet where name = 'hello world' ;""" + qt_parquet_18 """ select id from iceberg_position_parquet where name = 'hello world' ;""" + qt_parquet_19 """ select count(*) from iceberg_position_parquet where name != 'final entryxxxxxx' ;""" + qt_parquet_20 """ select count(*) from iceberg_position_parquet; """ + + } +} +/* + + +create table iceberg_position_gen_data( + id int, + name string +) +USING iceberg +TBLPROPERTIES ( + 'format-version' = '2', + 'write.format.default' = 'orc', + 'write.update.mode' = 'merge-on-read', + 'write.merge.mode' = 'merge-on-read', + 'write.delete.mode' = 'merge-on-read' +); + +INSERT INTO iceberg_position_gen_data VALUES +(1, "hello world"), +(2, "select xxxxxxxxx"), +(3, "example xxxx"), +(4, "more dataxxx"), +(5, "another examplexxx"), +(6, "testxxx"), +(7, "12345xxx"), +(8, "abcdefxxxx"), +(9, "xyzxxxxxx"), +(10, "inserted dataxxxxx"), +(11, "SQLxxxxx"), +(12, "tablexxxx"), +(13, "rowxxxx"), +(14, "data entryxxxx"), +(15, "final entryxxxxxx"); +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; +insert into iceberg_position_gen_data select * from iceberg_position_gen_data; + + + +create table iceberg_position_parquet( + id int, + name string +) +USING iceberg +TBLPROPERTIES ( + 'format-version' = '2', + 'write.format.default' = 'parquet', + 'write.update.mode' = 'merge-on-read', + 'write.merge.mode' = 'merge-on-read', + 'write.delete.mode' = 'merge-on-read' +); +create table iceberg_position_orc( + id int, + name string +) +USING iceberg +TBLPROPERTIES ( + 'format-version' = '2', + 'write.format.default' = 'orc', + 'write.update.mode' = 'merge-on-read', + 'write.merge.mode' = 'merge-on-read', + 'write.delete.mode' = 'merge-on-read' +); + +insert into iceberg_position_parquet select * from iceberg_position_gen_data; +insert into iceberg_position_orc select * from iceberg_position_parquet; + + +delete from iceberg_position_gen_data where id = 1; +delete from iceberg_position_gen_data where id = 5; +delete from iceberg_position_gen_data where id = 10; +delete from iceberg_position_gen_data where id = 15; + +delete from iceberg_position_parquet where id = 1; +delete from iceberg_position_parquet where id = 5; +delete from iceberg_position_parquet where id = 10; +delete from iceberg_position_parquet where id = 15; + +delete from iceberg_position_orc where id = 1; +delete from iceberg_position_orc where id = 5; +delete from iceberg_position_orc where id = 10; +delete from iceberg_position_orc where id = 15; +*/ + + diff --git a/regression-test/suites/external_table_p2/iceberg/iceberg_schema_change.groovy b/regression-test/suites/external_table_p2/iceberg/iceberg_schema_change.groovy new file mode 100644 index 0000000000..5e03668359 --- /dev/null +++ b/regression-test/suites/external_table_p2/iceberg/iceberg_schema_change.groovy @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("iceberg_schema_change", "p2,external,iceberg,external_remote,external_remote_iceberg") { + + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + + String catalog_name = "test_external_iceberg_schema_change" + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHdfsPort = context.config.otherConfigs.get("extHdfsPort") + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='iceberg', + 'iceberg.catalog.type'='hadoop', + 'warehouse' = 'hdfs://${extHiveHmsHost}:${extHdfsPort}/usr/hive/warehouse/hadoop_catalog' + ); + """ + + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + sql """ use multi_catalog;""" + + + + + + qt_parquet_v1_1 """ desc complex_parquet_v1_schema_change ;""" + qt_parquet_v1_2 """ select * from complex_parquet_v1_schema_change order by id; """ + qt_parquet_v1_3 """ select count(*) from complex_parquet_v1_schema_change ;""" + qt_parquet_v1_4 """ select id from complex_parquet_v1_schema_change where col_add +1 = col_add2 order by id;""" + qt_parquet_v1_5 """ select col_add from complex_parquet_v1_schema_change where col_add is not null order by col_add ; """ + qt_parquet_v1_6 """ select id from complex_parquet_v1_schema_change where col_add + 5 = id order by id; """ + qt_parquet_v1_7 """ select rename_col7 from complex_parquet_v1_schema_change order by id; """ + qt_parquet_v1_8 """ select col_add2 from complex_parquet_v1_schema_change where id >=7 order by id; """ + qt_parquet_v1_9 """ select id,count(col_add) from complex_parquet_v1_schema_change group by id order by id desc ; """ + qt_parquet_v1_10 """ select col_add from complex_parquet_v1_schema_change where col_add -1 = col_add2 order by id; """ + + + + qt_parquet_v2_1 """ desc complex_parquet_v2_schema_change ;""" + qt_parquet_v2_2 """ select * from complex_parquet_v2_schema_change order by id; """ + qt_parquet_v2_3 """ select count(*) from complex_parquet_v2_schema_change ;""" + qt_parquet_v2_4 """ select id from complex_parquet_v2_schema_change where col_add +1 = col_add2 order by id;""" + qt_parquet_v2_5 """ select col_add from complex_parquet_v2_schema_change where col_add is not null order by col_add ; """ + qt_parquet_v2_6 """ select id from complex_parquet_v2_schema_change where col_add + 5 = id order by id; """ + qt_parquet_v2_7 """ select rename_col7 from complex_parquet_v2_schema_change order by id; """ + qt_parquet_v2_8 """ select col_add2 from complex_parquet_v2_schema_change where id >=7 order by id; """ + qt_parquet_v2_9 """ select id,count(col_add) from complex_parquet_v2_schema_change group by id order by id desc ; """ + qt_parquet_v2_10 """ select col_add from complex_parquet_v2_schema_change where col_add -1 = col_add2 order by id; """ + + + + + qt_orc_v1_1 """ desc complex_orc_v1_schema_change ;""" + qt_orc_v1_2 """ select * from complex_orc_v1_schema_change order by id; """ + qt_orc_v1_3 """ select count(*) from complex_orc_v1_schema_change ;""" + qt_orc_v1_4 """ select id from complex_orc_v1_schema_change where col_add +1 = col_add2 order by id;""" + qt_orc_v1_5 """ select col_add from complex_orc_v1_schema_change where col_add is not null order by col_add ; """ + qt_orc_v1_6 """ select id from complex_orc_v1_schema_change where col_add + 5 = id order by id; """ + qt_orc_v1_7 """ select rename_col7 from complex_orc_v1_schema_change order by id; """ + qt_orc_v1_8 """ select col_add2 from complex_orc_v1_schema_change where id >=7 order by id; """ + qt_orc_v1_9 """ select id,count(col_add) from complex_orc_v1_schema_change group by id order by id desc ; """ + qt_orc_v1_10 """ select col_add from complex_orc_v1_schema_change where col_add -1 = col_add2 order by id; """ + + + + qt_orc_v2_1 """ desc complex_orc_v2_schema_change ;""" + qt_orc_v2_2 """ select * from complex_orc_v2_schema_change order by id; """ + qt_orc_v2_3 """ select count(*) from complex_orc_v2_schema_change ;""" + qt_orc_v2_4 """ select id from complex_orc_v2_schema_change where col_add +1 = col_add2 order by id;""" + qt_orc_v2_5 """ select col_add from complex_orc_v2_schema_change where col_add is not null order by col_add ; """ + qt_orc_v2_6 """ select id from complex_orc_v2_schema_change where col_add + 5 = id order by id; """ + qt_orc_v2_7 """ select rename_col7 from complex_orc_v2_schema_change order by id; """ + qt_orc_v2_8 """ select col_add2 from complex_orc_v2_schema_change where id >=7 order by id; """ + qt_orc_v2_9 """ select id,count(col_add) from complex_orc_v2_schema_change group by id order by id desc ; """ + qt_orc_v2_10 """ select col_add from complex_orc_v2_schema_change where col_add -1 = col_add2 order by id; """ + + + + } +} +/* +before schema: + id int, + col1 array, + col2 array, + col3 array, + col4 map, + col5 map, + col6 map, + col7 struct, + col8 int, + col9 float, + col10 decimal(12,5), + col_del int + + +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col1.element type bigint; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col2.element type double; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col3.element type decimal(20,4); +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col4.value type bigint; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col5.value type double; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col6.value type decimal(20,5); +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col7.x type bigint; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col7.y type double; +ALTER TABLE complex_parquet_v2_schema_change CHANGE COLUMN col7.z type decimal(20,5); +alter table complex_parquet_v2_schema_change CHANGE COLUMN col8 col8 bigint; +alter table complex_parquet_v2_schema_change CHANGE COLUMN col9 col9 double; +alter table complex_parquet_v2_schema_change CHANGE COLUMN col10 col10 decimal(20,5); +alter table complex_parquet_v2_schema_change drop column col7.z; +alter table complex_parquet_v2_schema_change add column col7.add double; +alter table complex_parquet_v2_schema_change change column col7.add first; +alter table complex_parquet_v2_schema_change rename COLUMN col1 to rename_col1; +alter table complex_parquet_v2_schema_change rename COLUMN col2 to rename_col2; +alter table complex_parquet_v2_schema_change rename COLUMN col3 to rename_col3; +alter table complex_parquet_v2_schema_change rename COLUMN col4 to rename_col4; +alter table complex_parquet_v2_schema_change rename COLUMN col5 to rename_col5; +alter table complex_parquet_v2_schema_change rename COLUMN col6 to rename_col6; +alter table complex_parquet_v2_schema_change rename COLUMN col7 to rename_col7; +alter table complex_parquet_v2_schema_change rename COLUMN col8 to rename_col8; +alter table complex_parquet_v2_schema_change rename COLUMN col9 to rename_col9; +alter table complex_parquet_v2_schema_change rename COLUMN col10 to rename_col10; +alter table complex_parquet_v2_schema_change drop column col_del; +alter table complex_parquet_v2_schema_change CHANGE COLUMN rename_col8 first; +alter table complex_parquet_v2_schema_change CHANGE COLUMN rename_col9 after rename_col8; +alter table complex_parquet_v2_schema_change CHANGE COLUMN rename_col10 after rename_col9; +alter table complex_parquet_v2_schema_change add column col_add int; +alter table complex_parquet_v2_schema_change add column col_add2 int; + +after schema: + rename_col8 bigint + rename_col9 double + rename_col10 decimal(20,5) + id int + rename_col1 array + rename_col2 array + rename_col3 array + rename_col4 map + rename_col5 map + rename_col6 map + rename_col7 struct + col_add int + col_add2 int + +*/