// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include #include #include #include #include #include #include #include #include #include "gen_cpp/FrontendService.h" #include "gen_cpp/HeartbeatService_types.h" #include "olap/rowset/rowset_writer_context.h" #include "runtime/client_cache.h" #include "runtime/exec_env.h" #include "util/thrift_rpc_helper.h" namespace doris::vectorized::schema_util { size_t get_number_of_dimensions(const IDataType& type) { if (const auto* type_array = typeid_cast(&type)) { return type_array->get_number_of_dimensions(); } return 0; } size_t get_number_of_dimensions(const IColumn& column) { if (const auto* column_array = check_and_get_column(column)) { return column_array->get_number_of_dimensions(); } return 0; } DataTypePtr get_base_type_of_array(const DataTypePtr& type) { /// Get raw pointers to avoid extra copying of type pointers. const DataTypeArray* last_array = nullptr; const auto* current_type = type.get(); while (const auto* type_array = typeid_cast(current_type)) { current_type = type_array->get_nested_type().get(); last_array = type_array; } return last_array ? last_array->get_nested_type() : type; } Array create_empty_array_field(size_t num_dimensions) { DCHECK(num_dimensions > 0); Array array; Array* current_array = &array; for (size_t i = 1; i < num_dimensions; ++i) { current_array->push_back(Array()); current_array = ¤t_array->back().get(); } return array; } FieldType get_field_type(const IDataType* data_type) { switch (data_type->get_type_id()) { case TypeIndex::UInt8: return FieldType::OLAP_FIELD_TYPE_UNSIGNED_TINYINT; case TypeIndex::UInt16: return FieldType::OLAP_FIELD_TYPE_UNSIGNED_SMALLINT; case TypeIndex::UInt32: return FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT; case TypeIndex::UInt64: return FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT; case TypeIndex::Int8: return FieldType::OLAP_FIELD_TYPE_TINYINT; case TypeIndex::Int16: return FieldType::OLAP_FIELD_TYPE_SMALLINT; case TypeIndex::Int32: return FieldType::OLAP_FIELD_TYPE_INT; case TypeIndex::Int64: return FieldType::OLAP_FIELD_TYPE_BIGINT; case TypeIndex::Float32: return FieldType::OLAP_FIELD_TYPE_FLOAT; case TypeIndex::Float64: return FieldType::OLAP_FIELD_TYPE_DOUBLE; case TypeIndex::Decimal32: return FieldType::OLAP_FIELD_TYPE_DECIMAL; case TypeIndex::Array: return FieldType::OLAP_FIELD_TYPE_ARRAY; case TypeIndex::String: return FieldType::OLAP_FIELD_TYPE_STRING; case TypeIndex::Date: return FieldType::OLAP_FIELD_TYPE_DATE; case TypeIndex::DateTime: return FieldType::OLAP_FIELD_TYPE_DATETIME; case TypeIndex::Tuple: return FieldType::OLAP_FIELD_TYPE_STRUCT; // TODO add more types default: LOG(FATAL) << "unknow type"; return FieldType::OLAP_FIELD_TYPE_UNKNOWN; } } Status parse_object_column(ColumnObject& dest, const IColumn& src, bool need_finalize, const int* row_begin, const int* row_end) { assert(src.is_column_string()); const ColumnString* parsing_column {nullptr}; if (!src.is_nullable()) { parsing_column = reinterpret_cast(src.get_ptr().get()); } else { auto nullable_column = reinterpret_cast(src.get_ptr().get()); parsing_column = reinterpret_cast( nullable_column->get_nested_column().get_ptr().get()); } std::vector jsons; if (row_begin != nullptr) { assert(row_end); for (auto x = row_begin; x != row_end; ++x) { StringRef ref = parsing_column->get_data_at(*x); jsons.push_back(ref); } } else { for (size_t i = 0; i < parsing_column->size(); ++i) { StringRef ref = parsing_column->get_data_at(i); jsons.push_back(ref); } } // batch parse RETURN_IF_ERROR(parse_json_to_variant(dest, jsons)); if (need_finalize) { dest.finalize(); } return Status::OK(); } Status parse_object_column(Block& block, size_t position) { // parse variant column and rewrite column auto col = block.get_by_position(position).column; const std::string& col_name = block.get_by_position(position).name; if (!col->is_column_string()) { return Status::InvalidArgument("only ColumnString can be parsed to ColumnObject"); } vectorized::DataTypePtr type( std::make_shared("", col->is_nullable())); auto column_object = type->create_column(); RETURN_IF_ERROR( parse_object_column(assert_cast(column_object->assume_mutable_ref()), *col, true /*need finalize*/, nullptr, nullptr)); // replace by object block.safe_get_by_position(position).column = column_object->get_ptr(); block.safe_get_by_position(position).type = type; block.safe_get_by_position(position).name = col_name; return Status::OK(); } void flatten_object(Block& block, size_t pos, bool replace_if_duplicated) { auto column_object_ptr = assert_cast(block.get_by_position(pos).column->assume_mutable().get()); if (column_object_ptr->empty()) { block.erase(pos); return; } size_t num_rows = column_object_ptr->size(); assert(block.rows() <= num_rows); assert(column_object_ptr->is_finalized()); Columns subcolumns; DataTypes types; Names names; for (auto& subcolumn : column_object_ptr->get_subcolumns()) { subcolumns.push_back(subcolumn->data.get_finalized_column().get_ptr()); types.push_back(subcolumn->data.get_least_common_type()); names.push_back(subcolumn->path.get_path()); } block.erase(pos); for (size_t i = 0; i < subcolumns.size(); ++i) { // block may already contains this column, eg. key columns, we should ignore // or replcace the same column from object subcolumn if (block.has(names[i])) { if (replace_if_duplicated) { auto& column_type_name = block.get_by_name(names[i]); column_type_name.column = subcolumns[i]; column_type_name.type = types[i]; } continue; } block.insert(ColumnWithTypeAndName {subcolumns[i], types[i], names[i]}); } // fill default value for (auto& [column, _1, _2] : block.get_columns_with_type_and_name()) { if (column->size() < num_rows) { column->assume_mutable()->insert_many_defaults(num_rows - column->size()); } } } Status flatten_object(Block& block, bool replace_if_duplicated) { auto object_pos = std::find_if(block.begin(), block.end(), [](const ColumnWithTypeAndName& column) { return column.type->get_type_id() == TypeIndex::VARIANT; }); if (object_pos != block.end()) { flatten_object(block, object_pos - block.begin(), replace_if_duplicated); } return Status::OK(); } bool is_conversion_required_between_integers(const IDataType& lhs, const IDataType& rhs) { WhichDataType which_lhs(lhs); WhichDataType which_rhs(rhs); bool is_native_int = which_lhs.is_native_int() && which_rhs.is_native_int(); bool is_native_uint = which_lhs.is_native_uint() && which_rhs.is_native_uint(); return (is_native_int || is_native_uint) && lhs.get_size_of_value_in_memory() <= rhs.get_size_of_value_in_memory(); } bool is_conversion_required_between_integers(FieldType lhs, FieldType rhs) { // We only support signed integers for semi-structure data at present // TODO add unsigned integers if (lhs == OLAP_FIELD_TYPE_BIGINT) { return !(rhs == OLAP_FIELD_TYPE_TINYINT || rhs == OLAP_FIELD_TYPE_SMALLINT || rhs == OLAP_FIELD_TYPE_INT || rhs == OLAP_FIELD_TYPE_BIGINT); } if (lhs == OLAP_FIELD_TYPE_INT) { return !(rhs == OLAP_FIELD_TYPE_TINYINT || rhs == OLAP_FIELD_TYPE_SMALLINT || rhs == OLAP_FIELD_TYPE_INT); } if (lhs == OLAP_FIELD_TYPE_SMALLINT) { return !(rhs == OLAP_FIELD_TYPE_TINYINT || rhs == OLAP_FIELD_TYPE_SMALLINT); } if (lhs == OLAP_FIELD_TYPE_TINYINT) { return !(rhs == OLAP_FIELD_TYPE_TINYINT); } return true; } Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result) { ColumnsWithTypeAndName arguments {arg, {type->create_column_const_with_default_value(1), type, ""}}; auto function = SimpleFunctionFactory::instance().get_function("CAST", arguments, type); Block tmp_block {arguments}; // the 0 position is input argument, the 1 position is to type argument, the 2 position is result argument vectorized::ColumnNumbers argnum; argnum.emplace_back(0); argnum.emplace_back(1); size_t result_column = tmp_block.columns(); tmp_block.insert({nullptr, type, arg.name}); RETURN_IF_ERROR( function->execute(nullptr, tmp_block, argnum, result_column, arg.column->size())); *result = std::move(tmp_block.get_by_position(result_column).column); return Status::OK(); } static void get_column_def(const vectorized::DataTypePtr& data_type, const std::string& name, TColumnDef* column) { if (!name.empty()) { column->columnDesc.__set_columnName(name); } if (data_type->is_nullable()) { const auto& real_type = static_cast(*data_type); column->columnDesc.__set_isAllowNull(true); get_column_def(real_type.get_nested_type(), "", column); return; } column->columnDesc.__set_columnType(to_thrift(get_primitive_type(data_type->get_type_id()))); if (data_type->get_type_id() == TypeIndex::Array) { TColumnDef child; column->columnDesc.__set_children({}); get_column_def(assert_cast(data_type.get())->get_nested_type(), "", &child); column->columnDesc.columnLength = TabletColumn::get_field_length_by_type(column->columnDesc.columnType, 0); column->columnDesc.children.push_back(child.columnDesc); return; } if (data_type->get_type_id() == TypeIndex::Tuple) { // TODO // auto tuple_type = assert_cast(data_type.get()); // DCHECK_EQ(tuple_type->get_elements().size(), tuple_type->get_element_names().size()); // for (size_t i = 0; i < tuple_type->get_elements().size(); ++i) { // TColumnDef child; // get_column_def(tuple_type->get_element(i), tuple_type->get_element_names()[i], &child); // column->columnDesc.children.push_back(child.columnDesc); // } // return; } if (data_type->get_type_id() == TypeIndex::String) { return; } if (WhichDataType(*data_type).is_simple()) { column->columnDesc.__set_columnLength(data_type->get_size_of_value_in_memory()); return; } } // send an empty add columns rpc, the rpc response will fill with base schema info // maybe we could seperate this rpc from add columns rpc Status send_fetch_full_base_schema_view_rpc(FullBaseSchemaView* schema_view) { TAddColumnsRequest req; TAddColumnsResult res; TTabletInfo tablet_info; req.__set_table_name(schema_view->table_name); req.__set_db_name(schema_view->db_name); req.__set_table_id(schema_view->table_id); // Set empty columns req.__set_addColumns({}); auto master_addr = ExecEnv::GetInstance()->master_info()->network_address; Status rpc_st = ThriftRpcHelper::rpc( master_addr.hostname, master_addr.port, [&req, &res](FrontendServiceConnection& client) { client->addColumns(res, req); }, config::txn_commit_rpc_timeout_ms); if (!rpc_st.ok()) { return Status::InternalError("Failed to fetch schema info, encounter rpc failure"); } // TODO(lhy) handle more status code if (res.status.status_code != TStatusCode::OK) { LOG(WARNING) << "failed to fetch schema info, code:" << res.status.status_code << ", msg:" << res.status.error_msgs[0]; return Status::InvalidArgument( fmt::format("Failed to fetch schema info, {}", res.status.error_msgs[0])); } for (const auto& column : res.allColumns) { schema_view->column_name_to_column[column.column_name] = column; } schema_view->schema_version = res.schema_version; return Status::OK(); } // Do batch add columns schema change // only the base table supported Status send_add_columns_rpc(ColumnsWithTypeAndName column_type_names, FullBaseSchemaView* schema_view) { if (column_type_names.empty()) { return Status::OK(); } TAddColumnsRequest req; TAddColumnsResult res; TTabletInfo tablet_info; req.__set_table_name(schema_view->table_name); req.__set_db_name(schema_view->db_name); req.__set_table_id(schema_view->table_id); // TODO(lhy) more configurable req.__set_allow_type_conflict(true); req.__set_addColumns({}); for (const auto& column_type_name : column_type_names) { TColumnDef col; get_column_def(column_type_name.type, column_type_name.name, &col); req.addColumns.push_back(col); } auto master_addr = ExecEnv::GetInstance()->master_info()->network_address; Status rpc_st = ThriftRpcHelper::rpc( master_addr.hostname, master_addr.port, [&req, &res](FrontendServiceConnection& client) { client->addColumns(res, req); }, config::txn_commit_rpc_timeout_ms); if (!rpc_st.ok()) { return Status::InternalError("Failed to do schema change, rpc error"); } // TODO(lhy) handle more status code if (res.status.status_code != TStatusCode::OK) { LOG(WARNING) << "failed to do schema change, code:" << res.status.status_code << ", msg:" << res.status.error_msgs[0]; return Status::InvalidArgument( fmt::format("Failed to do schema change, {}", res.status.error_msgs[0])); } size_t sz = res.allColumns.size(); if (sz < column_type_names.size()) { return Status::InternalError( fmt::format("Unexpected result columns {}, expected at least {}", res.allColumns.size(), column_type_names.size())); } for (const auto& column : res.allColumns) { schema_view->column_name_to_column[column.column_name] = column; } schema_view->schema_version = res.schema_version; return Status::OK(); } template void align_block_by_name_and_type(MutableBlock* mblock, const Block* block, size_t row_cnt, ColumnInserterFn inserter) { assert(!mblock->get_names().empty()); const auto& names = mblock->get_names(); [[maybe_unused]] const auto& data_types = mblock->data_types(); size_t num_rows = mblock->rows(); for (size_t i = 0; i < mblock->columns(); ++i) { auto& dst = mblock->get_column_by_position(i); if (!block->has(names[i])) { dst->insert_many_defaults(row_cnt); } else { assert(data_types[i]->equals(*block->get_by_name(names[i]).type)); const auto& src = *(block->get_by_name(names[i]).column.get()); inserter(src, dst); } } for (const auto& [column, type, name] : *block) { // encounter a new column if (!mblock->has(name)) { auto new_column = type->create_column(); new_column->insert_many_defaults(num_rows); inserter(*column.get(), new_column); mblock->mutable_columns().push_back(std::move(new_column)); mblock->data_types().push_back(type); mblock->get_names().push_back(name); } } #ifndef NDEBUG // Check all columns rows matched num_rows = mblock->rows(); for (size_t i = 0; i < mblock->columns(); ++i) { DCHECK_EQ(mblock->mutable_columns()[i]->size(), num_rows); } #endif } void align_block_by_name_and_type(MutableBlock* mblock, const Block* block, const int* row_begin, const int* row_end) { align_block_by_name_and_type(mblock, block, row_end - row_begin, [row_begin, row_end](const IColumn& src, MutableColumnPtr& dst) { dst->insert_indices_from(src, row_begin, row_end); }); } void align_block_by_name_and_type(MutableBlock* mblock, const Block* block, size_t row_begin, size_t length) { align_block_by_name_and_type(mblock, block, length, [row_begin, length](const IColumn& src, MutableColumnPtr& dst) { dst->insert_range_from(src, row_begin, length); }); } void align_append_block_by_selector(MutableBlock* mblock, const Block* block, const IColumn::Selector& selector) { // append by selector with alignment assert(!mblock->get_names().empty()); align_block_by_name_and_type(mblock, block, selector.size(), [&selector](const IColumn& src, MutableColumnPtr& dst) { src.append_data_by_selector(dst, selector); }); } void LocalSchemaChangeRecorder::add_extended_columns(const TabletColumn& new_column, int32_t schema_version) { std::lock_guard lock(_lock); _schema_version = std::max(_schema_version, schema_version); auto it = _extended_columns.find(new_column.name()); if (it != _extended_columns.end()) { return; } _extended_columns.emplace_hint(it, new_column.name(), new_column); } bool LocalSchemaChangeRecorder::has_extended_columns() { std::lock_guard lock(_lock); return !_extended_columns.empty(); } std::map LocalSchemaChangeRecorder::copy_extended_columns() { std::lock_guard lock(_lock); return _extended_columns; } const TabletColumn& LocalSchemaChangeRecorder::column(const std::string& col_name) { std::lock_guard lock(_lock); assert(_extended_columns.find(col_name) != _extended_columns.end()); return _extended_columns[col_name]; } int32_t LocalSchemaChangeRecorder::schema_version() { std::lock_guard lock(_lock); return _schema_version; } } // namespace doris::vectorized::schema_util