diff --git a/src/observer/omt/ob_multi_tenant.cpp b/src/observer/omt/ob_multi_tenant.cpp index f01e044cfa..e847a37aa0 100644 --- a/src/observer/omt/ob_multi_tenant.cpp +++ b/src/observer/omt/ob_multi_tenant.cpp @@ -83,6 +83,7 @@ #include "logservice/leader_coordinator/ob_leader_coordinator.h" #include "storage/lob/ob_lob_manager.h" #include "share/deadlock/ob_deadlock_detector_mgr.h" +#include "storage/blocksstable/ob_shared_macro_block_manager.h" #include "storage/tx_storage/ob_tablet_gc_service.h" #include "share/ob_occam_time_guard.h" @@ -103,6 +104,7 @@ using namespace oceanbase::logservice; using namespace oceanbase::archive; using namespace oceanbase::observer; using namespace oceanbase::rootserver; +using namespace oceanbase::blocksstable; #define OB_TENANT_LOCK_BUCKET_NUM 10000L @@ -324,6 +326,7 @@ int ObMultiTenant::init(ObAddr myaddr, //MTL_BIND(ObTransAuditRecordMgr::mtl_init, ObTransAuditRecordMgr::mtl_destroy); MTL_BIND(ObTenantSqlMemoryManager::mtl_init, ObTenantSqlMemoryManager::mtl_destroy); MTL_BIND(ObPlanMonitorNodeList::mtl_init, ObPlanMonitorNodeList::mtl_destroy); + MTL_BIND2(mtl_new_default, ObSharedMacroBlockMgr::mtl_init, mtl_start_default, mtl_stop_default, mtl_wait_default, mtl_destroy_default); } if (OB_SUCC(ret)) { diff --git a/src/share/rc/ob_tenant_base.h b/src/share/rc/ob_tenant_base.h index fd88198edc..172f1ccec4 100644 --- a/src/share/rc/ob_tenant_base.h +++ b/src/share/rc/ob_tenant_base.h @@ -42,6 +42,9 @@ namespace sql { class ObDataAccessService; class ObDASIDService; } +namespace blocksstable { + class ObSharedMacroBlockMgr; +} namespace storage { struct ObTenantStorageInfo; class ObLSService; @@ -151,7 +154,7 @@ namespace detector #define MTL_MEMBERS \ MTL_LIST( \ common::ObTenantIOManager*, \ - storage::ObStorageLogger*, \ + storage::ObStorageLogger*, \ storage::ObTenantMetaMemMgr*, \ transaction::ObTransService*, \ logservice::coordinator::ObLeaderCoordinator*, \ @@ -193,6 +196,7 @@ namespace detector 
sql::ObDataAccessService*, \ sql::ObDASIDService*, \ share::schema::ObTenantSchemaService*, \ + blocksstable::ObSharedMacroBlockMgr*, \ storage::ObTenantFreezer*, \ storage::checkpoint::ObCheckPointService *, \ storage::checkpoint::ObTabletGCService *, \ diff --git a/src/storage/CMakeLists.txt b/src/storage/CMakeLists.txt index b08dc24c33..5a3b5d9352 100644 --- a/src/storage/CMakeLists.txt +++ b/src/storage/CMakeLists.txt @@ -37,6 +37,7 @@ ob_set_subtarget(ob_storage blocksstable blocksstable/ob_row_queue.cpp blocksstable/ob_row_reader.cpp blocksstable/ob_row_writer.cpp + blocksstable/ob_shared_macro_block_manager.cpp blocksstable/ob_sstable.cpp blocksstable/ob_sstable_macro_block_header.cpp blocksstable/ob_sstable_meta.cpp diff --git a/src/storage/access/ob_index_sstable_estimator.cpp b/src/storage/access/ob_index_sstable_estimator.cpp index d517707b67..2b4947211f 100644 --- a/src/storage/access/ob_index_sstable_estimator.cpp +++ b/src/storage/access/ob_index_sstable_estimator.cpp @@ -17,6 +17,7 @@ #include "storage/blocksstable/ob_storage_cache_suite.h" #include "storage/tablet/ob_tablet.h" #include "share/schema/ob_column_schema.h" +#include "storage/blocksstable/ob_shared_macro_block_manager.h" namespace oceanbase { @@ -86,7 +87,8 @@ int ObIndexBlockScanEstimator::estimate_row_count(ObPartitionEst &part_est) agg_column_schema, &context_.tablet_handle_.get_obj()->get_index_read_info(), allocator_, - context_.query_flag_))) { + context_.query_flag_, + context_.sstable_.get_macro_offset()))) { STORAGE_LOG(WARN, "Failed to init index block row scanner", K(ret), K(agg_projector), K(agg_column_schema)); } else if (OB_FAIL(context_.sstable_.get_index_tree_root(context_.tablet_handle_.get_obj()->get_index_read_info(), root_index_block_))) { STORAGE_LOG(WARN, "Failed to get index tree root", K(ret)); diff --git a/src/storage/access/ob_index_tree_prefetcher.cpp b/src/storage/access/ob_index_tree_prefetcher.cpp index 24b7212a83..ff3fcf28a7 100644 --- 
a/src/storage/access/ob_index_tree_prefetcher.cpp +++ b/src/storage/access/ob_index_tree_prefetcher.cpp @@ -296,11 +296,12 @@ int ObIndexTreePrefetcher::prefetch_block_data( bool need_submit_io = false; uint64_t tenant_id = MTL_ID(); const MacroBlockId ¯o_id = index_block_info.get_macro_id(); + const int64_t offset = index_block_info.get_block_offset(); if (is_rescan_ && tenant_id == micro_handle.tenant_id_ && macro_id == micro_handle.macro_block_id_ && - index_block_info.row_header_->get_block_offset() == micro_handle.micro_info_.offset_ && + offset == micro_handle.micro_info_.offset_ && index_block_info.row_header_->get_block_size() == micro_handle.micro_info_.size_) { LOG_DEBUG("Cur micro handle is still valid"); if (is_data) { @@ -310,8 +311,7 @@ int ObIndexTreePrefetcher::prefetch_block_data( } } else if (OB_FAIL(micro_block_handle_mgr_.get_micro_block_handle( tenant_id, - macro_id, - *index_block_info.row_header_, + index_block_info, is_data, micro_handle))) { //cache miss @@ -332,7 +332,7 @@ int ObIndexTreePrefetcher::prefetch_block_data( } } if (OB_SUCC(ret)) { - micro_handle.micro_info_.offset_ = index_block_info.get_block_offset(); + micro_handle.micro_info_.offset_ = offset; micro_handle.micro_info_.size_ = index_block_info.get_block_size(); if (need_submit_io) { ObMacroBlockHandle macro_handle; @@ -508,7 +508,8 @@ int ObIndexTreeMultiPassPrefetcher::switch_context( index_read_info_ = &index_read_info; for (int64_t level = 0; OB_SUCC(ret) && level < index_tree_height_; level++) { if (tree_handles_[level].index_scanner_.is_valid()) { - tree_handles_[level].index_scanner_.set_index_read_info(&index_read_info); + tree_handles_[level].index_scanner_.switch_context( + &index_read_info, sstable.get_macro_offset()); } else if (OB_FAIL(init_index_scanner(tree_handles_[level].index_scanner_))) { LOG_WARN("Fail to init index_scanner", K(ret), K(level)); } @@ -609,7 +610,7 @@ int ObIndexTreeMultiPassPrefetcher::prefetch() ret = OB_SUCCESS; } else { LOG_WARN("Fail 
to prefetch", K(ret)); - } + } } return ret; } @@ -1098,10 +1099,10 @@ int ObIndexTreeMultiPassPrefetcher::check_row_lock( } else if (ObStoreRowIterator::IteratorRowLockCheck == iter_type_ && !is_row_lock_checked) { const int64_t read_snapshot_version = access_ctx_->trans_version_range_.snapshot_version_; - if (!index_info.contain_uncommitted_row() + if (!index_info.contain_uncommitted_row() && index_info.get_max_merged_trans_version() <= read_snapshot_version) { ++cur_range_fetch_idx_; - row_lock_check_version_ = index_info.get_max_merged_trans_version(); + row_lock_check_version_ = index_info.get_max_merged_trans_version(); ret = OB_ITER_END; } is_row_lock_checked = true; diff --git a/src/storage/access/ob_index_tree_prefetcher.h b/src/storage/access/ob_index_tree_prefetcher.h index 0f651f7165..58c0732035 100644 --- a/src/storage/access/ob_index_tree_prefetcher.h +++ b/src/storage/access/ob_index_tree_prefetcher.h @@ -155,7 +155,8 @@ protected: agg_column_schema_, index_read_info_, *access_ctx_->stmt_allocator_, - access_ctx_->query_flag_); + access_ctx_->query_flag_, + sstable_->get_macro_offset()); } int check_bloom_filter(const ObMicroIndexInfo &index_info, ObSSTableReadHandle &read_handle); int prefetch_block_data( @@ -346,7 +347,7 @@ private: void reset() { is_prefetch_end_ = false; - is_row_lock_checked_ = false; + is_row_lock_checked_ = false; can_blockscan_ = false; read_idx_ = 0; fetch_idx_ = -1; @@ -411,7 +412,9 @@ private: const blocksstable::ObDatumRowkey &border_rowkey, const int64_t level, ObIndexTreeMultiPassPrefetcher &prefetcher); - int forward(const ObTableReadInfo &read_info, const blocksstable::ObDatumRowkey &border_rowkey); + int forward( + const ObTableReadInfo &read_info, + const blocksstable::ObDatumRowkey &border_rowkey); OB_INLINE int check_blockscan(const blocksstable::ObDatumRowkey &border_rowkey) { int ret = OB_SUCCESS; @@ -445,7 +448,7 @@ public: int32_t cur_range_prefetch_idx_; int64_t cur_micro_data_fetch_idx_; int64_t 
micro_data_prefetch_idx_; - int64_t row_lock_check_version_; + int64_t row_lock_check_version_; ObAggregatedStore *agg_row_store_; private: bool can_blockscan_; diff --git a/src/storage/access/ob_sstable_row_exister.cpp b/src/storage/access/ob_sstable_row_exister.cpp index 4697982840..32d36431a9 100644 --- a/src/storage/access/ob_sstable_row_exister.cpp +++ b/src/storage/access/ob_sstable_row_exister.cpp @@ -116,7 +116,7 @@ int ObSSTableRowExister::exist_block_row(ObSSTableReadHandle &read_handle, ObDat } else { if (!found) { store_row.row_flag_.set_flag(ObDmlFlag::DF_NOT_EXIST); - if (!access_ctx_->query_flag_.is_index_back() && access_ctx_->query_flag_.is_use_bloomfilter_cache()) { + if (!access_ctx_->query_flag_.is_index_back() && access_ctx_->query_flag_.is_use_bloomfilter_cache() && !sstable_->is_small_sstable()) { (void) OB_STORE_CACHE.get_bf_cache().inc_empty_read( MTL_ID(), iter_param_->table_id_, diff --git a/src/storage/access/ob_sstable_row_multi_exister.cpp b/src/storage/access/ob_sstable_row_multi_exister.cpp index 7c65aadd0c..1163a815d9 100644 --- a/src/storage/access/ob_sstable_row_multi_exister.cpp +++ b/src/storage/access/ob_sstable_row_multi_exister.cpp @@ -125,11 +125,11 @@ int ObSSTableRowMultiExister::exist_block_row(ObSSTableReadHandle &read_handle, } else { if (!found) { store_row.row_flag_.set_flag(ObDmlFlag::DF_NOT_EXIST); - if (!access_ctx_->query_flag_.is_index_back() && access_ctx_->query_flag_.is_use_bloomfilter_cache()) { + if (!access_ctx_->query_flag_.is_index_back() && access_ctx_->query_flag_.is_use_bloomfilter_cache() && !sstable_->is_small_sstable()) { (void) OB_STORE_CACHE.get_bf_cache().inc_empty_read( MTL_ID(), iter_param_->table_id_, - read_handle.micro_handle_->macro_block_id_, + read_handle.micro_handle_->macro_block_id_, read_handle.rowkey_->get_datum_cnt()); if (read_handle.is_bf_contain_) { ++access_ctx_->table_store_stat_.bf_empty_read_cnt_; diff --git a/src/storage/access/ob_sstable_row_whole_scanner.cpp 
b/src/storage/access/ob_sstable_row_whole_scanner.cpp index f448b7fd03..983d0d18ce 100644 --- a/src/storage/access/ob_sstable_row_whole_scanner.cpp +++ b/src/storage/access/ob_sstable_row_whole_scanner.cpp @@ -231,8 +231,8 @@ int ObSSTableRowWholeScanner::open( scan_handle.is_left_border_ = true; scan_handle.is_right_border_ = true; read_info.macro_block_id_ = macro_desc.macro_block_id_; - read_info.offset_ = 0; - read_info.size_ = OB_SERVER_BLOCK_MGR.get_macro_block_size(); + read_info.offset_ = sstable_->get_macro_offset(); + read_info.size_ = sstable_->get_macro_read_size(); read_info.io_desc_.set_category(ObIOCategory::SYS_IO); read_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_COMPACT_READ); if (OB_FAIL(ObBlockManager::async_read_block(read_info, scan_handle.macro_io_handle_))) { @@ -317,8 +317,8 @@ int ObSSTableRowWholeScanner::prefetch() } else { scan_handle.is_left_border_ = (0 == prefetch_macro_cursor_); scan_handle.is_right_border_ = false; // set right border correctly when open macro block - read_info.offset_ = 0; - read_info.size_ = OB_SERVER_BLOCK_MGR.get_macro_block_size(); + read_info.offset_ = sstable_->get_macro_offset(); + read_info.size_ = sstable_->get_macro_read_size(); read_info.io_desc_.set_category(common::ObIOCategory::SYS_IO); read_info.io_desc_.set_wait_event(common::ObWaitEventIds::DB_FILE_COMPACT_READ); if (OB_FAIL(ObBlockManager::async_read_block(read_info, scan_handle.macro_io_handle_))) { diff --git a/src/storage/backup/ob_backup_data_struct.cpp b/src/storage/backup/ob_backup_data_struct.cpp index 0e45135a7a..f46a81599a 100644 --- a/src/storage/backup/ob_backup_data_struct.cpp +++ b/src/storage/backup/ob_backup_data_struct.cpp @@ -258,18 +258,25 @@ int ObBackupDataFileTrailer::check_valid() const /* ObBackupMacroBlockId */ -ObBackupMacroBlockId::ObBackupMacroBlockId() : logic_id_(), macro_block_id_() +ObBackupMacroBlockId::ObBackupMacroBlockId() + : logic_id_(), macro_block_id_(), + nested_offset_(0), nested_size_(0) {} -bool 
ObBackupMacroBlockId::is_valid() +bool ObBackupMacroBlockId::is_valid() const { - return logic_id_.is_valid() && macro_block_id_.is_valid(); + return logic_id_.is_valid() && + macro_block_id_.is_valid() && + nested_offset_ >= 0 && + nested_size_ >= 0; } void ObBackupMacroBlockId::reset() { logic_id_.reset(); macro_block_id_.reset(); + nested_offset_ = 0; + nested_size_ = 0; } /* ObBackupPhysicalID */ diff --git a/src/storage/backup/ob_backup_data_struct.h b/src/storage/backup/ob_backup_data_struct.h index 325aef4b53..38d9f8776d 100644 --- a/src/storage/backup/ob_backup_data_struct.h +++ b/src/storage/backup/ob_backup_data_struct.h @@ -184,11 +184,13 @@ struct ObBackupDataFileTrailer { struct ObBackupMacroBlockId { ObBackupMacroBlockId(); - bool is_valid(); + bool is_valid() const; void reset(); - TO_STRING_KV(K_(logic_id), K_(macro_block_id)); + TO_STRING_KV(K_(logic_id), K_(macro_block_id), K_(nested_offset), K_(nested_size)); blocksstable::ObLogicMacroBlockId logic_id_; blocksstable::MacroBlockId macro_block_id_; + int64_t nested_offset_; + int64_t nested_size_; }; struct ObBackupMacroBlockIndex; diff --git a/src/storage/backup/ob_backup_reader.cpp b/src/storage/backup/ob_backup_reader.cpp index 2bd5394a00..28a1539c8d 100644 --- a/src/storage/backup/ob_backup_reader.cpp +++ b/src/storage/backup/ob_backup_reader.cpp @@ -148,6 +148,8 @@ int ObTabletLogicMacroIdReader::get_next_batch(common::ObIArrayreset(); const ObBackupMacroBlockId ¯o_id = macro_list_.at(idx); - const blocksstable::ObLogicMacroBlockId &logic_id = macro_id.logic_id_; - const blocksstable::MacroBlockId ¯o_block_id = macro_id.macro_block_id_; - if (OB_FAIL(readers_.at(idx)->init(logic_id, macro_block_id))) { - LOG_WARN("failed to init reader", K(ret), K(idx), K(logic_id), K(macro_block_id)); + if (OB_FAIL(readers_.at(idx)->init(macro_id))) { + LOG_WARN("failed to init reader", K(ret), K(idx), K(macro_id)); } } return ret; diff --git a/src/storage/backup/ob_backup_reader.h 
b/src/storage/backup/ob_backup_reader.h index a9bc2e32d6..e9c795287c 100644 --- a/src/storage/backup/ob_backup_reader.h +++ b/src/storage/backup/ob_backup_reader.h @@ -26,6 +26,7 @@ #include "storage/blocksstable/ob_sstable_sec_meta_iterator.h" #include "storage/meta_mem/ob_tablet_handle.h" #include "storage/ob_i_table.h" +#include "storage/blocksstable/ob_shared_macro_block_manager.h" #include "storage/blocksstable/ob_logic_macro_id.h" namespace oceanbase { @@ -141,7 +142,7 @@ class ObIMacroBlockBackupReader { public: ObIMacroBlockBackupReader(); virtual ~ObIMacroBlockBackupReader(); - virtual int init(const blocksstable::ObLogicMacroBlockId &logic_id, const blocksstable::MacroBlockId ¯o_id) = 0; + virtual int init(const ObBackupMacroBlockId ¯o_id) = 0; virtual int get_macro_block_data( blocksstable::ObBufferReader &buffer_reader, blocksstable::ObLogicMacroBlockId &logic_id) = 0; virtual void reset() = 0; @@ -151,7 +152,7 @@ public: protected: bool is_inited_; blocksstable::ObLogicMacroBlockId logic_id_; - blocksstable::MacroBlockId macro_block_id_; + blocksstable::ObBlockInfo block_info_; DISALLOW_COPY_AND_ASSIGN(ObIMacroBlockBackupReader); }; @@ -159,7 +160,7 @@ class ObMacroBlockBackupReader : public ObIMacroBlockBackupReader { public: ObMacroBlockBackupReader(); virtual ~ObMacroBlockBackupReader(); - int init(const blocksstable::ObLogicMacroBlockId &logic_id, const blocksstable::MacroBlockId ¯o_block_id); + int init(const ObBackupMacroBlockId ¯o_id); virtual int get_macro_block_data( blocksstable::ObBufferReader &buffer_reader, blocksstable::ObLogicMacroBlockId &logic_id) override; virtual void reset() override; @@ -167,7 +168,7 @@ public: { return LOCAL_MACRO_BLOCK_READER; } - TO_STRING_KV(K_(logic_id), K_(macro_block_id)); + TO_STRING_KV(K_(logic_id), K_(block_info)); private: int process_(); diff --git a/src/storage/backup/ob_backup_task.cpp b/src/storage/backup/ob_backup_task.cpp index ebd8c86c0d..13fe6b9e36 100644 --- 
a/src/storage/backup/ob_backup_task.cpp +++ b/src/storage/backup/ob_backup_task.cpp @@ -2909,6 +2909,8 @@ int ObLSBackupDataTask::get_macro_block_id_list_(common::ObIArray(item_type_value); } @@ -1255,6 +1284,8 @@ DEFINE_GET_SERIALIZE_SIZE(ObBackupProviderItem) size += macro_block_id_.get_serialize_size(); size += table_key_.get_serialize_size(); size += tablet_id_.get_serialize_size(); + size += serialization::encoded_length_vi64(nested_offset_); + size += serialization::encoded_length_vi64(nested_size_); return size; } @@ -1810,8 +1841,7 @@ int ObBackupTabletProvider::add_macro_block_id_item_list_(const common::ObTablet LOG_WARN("failed to check macro block need skip", K(ret), K(macro_id)); } else if (need_skip) { // do nothing - } else if (OB_FAIL(item.set( - PROVIDER_ITEM_MACRO_ID, macro_id.logic_id_, macro_id.macro_block_id_, table_key, tablet_id))) { + } else if (OB_FAIL(item.set(PROVIDER_ITEM_MACRO_ID, macro_id, table_key, tablet_id))) { LOG_WARN("failed to set item", K(ret), K(macro_id), K(table_key), K(tablet_id)); } else if (!item.is_valid()) { ret = OB_INVALID_DATA; @@ -1832,7 +1862,10 @@ int ObBackupTabletProvider::add_sstable_item_(const common::ObTabletID &tablet_i ObLogicMacroBlockId fake_logic_id; MacroBlockId fake_macro_block_id; ObITable::TableKey fake_table_key; - if (OB_FAIL(item.set(PROVIDER_ITEM_SSTABLE_META, fake_logic_id, fake_macro_block_id, fake_table_key, tablet_id))) { + ObBackupMacroBlockId macro_id; + macro_id.macro_block_id_ = fake_macro_block_id; + macro_id.logic_id_ = fake_logic_id; + if (OB_FAIL(item.set(PROVIDER_ITEM_SSTABLE_META, macro_id, fake_table_key, tablet_id))) { LOG_WARN("failed to set item", K(ret), K(tablet_id)); } else if (!item.is_valid()) { ret = OB_INVALID_DATA; @@ -1852,7 +1885,10 @@ int ObBackupTabletProvider::add_tablet_item_(const common::ObTabletID &tablet_id ObLogicMacroBlockId fake_logic_id; MacroBlockId fake_macro_block_id; ObITable::TableKey fake_table_key; - if (OB_FAIL(item.set(PROVIDER_ITEM_TABLET_META, 
fake_logic_id, fake_macro_block_id, fake_table_key, tablet_id))) { + ObBackupMacroBlockId macro_id; + macro_id.macro_block_id_ = fake_macro_block_id; + macro_id.logic_id_ = fake_logic_id; + if (OB_FAIL(item.set(PROVIDER_ITEM_TABLET_META, macro_id, fake_table_key, tablet_id))) { LOG_WARN("failed to set item", K(ret), K(fake_table_key), K(tablet_id)); } else if (!item.is_valid()) { ret = OB_INVALID_DATA; diff --git a/src/storage/backup/ob_backup_utils.h b/src/storage/backup/ob_backup_utils.h index 24772b4416..23160b2459 100644 --- a/src/storage/backup/ob_backup_utils.h +++ b/src/storage/backup/ob_backup_utils.h @@ -178,9 +178,8 @@ class ObBackupProviderItem { public: ObBackupProviderItem(); virtual ~ObBackupProviderItem(); - int set(const ObBackupProviderItemType &item_type, const blocksstable::ObLogicMacroBlockId &logic_id, - const blocksstable::MacroBlockId ¯o_block_id, const storage::ObITable::TableKey &table_key, - const common::ObTabletID &tablet_id); + int set(const ObBackupProviderItemType &item_type, const ObBackupMacroBlockId &backup_macro_id, + const storage::ObITable::TableKey &table_key, const common::ObTabletID &tablet_id); bool operator==(const ObBackupProviderItem &other) const; bool operator!=(const ObBackupProviderItem &other) const; ObBackupProviderItemType get_item_type() const; @@ -188,11 +187,13 @@ public: blocksstable::MacroBlockId get_macro_block_id() const; const storage::ObITable::TableKey &get_table_key() const; common::ObTabletID get_tablet_id() const; + int64_t get_nested_offset() const; + int64_t get_nested_size() const; int64_t get_deep_copy_size() const; int deep_copy(const ObBackupProviderItem &src, char *buf, int64_t len, int64_t &pos); bool is_valid() const; void reset(); - TO_STRING_KV(K_(item_type), K_(logic_id), K_(table_key), K_(tablet_id)); + TO_STRING_KV(K_(item_type), K_(logic_id), K_(table_key), K_(tablet_id), K_(nested_offset), K_(nested_size)); NEED_SERIALIZE_AND_DESERIALIZE; private: @@ -201,6 +202,8 @@ private: 
blocksstable::MacroBlockId macro_block_id_; storage::ObITable::TableKey table_key_; common::ObTabletID tablet_id_; // logic_id_.tablet_id_ may not equal to tablet_id_ + int64_t nested_offset_; + int64_t nested_size_; }; class ObBackupProviderItemCompare { diff --git a/src/storage/blocksstable/ob_block_manager.cpp b/src/storage/blocksstable/ob_block_manager.cpp index d8f586437d..8a997fe834 100644 --- a/src/storage/blocksstable/ob_block_manager.cpp +++ b/src/storage/blocksstable/ob_block_manager.cpp @@ -162,6 +162,7 @@ int ObBlockManager::init( } else if (OB_ISNULL(io_device) || OB_UNLIKELY(block_size < ObServerSuperBlockHeader::OB_MAX_SUPER_BLOCK_SIZE)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument, ", K(ret), KP(io_device), K(block_size)); + } else if (FALSE_IT(timer_.set_run_wrapper(MTL_CTX()))) { } else if (OB_FAIL(timer_.init("BlkMgr"))) { LOG_WARN("fail to init timer", K(ret)); } else if (OB_FAIL(bucket_lock_.init(DEFAULT_LOCK_BUCKET_COUNT, ObLatchIds::BLOCK_MANAGER_LOCK))) { @@ -173,7 +174,6 @@ int ObBlockManager::init( } else if (OB_FAIL(super_block_buf_holder_.init(ObServerSuperBlockHeader::OB_MAX_SUPER_BLOCK_SIZE))) { LOG_WARN("fail to init super block buffer holder, ", K(ret)); } else { - timer_.set_run_wrapper(MTL_CTX()); MEMSET(used_macro_cnt_, 0, sizeof(used_macro_cnt_)); mark_cost_time_ = 0; sweep_cost_time_= 0; @@ -505,6 +505,31 @@ int ObBlockManager::get_macro_block_info(const MacroBlockId ¯o_id, return ret; } +int ObBlockManager::check_macro_block_free(const MacroBlockId ¯o_id, bool &is_free) const +{ + int ret = OB_SUCCESS; + is_free = false; + BlockInfo block_info; + + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("not init", K(ret)); + } else if (OB_UNLIKELY(!macro_id.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument, ", K(ret), K(macro_id)); + } else if (OB_FAIL(block_map_.get(macro_id, block_info))) { + if (OB_ENTRY_NOT_EXIST != ret) { + LOG_WARN("fail to get macro id, ", K(ret), K(macro_id)); + } else { + 
is_free = true; + ret = OB_SUCCESS; + } + } else { + is_free = !(block_info.mem_ref_cnt_ > 0 || block_info.disk_ref_cnt_ > 0 ); + } + return ret; +} + int ObBlockManager::get_bad_block_infos(common::ObIArray &bad_block_infos) { int ret = OB_SUCCESS; diff --git a/src/storage/blocksstable/ob_block_manager.h b/src/storage/blocksstable/ob_block_manager.h index b1e81a0cbd..10213c875b 100644 --- a/src/storage/blocksstable/ob_block_manager.h +++ b/src/storage/blocksstable/ob_block_manager.h @@ -197,7 +197,7 @@ public: int64_t get_free_macro_block_count() const; int64_t get_used_macro_block_count() const; - int get_macro_block_info(const MacroBlockId ¯o_id, ObMacroBlockInfo ¯o_block_info) const; + int check_macro_block_free(const MacroBlockId ¯o_id, bool &is_free) const; int get_bad_block_infos(common::ObIArray &bad_block_infos); int report_bad_block( const MacroBlockId ¯o_block_id, @@ -324,6 +324,7 @@ private: }; private: + int get_macro_block_info(const MacroBlockId ¯o_id, ObMacroBlockInfo ¯o_block_info) const; bool is_bad_block(const MacroBlockId ¯o_block_id); void reset_mark_status(); diff --git a/src/storage/blocksstable/ob_bloom_filter_cache.cpp b/src/storage/blocksstable/ob_bloom_filter_cache.cpp index e847578268..887c252cb9 100644 --- a/src/storage/blocksstable/ob_bloom_filter_cache.cpp +++ b/src/storage/blocksstable/ob_bloom_filter_cache.cpp @@ -659,18 +659,18 @@ int ObBloomFilterCache::get_sstable_bloom_filter(const uint64_t tenant_id, int ObBloomFilterCache::inc_empty_read( const uint64_t tenant_id, const uint64_t table_id, - const MacroBlockId ¯o_block_id, + const MacroBlockId ¯o_id, const int64_t empty_read_prefix) { int ret = OB_SUCCESS; if (OB_UNLIKELY(OB_INVALID_TENANT_ID == tenant_id || empty_read_prefix <= 0)) { ret = OB_INVALID_ARGUMENT; STORAGE_LOG(WARN, "Invalid argument, ", - K(ret), K(tenant_id), K(macro_block_id), K(empty_read_prefix)); + K(ret), K(tenant_id), K(macro_id), K(empty_read_prefix)); } else if (0 == bf_cache_miss_count_threshold_) { // 
bf cache is disabled, do nothing } else { - const ObBloomFilterCacheKey bfc_key(tenant_id, macro_block_id, empty_read_prefix); + const ObBloomFilterCacheKey bfc_key(tenant_id, macro_id, empty_read_prefix); uint64_t key_hash = bfc_key.hash(); uint64_t cur_cnt = 1; ObEmptyReadCell *cell = nullptr; @@ -687,7 +687,7 @@ int ObBloomFilterCache::inc_empty_read( } else if (cur_cnt > bf_cache_miss_count_threshold_ && (!cell->is_building())) { if (cell->check_timeout()) { } else if (OB_FAIL(MTL(storage::ObTenantTabletScheduler *)->schedule_build_bloomfilter( - table_id, macro_block_id, empty_read_prefix))) { + table_id, macro_id, empty_read_prefix))) { STORAGE_LOG(WARN, "Fail to schedule build bloom filter, ", K(ret), K(bfc_key), K(cur_cnt), K_(bf_cache_miss_count_threshold)); } else { @@ -695,7 +695,7 @@ int ObBloomFilterCache::inc_empty_read( cell->build_time_ = ObTimeUtility::current_time(); } } - STORAGE_LOG(DEBUG, "inc_empty_read", K(tenant_id), K(table_id), K(macro_block_id), + STORAGE_LOG(DEBUG, "inc_empty_read", K(tenant_id), K(table_id), K(macro_id), K(cur_cnt), K(bf_cache_miss_count_threshold_)); } return ret; diff --git a/src/storage/blocksstable/ob_bloom_filter_cache.h b/src/storage/blocksstable/ob_bloom_filter_cache.h index f841bf21ff..858b0fbf74 100644 --- a/src/storage/blocksstable/ob_bloom_filter_cache.h +++ b/src/storage/blocksstable/ob_bloom_filter_cache.h @@ -232,7 +232,7 @@ public: int inc_empty_read( const uint64_t tenant_id, const uint64_t table_id, - const MacroBlockId ¯o_block_id, + const MacroBlockId ¯o_id, const int64_t empty_read_prefix); int get_sstable_bloom_filter( const uint64_t tenant_id, diff --git a/src/storage/blocksstable/ob_imicro_block_writer.h b/src/storage/blocksstable/ob_imicro_block_writer.h index 644f3fa025..9be2344f95 100644 --- a/src/storage/blocksstable/ob_imicro_block_writer.h +++ b/src/storage/blocksstable/ob_imicro_block_writer.h @@ -49,6 +49,7 @@ struct ObMicroBlockDesc ObMicroBlockDesc() { reset(); } bool is_valid() const; 
void reset(); + int64_t get_block_size() const { return buf_size_ + header_->header_size_; } TO_STRING_KV( K_(last_rowkey), diff --git a/src/storage/blocksstable/ob_index_block_builder.cpp b/src/storage/blocksstable/ob_index_block_builder.cpp index 37a0b99099..b4b0b0c8c3 100644 --- a/src/storage/blocksstable/ob_index_block_builder.cpp +++ b/src/storage/blocksstable/ob_index_block_builder.cpp @@ -20,6 +20,7 @@ #include "share/rc/ob_tenant_base.h" #include "share/ob_encryption_util.h" #include "storage/ob_storage_struct.h" +#include "storage/blocksstable/ob_shared_macro_block_manager.h" namespace oceanbase { @@ -113,6 +114,7 @@ void ObIndexTreeRootBlockDesc::set_empty() buf_ = nullptr; row_type_ = FLAT_ROW_STORE; height_ = 0; + is_meta_root_ = false; } void ObIndexTreeInfo::set_empty() @@ -143,7 +145,9 @@ ObSSTableMergeRes::ObSSTableMergeRes() data_default_column_rows_cnt_(), compressor_type_(ObCompressorType::INVALID_COMPRESSOR), encrypt_id_(0), - master_key_id_(0) + master_key_id_(0), + nested_offset_(0), + nested_size_(0) { MEMSET(encrypt_key_, 0, share::OB_MAX_TABLESPACE_ENCRYPT_KEY_LENGTH); } @@ -172,6 +176,8 @@ void ObSSTableMergeRes::reset() } other_block_ids_.destroy(); micro_block_cnt_ = 0; + nested_size_ = 0; + nested_offset_ = 0; } bool ObSSTableMergeRes::is_valid() const @@ -181,7 +187,9 @@ bool ObSSTableMergeRes::is_valid() const && index_blocks_cnt_ >= 0 && data_blocks_cnt_ >= 0 && micro_block_cnt_ >= 0 - && data_column_cnt_ > 0; + && data_column_cnt_ > 0 + && nested_offset_ >= 0 + && nested_size_ >= 0; } int ObSSTableMergeRes::assign(const ObSSTableMergeRes &src) @@ -209,6 +217,8 @@ int ObSSTableMergeRes::assign(const ObSSTableMergeRes &src) compressor_type_ = src.compressor_type_; encrypt_id_ = src.encrypt_id_; master_key_id_ = src.master_key_id_; + nested_size_ = src.nested_size_; + nested_offset_ = src.nested_offset_; MEMCPY(encrypt_key_, src.encrypt_key_, sizeof(encrypt_key_)); if (OB_FAIL(data_block_ids_.reserve(src.data_block_ids_.count()))) { @@ 
-363,12 +373,18 @@ ObSSTableIndexBuilder::ObSSTableIndexBuilder() callback_(nullptr), roots_(), res_(), + optimization_mode_(ENABLE), is_closed_(false), is_inited_(false) { } ObSSTableIndexBuilder::~ObSSTableIndexBuilder() +{ + reset(); +} + +void ObSSTableIndexBuilder::reset() { index_store_desc_.reset(); container_store_desc_.reset(); @@ -388,12 +404,14 @@ ObSSTableIndexBuilder::~ObSSTableIndexBuilder() data_builder_.reset(); macro_writer_.reset(); callback_ = nullptr; + index_write_ctxs_.reset(); roots_.reset(); index_row_.reset(); res_.reset(); allocator_.reset(); orig_allocator_.reset(); self_allocator_.reset(); + optimization_mode_ = ENABLE; is_closed_ = false; is_inited_ = false; } @@ -411,7 +429,8 @@ bool ObSSTableIndexBuilder::check_index_desc(const ObDataStoreDesc &index_desc) return ret; } int ObSSTableIndexBuilder::init(const ObDataStoreDesc &index_desc, - ObIMacroBlockFlushCallback *callback) + ObIMacroBlockFlushCallback *callback, + ObSpaceOptimizationMode mode) { int ret = OB_SUCCESS; if (OB_UNLIKELY(is_inited_)) { @@ -435,6 +454,7 @@ int ObSSTableIndexBuilder::init(const ObDataStoreDesc &index_desc, } else { index_store_desc_.sstable_index_builder_ = this; callback_ = callback; + optimization_mode_ = mode; is_inited_ = true; } STORAGE_LOG(DEBUG, "init sstable index builder", K(ret), K(index_desc), K_(index_store_desc)); @@ -711,7 +731,7 @@ int ObSSTableIndexBuilder::build_meta_tree(ObSSTableMergeRes &res) } } if (OB_FAIL(ret)) { - } else if (OB_FAIL(builder.close(self_allocator_, res.data_root_desc_))) { + } else if (OB_FAIL(builder.close(roots_, res.data_root_desc_))) { STORAGE_LOG(WARN, "fail to close index tree of meta", K(ret)); } else if (OB_FAIL(macro_writer_.close())) { STORAGE_LOG(WARN, "fail to close macro block writer", K(ret)); @@ -772,7 +792,7 @@ int ObSSTableIndexBuilder::generate_macro_blocks_info(ObSSTableMergeRes &res) ObMacroBlocksWriteCtx *write_ctx = index_write_ctxs_.at(i); if (OB_ISNULL(write_ctx)) { ret = OB_ERR_SYS; - 
STORAGE_LOG(WARN, "invalid null index write ctx", K(ret));
+        STORAGE_LOG(WARN, "invalid null index write ctx", K(ret), K(index_write_ctxs_), K(i));
       } else if (OB_FAIL(write_ctx->get_macro_id_array(res.other_block_ids_))) {
         STORAGE_LOG(WARN, "fail to get macro ids of index blocks", K(ret));
       } else {
@@ -840,6 +860,34 @@ int ObSSTableIndexBuilder::close(const int64_t column_cnt, ObSSTableMergeRes &re
     STORAGE_LOG(DEBUG, "sstable has no data", K(ret));
   } else if (OB_FAIL(sort_roots())) {
     STORAGE_LOG(WARN, "fail to sort roots", K(ret));
+  } else { // TODO GET_MIN_CLUSTER_VERSION() >= CLUSTER_VERSION_4_0_0_0
+    const bool is_single_block = check_single_block();
+    if (is_single_block) {
+      switch (optimization_mode_) {
+        case ENABLE:
+          if (OB_FAIL(check_and_rewrite_sstable(res))) {
+            STORAGE_LOG(WARN, "fail to check and rewrite small sstable", K(ret));
+          }
+          break;
+        case DISABLE:
+          res.nested_offset_ = 0;
+          res.nested_size_ = OB_DEFAULT_MACRO_BLOCK_SIZE;
+          break;
+        case AUTO:
+          if (OB_FAIL(check_and_rewrite_sstable_without_size(res))) {
+            STORAGE_LOG(WARN, "fail to check and rewrite small sstable", K(ret));
+          }
+          break;
+        default:
+          ret = OB_ERR_UNEXPECTED;
+          STORAGE_LOG(WARN, "the optimization mode is invalid", K(ret), K(optimization_mode_));
+          break;
+      }
+    }
+  }
+
+  if (OB_FAIL(ret) || roots_.empty() || is_closed_) {
+    // do nothing
   } else if (OB_FAIL(merge_index_tree(res))) {
     STORAGE_LOG(WARN, "fail to merge index tree", K(ret));
   } else if (OB_FAIL(build_meta_tree(res))) {
@@ -881,6 +929,181 @@ int ObSSTableIndexBuilder::close(const int64_t column_cnt, ObSSTableMergeRes &re
   return ret;
 }
 
+// Chooses how a single-block sstable is rewritten, based on the macro size recorded on roots_:
+//   - aligned size >= SMALL_SSTABLE_THRESHOLD: no rewrite, the nested range is a whole macro block;
+//   - recorded size == 0: defer to check_and_rewrite_sstable_without_size();
+//   - otherwise: rewrite_small_sstable().
+// @param [out] res  receives nested_offset_ / nested_size_ (directly or through the callee).
+// @return OB_SUCCESS, or the error code of the chosen rewrite path.
+int ObSSTableIndexBuilder::check_and_rewrite_sstable(ObSSTableMergeRes &res)
+{
+  int ret = OB_SUCCESS;
+  int64_t macro_size = 0;
+  for (int64_t i = 0; i < roots_.count(); i++) {
+    // NOTE(review): plain assignment, so only the last root's size survives the loop — confirm.
+    macro_size = roots_[i]->last_macro_size_;
+  }
+  const int64_t align_macro_size = upper_align(macro_size, DIO_READ_ALIGN_SIZE);
+
+  if (align_macro_size >= SMALL_SSTABLE_THRESHOLD) { // skip rewrite
+    res.nested_offset_ = 0;
+    res.nested_size_ = OB_DEFAULT_MACRO_BLOCK_SIZE;
+  } else if (0 == macro_size) {
+    if (OB_FAIL(check_and_rewrite_sstable_without_size(res))) {
+      STORAGE_LOG(WARN, "fail to check and rewrite small sstable without macro size", K(ret), K(macro_size));
+    }
+  } else { // align_macro_size < SMALL_SSTABLE_THRESHOLD && 0 != macro_size
+    if (OB_FAIL(rewrite_small_sstable(res))) {
+      STORAGE_LOG(WARN, "fail to rewrite small sstable with given macro size", K(ret));
+    }
+  }
+  return ret;
+}
+
+// Reads the DIO-aligned first last_macro_size_ bytes of the macro block named by the first
+// macro meta of roots_[0] and writes them through the tenant ObSharedMacroBlockMgr.
+// On success that macro meta's macro_id_ and res.nested_offset_ / res.nested_size_ are
+// taken from the block info filled in by write_block().
+int ObSSTableIndexBuilder::rewrite_small_sstable(ObSSTableMergeRes &res)
+{
+  int ret = OB_SUCCESS;
+  ObBlockInfo block_info;
+  ObMacroBlockHandle read_handle;
+  const ObDataMacroBlockMeta &macro_meta = *(roots_[0]->macro_metas_->at(0));
+  ObMacroBlockReadInfo read_info;
+  read_info.macro_block_id_ = macro_meta.val_.macro_id_;
+  read_info.offset_ = 0;
+  read_info.size_ = upper_align(roots_[0]->last_macro_size_, DIO_READ_ALIGN_SIZE);
+  read_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_COMPACT_READ);
+  read_info.io_desc_.set_category(ObIOCategory::SYS_IO);
+  const int64_t io_timeout_ms = std::max(GCONF._data_storage_io_timeout / 1000, DEFAULT_IO_WAIT_TIME_MS);
+
+  if (OB_FAIL(ObBlockManager::async_read_block(read_info, read_handle))) {
+    STORAGE_LOG(WARN, "fail to async read macro block", K(ret), K(read_info), K(macro_meta), K(roots_[0]->last_macro_size_));
+  } else if (OB_FAIL(read_handle.wait(io_timeout_ms))) {
+    STORAGE_LOG(WARN, "fail to wait io finish", K(ret), K(io_timeout_ms));
+  } else {
+    ObSharedMacroBlockMgr *shared_block_mgr = MTL(ObSharedMacroBlockMgr*);
+    if (OB_FAIL(shared_block_mgr->write_block(
+        read_handle.get_buffer(), read_handle.get_data_size(), block_info, *(roots_[0]->data_write_ctx_)))) {
+      STORAGE_LOG(WARN, "fail to write small sstable through shared_block_mgr", K(ret));
+    } else if (OB_UNLIKELY(!block_info.is_valid())) {
+      ret = OB_ERR_UNEXPECTED;
+      STORAGE_LOG(WARN, "successfully rewrite small sstable, but block info is invalid", K(ret), K(block_info));
+    } else {
+      roots_[0]->macro_metas_->at(0)->val_.macro_id_ = block_info.macro_id_;
+      res.nested_offset_ = block_info.nested_offset_;
+      res.nested_size_ = block_info.nested_size_;
+    }
+  }
+  return ret;
+}
+
+// Variant used when no macro size is at hand: do_check_and_rewrite_sstable() decides from the
+// macro header it reads back. If the block was rewritten, the macro meta and the nested range
+// follow the returned block info; otherwise the nested range is a whole default macro block.
+int ObSSTableIndexBuilder::check_and_rewrite_sstable_without_size(ObSSTableMergeRes &res)
+{
+  int ret = OB_SUCCESS;
+  ObBlockInfo block_info;
+
+  if (OB_FAIL(do_check_and_rewrite_sstable(block_info))) {
+    STORAGE_LOG(WARN, "fail to check macro block size and rewrite", K(ret));
+  } else if (block_info.is_small_sstable()) {
+    roots_[0]->macro_metas_->at(0)->val_.macro_id_ = block_info.macro_id_;
+    res.nested_offset_ = block_info.nested_offset_;
+    res.nested_size_ = block_info.nested_size_;
+  }
+
+  if (OB_SUCC(ret) && !block_info.is_small_sstable()) {
+    res.nested_offset_ = 0;
+    res.nested_size_ = OB_DEFAULT_MACRO_BLOCK_SIZE;
+  }
+  return ret;
+}
+
+// Reads back the whole macro block named by the first macro meta of roots_[0], parses its
+// header, and rewrites it through ObSharedMacroBlockMgr when the aligned end of the meta block
+// (meta_block_offset_ + meta_block_size_) is below SMALL_SSTABLE_THRESHOLD.
+// @param [out] block_info  written only by write_block(); untouched when no rewrite happens.
+int ObSSTableIndexBuilder::do_check_and_rewrite_sstable(ObBlockInfo &block_info)
+{
+  int ret = OB_SUCCESS;
+  ObMacroBlockHandle read_handle;
+  const ObDataMacroBlockMeta &macro_meta = *(roots_[0]->macro_metas_->at(0));
+  ObMacroBlockReadInfo read_info;
+  read_info.macro_block_id_ = macro_meta.val_.macro_id_;
+  read_info.offset_ = 0;
+  read_info.size_ = OB_SERVER_BLOCK_MGR.get_macro_block_size();
+  read_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_COMPACT_READ);
+  read_info.io_desc_.set_category(ObIOCategory::SYS_IO);
+  const int64_t io_timeout_ms = std::max(GCONF._data_storage_io_timeout / 1000, DEFAULT_IO_WAIT_TIME_MS);
+  ObSSTableMacroBlockHeader macro_header;
+
+  if (OB_FAIL(ObBlockManager::async_read_block(read_info, read_handle))) {
+    STORAGE_LOG(WARN, "fail to async read macro block", K(ret), K(read_info), K(macro_meta), K(roots_[0]->last_macro_size_));
+  } else if (OB_FAIL(read_handle.wait(io_timeout_ms))) {
+    STORAGE_LOG(WARN, "fail to wait io finish", K(ret), K(io_timeout_ms));
+  } else if (OB_FAIL(parse_macro_header(read_handle.get_buffer(), read_handle.get_data_size(), macro_header))) {
+    STORAGE_LOG(WARN, "fail to parse macro header", K(ret));
+  } else {
+    const int64_t align_size = upper_align(
+        macro_header.fixed_header_.meta_block_offset_ + macro_header.fixed_header_.meta_block_size_,
+        DIO_READ_ALIGN_SIZE);
+    if (align_size < SMALL_SSTABLE_THRESHOLD) { // need to be rewritten
+      ObSharedMacroBlockMgr *shared_block_mgr = MTL(ObSharedMacroBlockMgr*);
+      if (OB_FAIL(shared_block_mgr->write_block(read_handle.get_buffer(), align_size, block_info, *(roots_[0]->data_write_ctx_)))) {
+        STORAGE_LOG(WARN, "fail to write small sstable through shared_block_mgr", K(ret));
+      } else if (OB_UNLIKELY(!block_info.is_valid())) {
+        ret = OB_ERR_UNEXPECTED;
+        STORAGE_LOG(WARN, "successfully rewrite small sstable, but block info is invalid", K(ret), K(block_info));
+      }
+    }
+  }
+
+  return ret;
+}
+
+// Deserializes the common header and then the sstable macro block header from buf.
+// @param [in]  buf           buffer holding the macro block; must not be null
+// @param [in]  buf_size      size of buf; must be positive
+// @param [out] macro_header  parsed header, validated with is_valid()
+// @return OB_INVALID_ARGUMENT on a bad buffer, the deserialize error code, or
+//         OB_INVALID_DATA when the parsed header is invalid.
+int ObSSTableIndexBuilder::parse_macro_header(
+    const char *buf,
+    const int64_t buf_size,
+    ObSSTableMacroBlockHeader &macro_header)
+{
+  int ret = OB_SUCCESS;
+  int64_t pos = 0;
+  ObMacroBlockCommonHeader common_header;
+
+  if (OB_UNLIKELY(buf_size <= 0) || OB_ISNULL(buf)) {
+    ret = OB_INVALID_ARGUMENT;
+    STORAGE_LOG(WARN, "the argument is invalid", K(ret), K(buf_size), KP(buf));
+  } else if (OB_FAIL(common_header.deserialize(buf, buf_size, pos))) {
+    // OB_FAIL records the error in ret; a bare call logged the failure but still returned OB_SUCCESS
+    STORAGE_LOG(WARN, "fail to deserialize common header", K(ret), K(buf_size), KP(buf), K(pos));
+  } else if (OB_FAIL(macro_header.deserialize(buf, buf_size, pos))) {
+    STORAGE_LOG(WARN, "fail to deserialize macro header", K(ret), KP(buf), K(buf_size), K(pos));
+  } else if (OB_UNLIKELY(!macro_header.is_valid())) {
+    ret = OB_INVALID_DATA;
+    STORAGE_LOG(WARN, "invalid macro header", K(ret), K(macro_header));
+  }
+  return ret;
+}
+
+// Returns true when the macro metas of all roots add up to exactly one macro block.
+bool ObSSTableIndexBuilder::check_single_block()
+{
+  int64_t cnt = 0;
+  for (int64_t i = 0; i < roots_.count(); i++) {
+    cnt += roots_[i]->macro_metas_->count();
+  }
+  return 1 == cnt;
+}
+
 //===================== ObBaseIndexBlockBuilder(public) ================
ObBaseIndexBlockBuilder::ObBaseIndexBlockBuilder() :is_inited_(false), @@ -1064,7 +1261,7 @@ int ObBaseIndexBlockBuilder::close(ObIAllocator &allocator, ObIndexTreeInfo &tre } else if (OB_FAIL(micro_writer->build_micro_block_desc(micro_block_desc))) { STORAGE_LOG(WARN, "fail to build root block", K(ret)); } else if (FALSE_IT(micro_block_desc.last_rowkey_ = root_builder->last_rowkey_)) { - } else if (OB_UNLIKELY(micro_block_desc.buf_size_ >= ObMetaDiskAddr::ROOT_BLOCK_SIZE_LIMIT)) { + } else if (OB_UNLIKELY(micro_block_desc.get_block_size() >= ObMetaDiskAddr::ROOT_BLOCK_SIZE_LIMIT)) { if (OB_FAIL(macro_writer_->append_index_micro_block(micro_block_desc))) { micro_writer->dump_diagnose_info(); // ignore dump error STORAGE_LOG(WARN, "fail to append root block", K(ret), K(micro_block_desc)); @@ -1094,7 +1291,9 @@ int ObBaseIndexBlockBuilder::close(ObIAllocator &allocator, ObIndexTreeInfo &tre } } } - if (OB_SUCC(ret)) { + if (OB_FAIL(ret) && nullptr != desc.buf_) { + allocator.free(desc.buf_); + } else if (OB_SUCC(ret)) { tree_info.row_count_ = root_builder->row_count_; tree_info.max_merged_trans_version_ = root_builder->max_merged_trans_version_; tree_info.contain_uncommitted_row_ = root_builder->contain_uncommitted_row_; @@ -1195,7 +1394,6 @@ void ObBaseIndexBlockBuilder::block_to_row_desc( row_desc.block_size_ = micro_block_desc.buf_size_ + micro_block_desc.header_->header_size_; row_desc.row_count_ = micro_block_desc.row_count_; row_desc.row_count_delta_ = micro_block_desc.row_count_delta_; - row_desc.block_size_ = micro_block_desc.buf_size_ + micro_block_desc.header_->header_size_; row_desc.is_deleted_ = micro_block_desc.can_mark_deletion_; row_desc.max_merged_trans_version_ = micro_block_desc.max_merged_trans_version_; row_desc.contain_uncommitted_row_ = micro_block_desc.contain_uncommitted_row_; @@ -1345,7 +1543,9 @@ ObDataIndexBlockBuilder::ObDataIndexBlockBuilder() macro_meta_list_(nullptr), meta_block_writer_(nullptr), meta_row_(), - data_blocks_cnt_(0) + 
data_blocks_cnt_(0), + meta_block_offset_(0), + meta_block_size_(0) { } @@ -1370,6 +1570,8 @@ void ObDataIndexBlockBuilder::reset() } meta_row_.reset(); data_blocks_cnt_ = 0; + meta_block_offset_ = 0; + meta_block_size_ = 0; sstable_allocator_ = nullptr; ObBaseIndexBlockBuilder::reset(); } @@ -1603,6 +1805,8 @@ int ObDataIndexBlockBuilder::write_meta_block( } if (OB_SUCC(ret)) { macro_meta_.val_.macro_id_ = block_id; // real macro id + meta_block_offset_ = data_offset; + meta_block_size_ = meta_block_desc.get_block_size(); if (OB_FAIL(ObDataIndexBlockBuilder::add_macro_block_meta( macro_meta_, *macro_meta_list_, *sstable_allocator_))) { STORAGE_LOG(WARN, "failed to add macro block meta", K(ret), K_(macro_meta)); @@ -1621,6 +1825,7 @@ int ObDataIndexBlockBuilder::append_index_micro_block(ObMacroBlock ¯o_block, int ret = OB_SUCCESS; ObMicroBlockDesc leaf_block_desc; // n-1 level index block int64_t data_offset = 0; + int64_t leaf_block_size = 0; if (OB_FAIL(build_index_micro_block(leaf_block_desc))) { STORAGE_LOG(WARN, "fail to build n-1 level micro block", K(ret)); } else if (OB_FAIL(micro_helper_.compress_encrypt_micro_block(leaf_block_desc))) { @@ -1630,6 +1835,7 @@ int ObDataIndexBlockBuilder::append_index_micro_block(ObMacroBlock ¯o_block, } else { leaf_block_desc.macro_id_ = block_id; leaf_block_desc.block_offset_ = data_offset; + leaf_block_size = leaf_block_desc.get_block_size(); } if (OB_FAIL(ret)) { @@ -1641,6 +1847,8 @@ int ObDataIndexBlockBuilder::append_index_micro_block(ObMacroBlock ¯o_block, } else if (FALSE_IT(macro_row_desc_.macro_id_ = ObIndexBlockRowHeader::DEFAULT_IDX_ROW_MACRO_ID)) { } else if (OB_FAIL(write_meta_block(macro_block, block_id, macro_row_desc_))) { STORAGE_LOG(WARN, "fail to build meta block", K(ret)); + } else { + root_micro_block_desc_->last_macro_size_ = data_offset + leaf_block_size + meta_block_size_; } clean_status(); return ret; @@ -1705,6 +1913,10 @@ int ObDataIndexBlockBuilder::close(const ObDatumRowkey &last_key, 
root_micro_block_desc_->data_column_cnt_ = data_store_desc_->row_column_count_; // root_micro_block_desc_->macro_metas_ = macro_meta_list_; // should be done in init method root_micro_block_desc_->data_blocks_cnt_ = blocks_cnt; + if (blocks_cnt == 1) { + root_micro_block_desc_->meta_block_size_ = meta_block_size_; + root_micro_block_desc_->meta_block_offset_ = meta_block_offset_; + } STORAGE_LOG(INFO, "succeed to close data index builder", KPC_(root_micro_block_desc)); } is_closed_ = true; // close() is not re-entrant @@ -1774,17 +1986,24 @@ int ObMetaIndexBlockBuilder::init(ObDataStoreDesc &data_store_desc, return ret; } -int ObMetaIndexBlockBuilder::build_micro_block() +int ObMetaIndexBlockBuilder::build_micro_block(ObMicroBlockDesc µ_block_desc) { int ret = OB_SUCCESS; - ObMicroBlockDesc micro_block_desc; - ObIndexBlockRowDesc row_desc(*index_store_desc_); if (OB_UNLIKELY(0 == micro_writer_->get_row_count())) { STORAGE_LOG(DEBUG, "build empty micro block", K(ret)); } else if (OB_FAIL(micro_writer_->build_micro_block_desc(micro_block_desc))) { STORAGE_LOG(WARN, "fail to build micro block", K(ret)); - } else if (FALSE_IT(micro_block_desc.last_rowkey_ = last_leaf_rowkey_)) { - } else if (OB_FAIL(macro_writer_->append_index_micro_block(micro_block_desc))) { + } else { + micro_block_desc.last_rowkey_ = last_leaf_rowkey_; + } + return ret; +} + +int ObMetaIndexBlockBuilder::append_micro_block(ObMicroBlockDesc µ_block_desc) +{ + int ret = OB_SUCCESS; + ObIndexBlockRowDesc row_desc(*index_store_desc_); + if (OB_FAIL(macro_writer_->append_index_micro_block(micro_block_desc))) { micro_writer_->dump_diagnose_info(); // ignore dump error STORAGE_LOG(WARN, "fail to append micro block of meta", K(ret), K(micro_block_desc)); } else if (FALSE_IT(block_to_row_desc(micro_block_desc, row_desc))) { @@ -1803,6 +2022,7 @@ int ObMetaIndexBlockBuilder::build_micro_block() int ObMetaIndexBlockBuilder::append_leaf_row(const ObDatumRow &leaf_row) { int ret = OB_SUCCESS; + ObMicroBlockDesc 
micro_block_desc;
   if (OB_UNLIKELY(!is_inited_)) {
     ret = OB_NOT_INIT;
     STORAGE_LOG(WARN, "invalid ObMetaIndexBlockBuilder", K(ret), K(is_inited_));
@@ -1815,8 +2035,10 @@ int ObMetaIndexBlockBuilder::append_leaf_row(const ObDatumRow &leaf_row)
     } else if (OB_UNLIKELY(0 == micro_writer_->get_row_count())) {
       ret = OB_NOT_SUPPORTED;
       STORAGE_LOG(WARN, "The single row is too large, ", K(ret), K(leaf_row));
-    } else if (OB_FAIL(build_micro_block())) {
+    } else if (OB_FAIL(build_micro_block(micro_block_desc))) {
       STORAGE_LOG(WARN, "fail to build micro block of meta", K(ret), K(leaf_row));
+    } else if (OB_FAIL(append_micro_block(micro_block_desc))) {
+      STORAGE_LOG(WARN, "fail to append micro block of meta to macro block", K(ret), K(leaf_row));
     } else if (OB_FAIL(micro_writer_->append_row(leaf_row))) {
       STORAGE_LOG(WARN, "fail to append leaf row of meta", K(ret), K(leaf_row));
     }
@@ -1837,24 +2059,119 @@ int ObMetaIndexBlockBuilder::append_leaf_row(const ObDatumRow &leaf_row)
   return ret;
 }
 
-int ObMetaIndexBlockBuilder::close(ObIAllocator &allocator, ObIndexTreeRootBlockDesc &block_desc)
+// Closes the meta index tree and returns its root in block_desc. roots is only consulted
+// by the branch that points the root directly at the single macro meta row.
+int ObMetaIndexBlockBuilder::close(
+    const IndexMicroBlockDescList &roots,
+    ObIndexTreeRootBlockDesc &block_desc)
 {
   int ret = OB_SUCCESS;
   ObIndexTreeInfo tree_info;
+  ObMicroBlockDesc micro_block_desc;
   if (OB_UNLIKELY(!is_inited_)) {
     ret = OB_NOT_INIT;
     STORAGE_LOG(WARN, "invalid ObMetaIndexBlockBuilder", K(ret), K(is_inited_));
   } else if (OB_UNLIKELY(is_closed_)) {
     ret = OB_ERR_UNEXPECTED;
     STORAGE_LOG(WARN, "meta index builder is closed", K(ret), K(is_closed_));
-  } else if (OB_FAIL(build_micro_block())) {
-    STORAGE_LOG(WARN, "fail to build last micro block of meta", K(ret));
-  } else if (OB_FAIL(ObBaseIndexBlockBuilder::close(allocator, tree_info))) {
-    STORAGE_LOG(WARN, "fail to close index tree of meta", K(ret));
+  } else if (OB_FAIL(build_micro_block(micro_block_desc))) {
+    STORAGE_LOG(WARN, "fail to build micro block of meta", K(ret));
+  } else if (row_count_ <= 0 && micro_block_desc.get_block_size() <= ObMetaDiskAddr::ROOT_BLOCK_SIZE_LIMIT) {
+    // meta block's size is smaller than ROOT_BLOCK_SIZE_LIMIT, all meta data will be stored in root
+    if (OB_FAIL(ObBaseIndexBlockBuilder::close(*allocator_, tree_info))) {
+      STORAGE_LOG(WARN, "fail to close index tree of meta", K(ret));
+    } else if (OB_FAIL(build_single_node_tree(*allocator_, micro_block_desc, block_desc))) {
+      STORAGE_LOG(WARN, "fail to build single node tree of meta", K(ret));
+    }
+  } else if (row_count_ <= 0 && 1 == micro_block_desc.row_count_) {
+    // this sstable only has one data block, but the size of meta data exceeds ROOT_BLOCK_SIZE_LIMIT,
+    // so sstable's root points to the tail of its data block (macro meta row)
+    if (OB_FAIL(build_single_macro_row_desc(roots))) {
+      STORAGE_LOG(WARN, "fail to build single macro row desc", K(ret));
+    } else if (OB_FAIL(ObBaseIndexBlockBuilder::close(*allocator_, tree_info))) {
+      STORAGE_LOG(WARN, "fail to close index tree of meta", K(ret));
+    } else {
+      block_desc = tree_info.root_desc_;
+    }
   } else {
-    block_desc = tree_info.root_desc_;
+    if (OB_FAIL(append_micro_block(micro_block_desc))) {
+      STORAGE_LOG(WARN, "fail to append micro block of meta to macro block", K(ret));
+    } else if (OB_FAIL(ObBaseIndexBlockBuilder::close(*allocator_, tree_info))) {
+      STORAGE_LOG(WARN, "fail to close index tree of meta", K(ret));
+    } else {
+      block_desc = tree_info.root_desc_;
+    }
+  }
+
+  if (OB_SUCC(ret)) {
     is_closed_ = true;
-    STORAGE_LOG(INFO, "succeed to close meta index builder", K(tree_info));
+    STORAGE_LOG(DEBUG, "succeed to close index tree of meta", K(ret), K(block_desc));
+  }
+  return ret;
+}
+
+// Appends one index row for the only macro meta in roots, located by roots[0]'s
+// meta_block_offset_/meta_block_size_; OB_ERR_UNEXPECTED unless exactly one macro meta exists.
+int ObMetaIndexBlockBuilder::build_single_macro_row_desc(const IndexMicroBlockDescList &roots)
+{
+  int ret = OB_SUCCESS;
+  if (OB_UNLIKELY(1 != roots.count() || 1 != roots[0]->macro_metas_->count())) {
+    ret = OB_ERR_UNEXPECTED;
+    STORAGE_LOG(WARN, "number of macro meta should be 1", K(ret), K(roots));
+  } else {
+    ObIndexBlockRowDesc row_desc(*index_store_desc_);
+    const ObDataBlockMetaVal &macro_meta_val = roots[0]->macro_metas_->at(0)->val_;
+    row_desc.row_key_ = roots[0]->macro_metas_->at(0)->end_key_;
+    row_desc.macro_id_ = macro_meta_val.macro_id_;
+    row_desc.block_offset_ = roots[0]->meta_block_offset_;
+    row_desc.block_size_ = roots[0]->meta_block_size_;
+    row_desc.row_count_ = macro_meta_val.row_count_;
+    row_desc.row_count_delta_ = macro_meta_val.row_count_delta_;
+    row_desc.is_deleted_ = macro_meta_val.is_deleted_;
+    row_desc.max_merged_trans_version_ = macro_meta_val.max_merged_trans_version_;
+    row_desc.contain_uncommitted_row_ = macro_meta_val.contain_uncommitted_row_;
+    last_leaf_rowkey_.reset();
+    row_desc.is_data_block_ = true;
+    row_desc.is_secondary_meta_ = true;
+    if (OB_FAIL(ObBaseIndexBlockBuilder::append_row(row_desc))) {
+      STORAGE_LOG(WARN, "fail to append n-1 level index row of meta", K(ret), K(roots));
+    }
+  }
+  return ret;
+}
+
+// Serializes the micro block header followed by the block data into a buffer taken from
+// allocator and publishes it as the in-memory root (height 1) in block_desc.
+// On failure the buffer is freed and block_desc.buf_ is reset to nullptr.
+int ObMetaIndexBlockBuilder::build_single_node_tree(
+    ObIAllocator &allocator,
+    const ObMicroBlockDesc &micro_block_desc,
+    ObIndexTreeRootBlockDesc &block_desc)
+{
+  int ret = OB_SUCCESS;
+  ObMetaDiskAddr &root_addr = block_desc.addr_;
+  block_desc.row_type_ = index_store_desc_->row_store_type_;
+  block_desc.height_ = 1;
+  char *&root_buf = block_desc.buf_;
+  const int64_t buf_size = micro_block_desc.buf_size_ + micro_block_desc.header_->header_size_;
+  int64_t pos = 0;
+  if (OB_ISNULL(root_buf = static_cast<char *>(allocator.alloc(buf_size)))) {
+    ret = OB_ALLOCATE_MEMORY_FAILED;
+    STORAGE_LOG(WARN, "fail to alloc root buf", K(ret), K(buf_size));
+  } else if (OB_FAIL(micro_block_desc.header_->serialize(root_buf, buf_size, pos))) {
+    STORAGE_LOG(WARN, "fail to serialize header", K(ret), K(micro_block_desc));
+  } else {
+    MEMCPY(root_buf + pos, micro_block_desc.buf_, buf_size - pos);
+    if (OB_FAIL(root_addr.set_mem_addr(0, buf_size))) {
+      STORAGE_LOG(WARN, "fail to set memory address", K(ret), K(buf_size));
+    } else {
+      block_desc.is_meta_root_ = true;
+      STORAGE_LOG(INFO, "successfully build single node tree, whose root is a data root", K(ret), K(block_desc));
+    }
+  }
+  if (OB_FAIL(ret) && nullptr != root_buf) {
+    allocator.free(root_buf);
+    root_buf = nullptr; // root_buf aliases block_desc.buf_, which must not keep a freed pointer
   }
   return ret;
 }
diff --git a/src/storage/blocksstable/ob_index_block_builder.h b/src/storage/blocksstable/ob_index_block_builder.h index beaa1b8953..b40390b87d 100644 --- a/src/storage/blocksstable/ob_index_block_builder.h +++ b/src/storage/blocksstable/ob_index_block_builder.h @@ -25,6 +25,7 @@ namespace oceanbase { namespace blocksstable { +struct ObBlockInfo; struct ObIndexMicroBlockDesc; class ObIMicroBlockReader; class ObIMacroBlockFlushCallback; @@ -34,6 +35,7 @@ static const uint32_t META_BLOCK_VERSION = 1; static const int64_t DEFAULT_MICRO_BLOCK_WRITER_COUNT = 64 + 1; static const int64_t DEFAULT_MACRO_LEVEL_ROWS_COUNT = 8; static const int64_t DEFAULT_MACRO_BLOCK_CNT = 64; +static const int64_t SMALL_SSTABLE_THRESHOLD = 1 << 20; // 1M typedef common::ObSEArray IndexMicroBlockDescList; @@ -50,7 +52,10 @@ public: data_column_cnt_(0), data_blocks_cnt_(0), macro_metas_(nullptr), - data_write_ctx_(nullptr) {} + data_write_ctx_(nullptr), + meta_block_offset_(0), + meta_block_size_(0), + last_macro_size_(0) {} ~ObIndexMicroBlockDesc() = default; TO_STRING_KV(K_(last_key), K_(data_column_cnt), K_(data_blocks_cnt), KPC_(data_write_ctx), KP(macro_metas_)); @@ -59,6 +64,9 @@ public: int64_t data_blocks_cnt_; ObMacroMetasArray *macro_metas_; ObMacroBlocksWriteCtx *data_write_ctx_; // contains: block_ids array [hold ref] + int64_t meta_block_offset_; + int64_t meta_block_size_; + int64_t last_macro_size_; }; struct ObIndexMicroBlockDescCompare final @@ -122,7 +130,8 @@ public: :addr_(), buf_(nullptr), row_type_(common::ObRowStoreType::MAX_ROW_STORE), - height_(0) {} + height_(0), + is_meta_root_(false) {} ~ObIndexTreeRootBlockDesc() = default; bool is_valid() const; bool is_empty() const { return
addr_.is_none(); } @@ -135,6 +144,7 @@ public: char *buf_; ObRowStoreType row_type_; int64_t height_; + bool is_meta_root_; }; // when we close index tree, return tree info to record necessary inner info @@ -212,6 +222,8 @@ public: common::ObCompressorType compressor_type_; int64_t encrypt_id_; int64_t master_key_id_; + int64_t nested_offset_; + int64_t nested_size_; char encrypt_key_[share::OB_MAX_TABLESPACE_ENCRYPT_KEY_LENGTH]; DISALLOW_COPY_AND_ASSIGN(ObSSTableMergeRes); }; @@ -320,6 +332,8 @@ private: ObDataMacroBlockMeta macro_meta_; ObArenaAllocator row_allocator_; int64_t data_blocks_cnt_; + int64_t meta_block_offset_; + int64_t meta_block_size_; }; class ObMetaIndexBlockBuilder : public ObBaseIndexBlockBuilder @@ -331,10 +345,16 @@ public: ObIAllocator &allocator, ObMacroBlockWriter ¯o_writer); int append_leaf_row(const ObDatumRow &leaf_row); - int close(ObIAllocator &allocator, ObIndexTreeRootBlockDesc &block_desc); + int close(const IndexMicroBlockDescList &roots, ObIndexTreeRootBlockDesc &block_desc); void reset(); + int build_single_macro_row_desc(const IndexMicroBlockDescList &roots); private: - int build_micro_block(); + int build_micro_block(ObMicroBlockDesc µ_block_desc); + int append_micro_block(ObMicroBlockDesc µ_block_desc); + int build_single_node_tree( + ObIAllocator &allocator, + const ObMicroBlockDesc µ_block_desc, + ObIndexTreeRootBlockDesc &block_desc); private: ObIMicroBlockWriter *micro_writer_; ObMacroBlockWriter *macro_writer_; @@ -389,10 +409,21 @@ private: class ObSSTableIndexBuilder final { +public: + enum ObSpaceOptimizationMode + { + ENABLE = 0, // enable the optimization for small sstable with given occupy_size + DISABLE = 1, // disable the optimization + AUTO = 2 // enable the optimization without giving occupy_size (users don't know/assign occupy_size) + }; public: ObSSTableIndexBuilder(); ~ObSSTableIndexBuilder(); - int init(const ObDataStoreDesc &index_desc, ObIMacroBlockFlushCallback *callback = nullptr); + int init( + const 
ObDataStoreDesc &index_desc, + ObIMacroBlockFlushCallback *callback = nullptr, + ObSpaceOptimizationMode mode = ENABLE); + void reset(); int new_index_builder(ObDataIndexBlockBuilder *&builder, ObDataStoreDesc &data_store_desc, ObIAllocator &data_allocator); @@ -406,6 +437,15 @@ public: int close(const int64_t column_cnt, ObSSTableMergeRes &res); TO_STRING_KV(K(roots_.count())); private: + int check_and_rewrite_sstable(ObSSTableMergeRes &res); + int check_and_rewrite_sstable_without_size(ObSSTableMergeRes &res); + int do_check_and_rewrite_sstable(ObBlockInfo &block_info); + int parse_macro_header( + const char *buf, + const int64_t buf_size, + ObSSTableMacroBlockHeader ¯o_header); + int rewrite_small_sstable(ObSSTableMergeRes &res); + bool check_single_block(); int set_row_store_type(ObDataStoreDesc &index_desc); bool check_index_desc(const ObDataStoreDesc &index_desc) const; int trim_empty_roots(); @@ -435,6 +475,7 @@ private: ObIMacroBlockFlushCallback *callback_; IndexMicroBlockDescList roots_; ObSSTableMergeRes res_; + ObSpaceOptimizationMode optimization_mode_; bool is_closed_; bool is_inited_; DISALLOW_COPY_AND_ASSIGN(ObSSTableIndexBuilder); diff --git a/src/storage/blocksstable/ob_index_block_row_scanner.cpp b/src/storage/blocksstable/ob_index_block_row_scanner.cpp index 477f6b28c3..0a3b084d41 100644 --- a/src/storage/blocksstable/ob_index_block_row_scanner.cpp +++ b/src/storage/blocksstable/ob_index_block_row_scanner.cpp @@ -212,8 +212,8 @@ ObIndexBlockRowScanner::ObIndexBlockRowScanner() current_(ObIMicroBlockReaderInfo::INVALID_ROW_INDEX), start_(ObIMicroBlockReaderInfo::INVALID_ROW_INDEX), end_(ObIMicroBlockReaderInfo::INVALID_ROW_INDEX), - step_(1), range_idx_(0), is_transformed_(false), is_get_(false), is_reverse_scan_(false), - is_left_border_(false), is_right_border_(false), is_inited_(false) + step_(1), range_idx_(0), nested_offset_(0), is_transformed_(false), is_get_(false), + is_reverse_scan_(false), is_left_border_(false), 
is_right_border_(false), is_inited_(false) {} ObIndexBlockRowScanner::~ObIndexBlockRowScanner() {} @@ -249,6 +249,7 @@ void ObIndexBlockRowScanner::reset() end_ = ObIMicroBlockReaderInfo::INVALID_ROW_INDEX; step_ = 1; range_idx_ = 0; + nested_offset_ = 0; is_transformed_ = false; is_get_ = false; is_reverse_scan_ = false; @@ -262,7 +263,8 @@ int ObIndexBlockRowScanner::init( const ObIArray &agg_column_schema, const ObTableReadInfo *index_read_info, ObIAllocator &allocator, - const common::ObQueryFlag &query_flag) + const common::ObQueryFlag &query_flag, + const int64_t nested_offset) { int ret = OB_SUCCESS; if (IS_INIT) { @@ -280,6 +282,7 @@ int ObIndexBlockRowScanner::init( is_reverse_scan_ = query_flag.is_reverse_scan(); step_ = is_reverse_scan_ ? -1 : 1; index_read_info_ = index_read_info; + nested_offset_ = nested_offset; is_inited_ = true; } return ret; @@ -383,6 +386,7 @@ int ObIndexBlockRowScanner::get_next(ObMicroIndexInfo &idx_block_row) idx_block_row.range_idx_ = range_idx_; idx_block_row.query_range_ = query_range_; idx_block_row.parent_macro_id_ = macro_id_; + idx_block_row.nested_offset_ = nested_offset_; } LOG_DEBUG("Get next index block row", K(ret), K_(current), K_(start), K_(end), K(idx_block_row)); return ret; diff --git a/src/storage/blocksstable/ob_index_block_row_scanner.h b/src/storage/blocksstable/ob_index_block_row_scanner.h index fd2a4e877c..b28deca619 100644 --- a/src/storage/blocksstable/ob_index_block_row_scanner.h +++ b/src/storage/blocksstable/ob_index_block_row_scanner.h @@ -93,7 +93,8 @@ public: const ObIArray &agg_column_schema, const storage::ObTableReadInfo *index_read_info, ObIAllocator &allocator, - const common::ObQueryFlag &query_flag); + const common::ObQueryFlag &query_flag, + const int64_t nested_offset); int open( const MacroBlockId ¯o_id, const ObMicroBlockData &idx_block_data, @@ -111,7 +112,10 @@ public: int get_index_row_count(int64_t &index_row_count) const; int check_blockscan(const ObDatumRowkey &rowkey, bool 
&can_blockscan); OB_INLINE bool is_valid() const { return is_inited_; } - OB_INLINE void set_index_read_info(const ObTableReadInfo *index_read_info) { index_read_info_ = index_read_info; } + OB_INLINE void switch_context(const ObTableReadInfo *index_read_info, const int64_t nested_offset) { + index_read_info_ = index_read_info; + nested_offset_ = nested_offset; + } TO_STRING_KV(K_(current), K_(start), K_(end), K_(step), K_(range_idx), K_(is_get), K_(is_reverse_scan), K_(is_left_border), K_(is_right_border), @@ -144,6 +148,7 @@ private: int64_t end_; // inclusive int64_t step_; int16_t range_idx_; + int64_t nested_offset_; bool is_transformed_; bool is_get_; bool is_reverse_scan_; diff --git a/src/storage/blocksstable/ob_index_block_row_struct.h b/src/storage/blocksstable/ob_index_block_row_struct.h index f5a0d681bf..7d75e6db32 100644 --- a/src/storage/blocksstable/ob_index_block_row_struct.h +++ b/src/storage/blocksstable/ob_index_block_row_struct.h @@ -199,7 +199,8 @@ public: query_range_(nullptr), flag_(0), range_idx_(-1), - parent_macro_id_() + parent_macro_id_(), + nested_offset_(0) { } OB_INLINE void reset() @@ -211,6 +212,7 @@ public: flag_ = 0; range_idx_ = -1; parent_macro_id_.reset(); + nested_offset_ = 0; } OB_INLINE bool is_valid() const { @@ -249,7 +251,7 @@ public: OB_INLINE uint64_t get_block_offset() const { OB_ASSERT(nullptr != row_header_); - return row_header_->get_block_offset(); + return row_header_->get_block_offset() + nested_offset_; } OB_INLINE uint64_t get_block_size() const { @@ -346,7 +348,7 @@ public: } TO_STRING_KV(KP_(query_range), KPC_(row_header), KPC_(minor_meta_info), KPC_(endkey), - K_(flag), K_(range_idx), K_(parent_macro_id)); + K_(flag), K_(range_idx), K_(parent_macro_id), K_(nested_offset)); public: const ObIndexBlockRowHeader *row_header_; @@ -371,6 +373,7 @@ public: }; int16_t range_idx_; MacroBlockId parent_macro_id_; + int64_t nested_offset_; }; diff --git a/src/storage/blocksstable/ob_index_block_tree_cursor.cpp 
b/src/storage/blocksstable/ob_index_block_tree_cursor.cpp index 9ea7580760..f21ab9d920 100644 --- a/src/storage/blocksstable/ob_index_block_tree_cursor.cpp +++ b/src/storage/blocksstable/ob_index_block_tree_cursor.cpp @@ -976,14 +976,21 @@ int ObIndexBlockTreeCursor::get_next_level_block( const ObIndexBlockRowHeader &idx_row_header) { int ret = OB_SUCCESS; + int64_t absolute_offset = 0; if (OB_UNLIKELY(!macro_block_id.is_valid() || !idx_row_header.is_valid())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("Invalid macro block id or index block row data", K(ret), K(macro_block_id), K(idx_row_header)); + } else { + absolute_offset = sstable_->get_macro_offset() + idx_row_header.get_block_offset(); + } + + if (OB_FAIL(ret)) { + // do nothing } else if (OB_FAIL(index_block_cache_->get_cache_block( tenant_id_, macro_block_id, - idx_row_header.get_block_offset(), + absolute_offset, idx_row_header.get_block_size(), curr_path_item_->cache_handle_))) { if (OB_UNLIKELY(OB_ENTRY_NOT_EXIST != ret)) { @@ -999,7 +1006,7 @@ int ObIndexBlockTreeCursor::get_next_level_block( ObMicroBlockDesMeta block_des_meta; bool is_compressed = false; read_info.macro_block_id_ = macro_block_id; - read_info.offset_ = idx_row_header.get_block_offset(); + read_info.offset_ = absolute_offset; read_info.size_ = idx_row_header.get_block_size(); read_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_DATA_READ); read_info.io_desc_.set_category(ObIOCategory::USER_IO); diff --git a/src/storage/blocksstable/ob_macro_block_checker.cpp b/src/storage/blocksstable/ob_macro_block_checker.cpp index 34122137a7..88908bb465 100644 --- a/src/storage/blocksstable/ob_macro_block_checker.cpp +++ b/src/storage/blocksstable/ob_macro_block_checker.cpp @@ -41,6 +41,8 @@ int ObSSTableMacroBlockChecker::check( } else if (OB_FAIL(common_header.deserialize(macro_block_buf, macro_block_buf_size, pos))) { STORAGE_LOG(ERROR, "fail to deserialize common header", K(ret), KP(macro_block_buf), K(macro_block_buf_size), K(pos), 
K(common_header)); + } else if (common_header.is_shared_macro_block()) { + // skip the check } else if (OB_FAIL(common_header.check_integrity())) { ret = OB_INVALID_DATA; STORAGE_LOG(ERROR, "Invalid common header", K(ret), K(common_header)); diff --git a/src/storage/blocksstable/ob_macro_block_common_header.h b/src/storage/blocksstable/ob_macro_block_common_header.h index dfa5b8c1af..2b8e398717 100644 --- a/src/storage/blocksstable/ob_macro_block_common_header.h +++ b/src/storage/blocksstable/ob_macro_block_common_header.h @@ -41,6 +41,7 @@ public: BloomFilterData = 5, SSTableIndex = 6, SSTableMacroMeta = 7, + SharedSSTableData = 8, MaxMacroType, }; static_assert( @@ -66,6 +67,7 @@ public: bool is_bloom_filter_data_block() const { return MacroBlockType::BloomFilterData == attr_; } bool is_sstable_index_block() const { return MacroBlockType::SSTableIndex == attr_; } bool is_sstable_macro_meta_block() const { return MacroBlockType::SSTableMacroMeta == attr_; } + bool is_shared_macro_block() const { return MacroBlockType::SharedSSTableData == attr_; } int32_t get_header_size() const { return header_size_; } int32_t get_version() const { return version_; } int32_t get_magic() const { return magic_; } diff --git a/src/storage/blocksstable/ob_macro_block_handle.cpp b/src/storage/blocksstable/ob_macro_block_handle.cpp index 78d94c3afc..b4fa641f93 100644 --- a/src/storage/blocksstable/ob_macro_block_handle.cpp +++ b/src/storage/blocksstable/ob_macro_block_handle.cpp @@ -26,6 +26,7 @@ namespace oceanbase { namespace blocksstable { + /** * ---------------------------------------ObMacroBlockHandle---------------------------------------- */ @@ -285,5 +286,6 @@ int ObMacroBlocksHandle::reserve(const int64_t block_cnt) } return ret; } + } // namespace blocksstable } // namespace oceanbase diff --git a/src/storage/blocksstable/ob_macro_block_handle.h b/src/storage/blocksstable/ob_macro_block_handle.h index 808588a93a..e2d1e2cc41 100644 --- 
a/src/storage/blocksstable/ob_macro_block_handle.h +++ b/src/storage/blocksstable/ob_macro_block_handle.h @@ -71,6 +71,7 @@ private: common::ObArray macro_id_list_; DISALLOW_COPY_AND_ASSIGN(ObMacroBlocksHandle); }; + } // namespace blocksstable } // namespace oceanbase diff --git a/src/storage/blocksstable/ob_macro_block_meta.cpp b/src/storage/blocksstable/ob_macro_block_meta.cpp index f621c0645a..f099fe107f 100644 --- a/src/storage/blocksstable/ob_macro_block_meta.cpp +++ b/src/storage/blocksstable/ob_macro_block_meta.cpp @@ -337,7 +337,9 @@ DEFINE_GET_SERIALIZE_SIZE(ObDataBlockMetaVal) //================================== ObDataMacroBlockMeta ================================== ObDataMacroBlockMeta::ObDataMacroBlockMeta() : val_(), - end_key_() + end_key_(), + nested_offset_(0), + nested_size_(0) { } diff --git a/src/storage/blocksstable/ob_macro_block_meta.h b/src/storage/blocksstable/ob_macro_block_meta.h index 5b564c8a07..cdb3406b14 100644 --- a/src/storage/blocksstable/ob_macro_block_meta.h +++ b/src/storage/blocksstable/ob_macro_block_meta.h @@ -67,8 +67,8 @@ public: int64_t data_zsize_; // sum of size of compressed/encrypted micro blocks int64_t original_size_; // sum of size of original micro blocks int64_t progressive_merge_round_; - int64_t block_offset_; - int64_t block_size_; + int64_t block_offset_; // offset of n-1 level index micro blocks + int64_t block_size_; // size of n-1 level index micro blocks int64_t row_count_; int64_t row_count_delta_; int64_t max_merged_trans_version_; @@ -120,11 +120,15 @@ public: { val_.reset(); end_key_.reset(); + nested_offset_ = 0; + nested_size_ = 0; } TO_STRING_KV(K_(val), K_(end_key)); public: ObDataBlockMetaVal val_; ObDatumRowkey end_key_; // rowkey is primary key + int64_t nested_offset_; + int64_t nested_size_; DISALLOW_COPY_AND_ASSIGN(ObDataMacroBlockMeta); }; diff --git a/src/storage/blocksstable/ob_micro_block_cache.cpp b/src/storage/blocksstable/ob_micro_block_cache.cpp index c4f1e83f64..ceea615a1b 100644 
--- a/src/storage/blocksstable/ob_micro_block_cache.cpp +++ b/src/storage/blocksstable/ob_micro_block_cache.cpp @@ -19,6 +19,8 @@ #include "storage/blocksstable/ob_index_block_row_struct.h" #include "storage/blocksstable/ob_micro_block_cache.h" #include "storage/blocksstable/ob_block_manager.h" +#include "storage/blocksstable/ob_macro_block_handle.h" +#include "storage/blocksstable/ob_shared_macro_block_manager.h" namespace oceanbase { @@ -368,15 +370,19 @@ int ObIMicroBlockCache::load_block( int ObIMicroBlockCache::prefetch( const uint64_t tenant_id, const MacroBlockId ¯o_id, - const ObIndexBlockRowHeader& idx_row_header, + const ObMicroIndexInfo& idx_row, const common::ObQueryFlag &flag, ObMacroBlockHandle ¯o_handle, ObIMicroBlockIOCallback &callback) { int ret = OB_SUCCESS; + const ObIndexBlockRowHeader *idx_row_header = idx_row.row_header_; BaseBlockCache *cache = nullptr; ObIAllocator *allocator = nullptr; - if (OB_FAIL(get_cache(cache))) { + if (OB_ISNULL(idx_row_header)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", K(ret)); + } else if (OB_FAIL(get_cache(cache))) { LOG_WARN("Fail to get base cache", K(ret)); } else if (OB_FAIL(get_allocator(allocator))) { LOG_WARN("Fail to get allocator", K(ret)); @@ -387,13 +393,13 @@ int ObIMicroBlockCache::prefetch( callback.put_size_stat_ = this; callback.tenant_id_ = tenant_id; callback.block_id_ = macro_id; - callback.offset_ = idx_row_header.get_block_offset(); - callback.size_ = idx_row_header.get_block_size(); - callback.row_store_type_ = idx_row_header.get_row_store_type(); - callback.block_des_meta_.compressor_type_ = idx_row_header.get_compressor_type(); - callback.block_des_meta_.encrypt_id_ = idx_row_header.get_encrypt_id(); - callback.block_des_meta_.master_key_id_ = idx_row_header.get_master_key_id(); - callback.block_des_meta_.encrypt_key_ = idx_row_header.get_encrypt_key(); + callback.offset_ = idx_row.get_block_offset(); + callback.size_ = idx_row.get_block_size(); + 
callback.row_store_type_ = idx_row.get_row_store_type(); + callback.block_des_meta_.compressor_type_ = idx_row_header->get_compressor_type(); + callback.block_des_meta_.encrypt_id_ = idx_row_header->get_encrypt_id(); + callback.block_des_meta_.master_key_id_ = idx_row_header->get_master_key_id(); + callback.block_des_meta_.encrypt_key_ = idx_row_header->get_encrypt_key(); callback.use_block_cache_ = flag.is_use_block_cache(); // fill read info ObMacroBlockReadInfo read_info; @@ -404,15 +410,15 @@ int ObIMicroBlockCache::prefetch( read_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_DATA_READ); read_info.io_callback_ = &callback; common::align_offset_size( - idx_row_header.get_block_offset(), - idx_row_header.get_block_size(), + idx_row.get_block_offset(), + idx_row.get_block_size(), read_info.offset_, read_info.size_); if (OB_FAIL(ObBlockManager::async_read_block(read_info, macro_handle))) { STORAGE_LOG(WARN, "Fail to async read block, ", K(ret)); } else { EVENT_INC(ObStatEventIds::IO_READ_PREFETCH_MICRO_COUNT); - EVENT_ADD(ObStatEventIds::IO_READ_PREFETCH_MICRO_BYTES, idx_row_header.get_block_size()); + EVENT_ADD(ObStatEventIds::IO_READ_PREFETCH_MICRO_BYTES, idx_row.get_block_size()); } } return ret; @@ -763,11 +769,9 @@ int ObDataMicroBlockCache::prefetch( { int ret = OB_SUCCESS; const ObIndexBlockRowHeader *idx_header = idx_row.row_header_; - if (OB_ISNULL(idx_header)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("Invalid null index block row header", K(ret), K(idx_row)); - } else if (OB_UNLIKELY( - !idx_header->is_valid() + if (OB_UNLIKELY( + nullptr == idx_header + || !idx_header->is_valid() || 0 >= idx_header->get_block_size() || !idx_header->is_data_block())) { ret = OB_INVALID_ARGUMENT; @@ -779,7 +783,7 @@ int ObDataMicroBlockCache::prefetch( callback.need_write_extra_buf_ = idx_header->is_data_index() && ObStoreFormat::is_row_store_type_with_encoding(idx_header->get_row_store_type()); if (OB_FAIL(ObIMicroBlockCache::prefetch( - tenant_id, macro_id, 
*idx_header, flag, macro_handle, callback))) { + tenant_id, macro_id, idx_row, flag, macro_handle, callback))) { LOG_WARN("Fail to prefetch data micro block", K(ret)); } } @@ -1254,7 +1258,7 @@ int ObIndexMicroBlockCache::prefetch( callback.index_read_info_ = &index_read_info; callback.tablet_handle_ = tablet_handle; if (OB_FAIL(ObIMicroBlockCache::prefetch( - tenant_id, macro_id, *idx_header, flag, macro_handle, callback))) { + tenant_id, macro_id, idx_row, flag, macro_handle, callback))) { LOG_WARN("Fail to prefetch data micro block", K(ret)); } } diff --git a/src/storage/blocksstable/ob_micro_block_cache.h b/src/storage/blocksstable/ob_micro_block_cache.h index 5269b1331a..f372c13c0a 100644 --- a/src/storage/blocksstable/ob_micro_block_cache.h +++ b/src/storage/blocksstable/ob_micro_block_cache.h @@ -235,7 +235,7 @@ protected: virtual int prefetch( const uint64_t tenant_id, const MacroBlockId ¯o_id, - const ObIndexBlockRowHeader& idx_row_header, + const ObMicroIndexInfo& idx_row, const common::ObQueryFlag &flag, ObMacroBlockHandle ¯o_handle, ObIMicroBlockIOCallback &callback); diff --git a/src/storage/blocksstable/ob_micro_block_row_getter.cpp b/src/storage/blocksstable/ob_micro_block_row_getter.cpp index 0370999604..d6c9cb89ac 100644 --- a/src/storage/blocksstable/ob_micro_block_row_getter.cpp +++ b/src/storage/blocksstable/ob_micro_block_row_getter.cpp @@ -253,7 +253,7 @@ int ObMicroBlockRowGetter::get_block_row( if (store_row->row_flag_.is_not_exist()) { ++context_->table_store_stat_.get_row_.empty_read_cnt_; EVENT_INC(ObStatEventIds::GET_ROW_EMPTY_READ); - if (!context_->query_flag_.is_index_back() && context_->query_flag_.is_use_bloomfilter_cache()) { + if (!context_->query_flag_.is_index_back() && context_->query_flag_.is_use_bloomfilter_cache() && !sstable_->is_small_sstable()) { (void) OB_STORE_CACHE.get_bf_cache().inc_empty_read( MTL_ID(), param_->table_id_, diff --git a/src/storage/blocksstable/ob_shared_macro_block_manager.cpp 
b/src/storage/blocksstable/ob_shared_macro_block_manager.cpp new file mode 100644 index 0000000000..f5c07870b6 --- /dev/null +++ b/src/storage/blocksstable/ob_shared_macro_block_manager.cpp @@ -0,0 +1,748 @@ +/** + * Copyright (c) 2021 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. + */ + +#define USING_LOG_PREFIX STORAGE_BLKMGR + +#include "storage/blocksstable/ob_shared_macro_block_manager.h" + +#include "lib/oblog/ob_log_module.h" +#include "lib/utility/ob_macro_utils.h" +#include "share/ob_force_print_log.h" +#include "storage/blocksstable/ob_block_manager.h" +#include "share/ob_force_print_log.h" +#include "storage/blocksstable/ob_imicro_block_writer.h" +#include "storage/blocksstable/ob_macro_block_struct.h" +#include "storage/meta_mem/ob_tenant_meta_mem_mgr.h" +#include "storage/meta_mem/ob_tablet_handle.h" +#include "storage/blocksstable/ob_index_block_builder.h" +#include "storage/blocksstable/ob_sstable_sec_meta_iterator.h" +#include "storage/tablet/ob_tablet_create_delete_helper.h" +#include "storage/ls/ob_ls.h" +#include "share/ob_ls_id.h" +#include "storage/tx_storage/ob_ls_service.h" + +namespace oceanbase +{ +namespace blocksstable +{ +using namespace common; +using namespace common::hash; +using namespace share; + +/** + * ---------------------------------------ObBlockInfo---------------------------------------- + */ +ObBlockInfo::~ObBlockInfo() +{ + reset(); +} + +void ObBlockInfo::reset() +{ + nested_size_ = OB_DEFAULT_MACRO_BLOCK_SIZE; + nested_offset_ = 0; + macro_id_.reset(); +} + +bool 
ObBlockInfo::is_valid() const +{ + return macro_id_.is_valid() + && nested_offset_ >= 0 + && nested_size_ >= 0; +} + +bool ObBlockInfo::is_small_sstable() const +{ + return OB_DEFAULT_MACRO_BLOCK_SIZE != nested_size_; +} + +/** + * ---------------------------------------ObSharedMacroBlockMgr---------------------------------------- + */ +ObSharedMacroBlockMgr::ObSharedMacroBlockMgr() + : offset_(OB_DEFAULT_MACRO_BLOCK_SIZE), + common_header_buf_(nullptr), + header_size_(0), + mutex_(), + blocks_mutex_(), + block_used_size_(), + defragmentation_task_(*this), + timer_(), + is_inited_(false) +{ +} + +ObSharedMacroBlockMgr::~ObSharedMacroBlockMgr() +{ + destroy(); +} + +void ObSharedMacroBlockMgr::destroy() +{ + timer_.destroy(); + macro_handle_.reset(); + offset_ = OB_DEFAULT_MACRO_BLOCK_SIZE; // so we can init block automatically for first write + header_size_ = 0; + if (nullptr != common_header_buf_) { + ob_free(common_header_buf_); + } + common_header_buf_ = nullptr; + block_used_size_.destroy(); + is_inited_ = false; +} + +int ObSharedMacroBlockMgr::mtl_init(ObSharedMacroBlockMgr* &shared_block_mgr) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(nullptr == shared_block_mgr)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("shared_block_mgr is null", K(ret)); + } else if (OB_FAIL(shared_block_mgr->init())) { + LOG_WARN("fail to init shared_block_mgr", K(ret)); + } + return ret; +} + +int ObSharedMacroBlockMgr::init() +{ + int ret = OB_SUCCESS; + ObMacroBlockCommonHeader common_header; + common_header.reset(); + header_size_ = upper_align(common_header.get_serialize_size(), DIO_READ_ALIGN_SIZE); + if (OB_UNLIKELY(is_inited_)) { + ret = OB_INIT_TWICE; + LOG_WARN("shared macro block handle has been inited", K(ret)); + } else if (FALSE_IT(common_header.set_attr(ObMacroBlockCommonHeader::MacroBlockType::SharedSSTableData))) { + } else if (OB_ISNULL(common_header_buf_ = reinterpret_cast(ob_malloc(header_size_)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("fail to alloc 
memory for buffer that holds common header", K(ret), K(common_header)); + } else if (FALSE_IT(MEMSET(common_header_buf_, 9, header_size_))) { + } else if (OB_FAIL(common_header.build_serialized_header(common_header_buf_, common_header.get_serialize_size()))) { + LOG_WARN("fail to serialize common header", K(ret), K(common_header)); + } else if (OB_FAIL(block_used_size_.init("ShareBlksMap", MTL_ID()))) { + LOG_WARN("fail to init block used size array", K(ret)); + } else if (FALSE_IT(timer_.set_run_wrapper(MTL_CTX()))) { + } else if (OB_FAIL(timer_.init("SharedBlk"))) { + LOG_WARN("fail to init timer", K(ret)); + } else { + is_inited_ = true; + } + + if (IS_NOT_INIT) { + destroy(); + } + return ret; +} + +int ObSharedMacroBlockMgr::start() +{ + int ret = OB_SUCCESS; + if (!timer_.task_exist(defragmentation_task_) && OB_FAIL(timer_.schedule(defragmentation_task_, DEFRAGMENT_DELAY_US, true))) { + LOG_WARN("fail to schedule fragmentation task", K(ret)); + } + return ret; +} + +void ObSharedMacroBlockMgr::stop() +{ + timer_.stop(); +} + +void ObSharedMacroBlockMgr::wait() +{ + timer_.wait(); +} + +int ObSharedMacroBlockMgr::write_block( + const char *buf, + const int64_t size, + ObBlockInfo &block_info, + ObMacroBlocksWriteCtx &write_ctx) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("Shared Macro Block Handle hasn't been inited.", K(ret)); + } else if (OB_ISNULL(buf) || OB_UNLIKELY(size <= 0)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", K(ret), KP(buf), K(size)); + } else if (OB_UNLIKELY(0 != size % DIO_READ_ALIGN_SIZE)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("write size is not aligned", K(ret), K(size)); + } else { + ObMacroBlockWriteInfo write_info; + write_info.buffer_ = buf; + write_info.io_desc_.set_category(ObIOCategory::SYS_IO); + write_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_COMPACT_WRITE); + write_info.size_ = size; + lib::ObMutexGuard guard(mutex_); + + if (size >= 
SMALL_SSTABLE_STHRESHOLD_SIZE) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("small sstable's size shouldn't be larger than 1 MB", K(ret), K(write_info.size_)); + } else if (offset_ + size > OB_DEFAULT_MACRO_BLOCK_SIZE) { + if (OB_FAIL(try_switch_macro_block())) { + LOG_WARN("fail to switch macro handle", K(ret)); + } + } + + if (OB_SUCC(ret)) { + write_info.offset_ = offset_; + if (OB_FAIL(do_write_block(write_info, block_info))) { + LOG_WARN("fail to do write block", K(ret), K(write_info), K(block_info)); + } else if (FALSE_IT(write_ctx.reset())) { + } else if (OB_FAIL(write_ctx.add_macro_block_id(macro_handle_.get_macro_id()))) { + LOG_WARN("fail to add macro block id into write_ctx", + K(ret), K(macro_handle_.get_macro_id()), K(write_ctx)); + } else { + FLOG_INFO("successfully write small sstable", K(ret), K(block_info), K(offset_)); + } + } + } + + return ret; +} + +int ObSharedMacroBlockMgr::do_write_block( + const ObMacroBlockWriteInfo &write_info, + ObBlockInfo &block_info) +{ + int ret = OB_SUCCESS; + const int64_t io_timeout_ms = std::max(GCONF._data_storage_io_timeout / 1000, DEFAULT_IO_WAIT_TIME_MS); + + if (OB_FAIL(macro_handle_.async_write(write_info))) { + LOG_WARN("fail to async write virtual macro block", K(ret), K(macro_handle_)); + } else if (OB_FAIL(macro_handle_.wait(io_timeout_ms))) { + LOG_WARN("fail to wait previous io", K(ret), K(io_timeout_ms)); + } else if (!macro_handle_.is_empty() && MICRO_BLOCK_MERGE_VERIFY_LEVEL::ENCODING_AND_COMPRESSION_AND_WRITE_COMPLETE == + GCONF.micro_block_merge_verify_level && 0 != offset_) { + if (OB_FAIL(check_write_complete(macro_handle_.get_macro_id(), write_info.size_))) { + LOG_WARN("fail to check write completion", K(ret)); + } + } + + if (OB_SUCC(ret)) { + block_info.macro_id_ = macro_handle_.get_macro_id(); + block_info.nested_size_ = write_info.size_; + block_info.nested_offset_ = offset_; + offset_ += write_info.size_; + } + return ret; +} + +int ObSharedMacroBlockMgr::check_write_complete(const 
MacroBlockId ¯o_id, const int64_t macro_size) +{ + int ret = OB_SUCCESS; + ObMacroBlockReadInfo read_info; + read_info.macro_block_id_ = macro_id; + read_info.size_ = macro_size; + read_info.offset_ = offset_; + read_info.io_desc_.set_category(ObIOCategory::SYS_IO); + read_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_COMPACT_READ); + const int64_t io_timeout_ms = std::max(GCONF._data_storage_io_timeout / 1000, DEFAULT_IO_WAIT_TIME_MS); + ObMacroBlockHandle read_handle; + ObSSTableMacroBlockChecker macro_block_checker; + + if (OB_FAIL(ObBlockManager::async_read_block(read_info, read_handle))) { + LOG_WARN("fail to async read macro block", K(ret), K(read_info)); + } else if (OB_FAIL(read_handle.wait(io_timeout_ms))) { + LOG_WARN("fail to wait io finish", K(ret), K(io_timeout_ms)); + } else if (OB_FAIL(macro_block_checker.check( + read_handle.get_buffer(), + read_handle.get_data_size(), + CHECK_LEVEL_PHYSICAL))) { + LOG_WARN("fail to verify macro block", K(ret), K(macro_id)); + } + return ret; +} + +int ObSharedMacroBlockMgr::try_switch_macro_block() +{ + int ret = OB_SUCCESS; + const MacroBlockId &block_id = macro_handle_.get_macro_id(); + const int32_t used_size = offset_; + if (block_id.is_valid() && OB_FAIL(add_block(block_id, used_size))) { + LOG_WARN("fail to add cur block to map", K(ret), K(block_id)); + } else if (FALSE_IT(macro_handle_.reset())) { + } else if (FALSE_IT(offset_ = 0)) { + } else if (OB_FAIL(OB_SERVER_BLOCK_MGR.alloc_block(macro_handle_))) { + LOG_WARN("fail to alloc block for new macro block", K(ret)); + } else { + ObMacroBlockWriteInfo write_info; + ObBlockInfo block_info; + write_info.buffer_ = common_header_buf_; + write_info.size_ = header_size_; + write_info.offset_ = 0; + write_info.io_desc_.set_category(ObIOCategory::SYS_IO); + write_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_COMPACT_WRITE); + if (OB_FAIL(do_write_block(write_info, block_info))) { + LOG_WARN("fail to write common header to the shared macro block", 
K(ret), K(block_info)); + } + } + return ret; +} + +int ObSharedMacroBlockMgr::add_block(const MacroBlockId &block_id, const int64_t block_size) +{ + int ret = OB_SUCCESS; + int32_t curr_size = 0; + if (OB_UNLIKELY(!block_id.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid block id", K(ret), K(block_id), K(block_size)); + } else { + // block_size may execeeds default 2M + // since we need get_and_set used_size of blocks, we need mutex to protect array + lib::ObMutexGuard guard(blocks_mutex_); + if (OB_FAIL(block_used_size_.get(block_id, curr_size)) && OB_ENTRY_NOT_EXIST != ret) { + LOG_WARN("fail to get block id from map", K(ret), K(block_id)); + } else if ((curr_size += block_size) == 0) { + if (OB_FAIL(block_used_size_.erase(block_id))) { + LOG_WARN("fail to erase id from map", K(ret), K(block_id)); + } + } else if (OB_FAIL(block_used_size_.insert_or_update(block_id, curr_size))) { + LOG_WARN("fail to add block to map", K(ret), K(block_id), K(curr_size)); + } + } + return ret; +} + +int ObSharedMacroBlockMgr::free_block(const MacroBlockId &block_id, const int64_t block_size) +{ + int ret = OB_SUCCESS; + int32_t curr_size = 0; + if (OB_UNLIKELY(!block_id.is_valid() || block_size <= 0 + || block_size >= SMALL_SSTABLE_STHRESHOLD_SIZE)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid block size or id", K(ret), K(block_id), K(block_size)); + } else { + // since we need get_and_set used_size of blocks, we need mutex to protect array + lib::ObMutexGuard guard(blocks_mutex_); + if (OB_FAIL(block_used_size_.get(block_id, curr_size)) && OB_ENTRY_NOT_EXIST != ret) { + LOG_WARN("fail to get block id from map", K(ret), K(block_id)); + } else if ((curr_size -= block_size) == 0) { + if (OB_FAIL(block_used_size_.erase(block_id))) { + LOG_WARN("fail to erase id from map", K(ret), K(block_id)); + } + } else if (OB_FAIL(block_used_size_.insert_or_update(block_id, curr_size))) { + LOG_WARN("fail to set block used size", K(ret), K(block_id), K(block_size), 
K(curr_size)); + } + } + return ret; +} + +int ObSharedMacroBlockMgr::get_recyclable_blocks(ObIArray &block_ids, ObIAllocator &allocator) +{ + int ret = OB_SUCCESS; + { + // since we need for_loop, we need mutex to protect array + lib::ObMutexGuard guard(blocks_mutex_); + ObFixedArray recycled_block_ids(allocator); + GetSmallBlockOp getOp(block_ids, recycled_block_ids); + + if (OB_FAIL(recycled_block_ids.init(MAX_RECYCLABLE_BLOCK_CNT))) { + LOG_WARN("fail to init recycled_block_ids", K(ret)); + } else if (OB_FAIL(block_used_size_.for_each(getOp))) { + if (OB_ITER_END == getOp.get_execution_ret() && MAX_RECYCLABLE_BLOCK_CNT == block_ids.count()) { + ret = OB_SUCCESS; + FLOG_INFO("number of recyclable blocks reaches 1000", K(ret)); + } else { + LOG_WARN("fail to get recyclable blocks", K(ret), K(block_ids), K(recycled_block_ids)); + } + } + + if (OB_FAIL(ret)) { + // do nothing + } else { + int tmp_ret = OB_SUCCESS; + for (int64_t i = 0; i < recycled_block_ids.count(); ++i) { // ignore tmp_ret + const MacroBlockId &block_id = recycled_block_ids.at(i); + if (OB_TMP_FAIL(block_used_size_.erase(block_id))) { + LOG_WARN("fail to erase id from map", K(tmp_ret), K(block_id)); + } + } + } + } + return ret; +} + +int ObSharedMacroBlockMgr::defragment() +{ + int ret = OB_SUCCESS; + ObArenaAllocator task_allocator; + ObFixedArray macro_ids(task_allocator); + ObTenantTabletIterator tablet_iter(*(MTL(ObTenantMetaMemMgr*)), task_allocator); + ObSSTableIndexBuilder *sstable_index_builder = nullptr; + ObIndexBlockRebuilder *index_block_rebuilder = nullptr; + int64_t rewrite_cnt = 0; + + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("ObSharedMacroBlockMgr hasn't been initiated", K(ret)); + } else if (OB_FAIL(macro_ids.init(MAX_RECYCLABLE_BLOCK_CNT))) { + LOG_WARN("fail to init macro ids", K(ret)); + } else if (OB_FAIL(get_recyclable_blocks(macro_ids, task_allocator))) { + LOG_WARN("fail to get recycle blocks", K(ret)); + } else if (macro_ids.empty()) { + // skip following steps 
+ } else if (OB_FAIL(alloc_for_tools(task_allocator, sstable_index_builder, index_block_rebuilder))) { + LOG_WARN("fail to allocate memory for index builders", K(ret)); + } else { + ObTabletHandle tablet_handle; + while (OB_SUCC(ret)) { + if (OB_FAIL(tablet_iter.get_next_tablet(tablet_handle))) { + LOG_WARN("fail to get tablet", K(ret), K(tablet_handle)); + } else if (OB_UNLIKELY(!tablet_handle.is_valid())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("invalid tablet handle", K(ret), K(tablet_handle)); + } else if (tablet_handle.get_obj()->is_ls_inner_tablet()) { + // skip update + } else if (OB_FAIL(update_tablet( + tablet_handle, + macro_ids, + rewrite_cnt, + *sstable_index_builder, + *index_block_rebuilder))) { + LOG_WARN("fail to update tablet", K(ret)); + } + if (OB_UNLIKELY(OB_EAGAIN == ret)) { + ret = OB_SUCCESS; + } + } + } + + if (OB_ITER_END == ret || OB_SUCC(ret)) { + ret = OB_SUCCESS; + FLOG_INFO("successfully defragment data blocks", K(ret), K(rewrite_cnt)); + } + + if (nullptr != sstable_index_builder) { + sstable_index_builder->~ObSSTableIndexBuilder(); + task_allocator.free(sstable_index_builder); + sstable_index_builder = nullptr; + } + if (nullptr != index_block_rebuilder) { + index_block_rebuilder->~ObIndexBlockRebuilder(); + task_allocator.free(index_block_rebuilder); + index_block_rebuilder = nullptr; + } + + if (OB_FAIL(ret) && REACH_COUNT_INTERVAL(FAILURE_COUNT_INTERVAL)) { + LOG_ERROR("defragmentation can't be finished, something is wrong", K(ret), K(macro_ids), K(*this)); + } + + return ret; +} + +int ObSharedMacroBlockMgr::update_tablet( + const ObTabletHandle &tablet_handle, + const ObIArray ¯o_ids, + int64_t &rewrite_cnt, + ObSSTableIndexBuilder &sstable_index_builder, + ObIndexBlockRebuilder &index_block_rebuilder) +{ + int ret = OB_SUCCESS; + ObSArray table_handles; + ObTableHandleV2 sstable_handle; + ObSArray sstables; + + if (OB_FAIL(tablet_handle.get_obj()->get_all_sstables(sstables))) { + LOG_WARN("fail to get sstables of this tablet", 
K(ret)); + } + for (int64_t i = 0; i < sstables.count() && OB_SUCC(ret); i++) { + const ObSSTable *sstable = static_cast(sstables.at(i)); + if (OB_ISNULL(sstable) || !sstable->is_valid()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("the sstable is null or invalid", K(ret)); + } else if (sstable->is_small_sstable()) { + const ObIArray &data_block_ids = sstable->get_meta().get_macro_info().get_data_block_ids(); + if (OB_UNLIKELY(1 != data_block_ids.count())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("this sstable is not small", K(ret), K(data_block_ids.count()), K(sstable->is_small_sstable())); + } else if (is_contain(macro_ids, data_block_ids.at(0))) { + if (OB_FAIL(rebuild_sstable( + *(tablet_handle.get_obj()), + *sstable, + sstable_index_builder, + index_block_rebuilder, + sstable_handle))) { + LOG_WARN("fail to rebuild sstable and update tablet", K(ret)); + } else if (OB_FAIL(table_handles.push_back(sstable_handle))) { + LOG_WARN("fail to push table handle to array", K(ret)); + } + } + } + } + + if (OB_SUCC(ret) && !table_handles.empty()) { + const ObTabletMeta &tablet_meta = tablet_handle.get_obj()->get_tablet_meta(); + const share::ObLSID &ls_id = tablet_meta.ls_id_; + ObLSService *ls_svr = MTL(ObLSService*); + ObLSHandle ls_handle; + + if (OB_FAIL(ls_svr->get_ls(ls_id, ls_handle, ObLSGetMod::STORAGE_MOD))) { + LOG_WARN("fail to get ls handle", K(ret), K(ls_id)); + } else { + const int64_t rebuild_seq = ls_handle.get_ls()->get_rebuild_seq(); + if (OB_UNLIKELY(!ls_handle.is_valid())) { + LOG_WARN("la handle is invalid", K(ret), K(ls_handle)); + } else if (OB_FAIL(ls_handle.get_ls()->update_tablet_table_store( + rebuild_seq, tablet_handle, table_handles))) { + LOG_WARN("fail to replace small sstables in the tablet", K(ret), K(rebuild_seq), K(tablet_handle), K(table_handles)); + } else { + rewrite_cnt += table_handles.count(); + } + } + } + + return ret; +} + +int ObSharedMacroBlockMgr::rebuild_sstable( + const ObTablet &tablet, + const ObSSTable &old_sstable, + 
ObSSTableIndexBuilder &sstable_index_builder, + ObIndexBlockRebuilder &index_block_rebuilder, + ObTableHandleV2 &table_handle) +{ + int ret = OB_SUCCESS; + ObDataStoreDesc data_desc; + ObMergeType merge_type; + sstable_index_builder.reset(); + index_block_rebuilder.reset(); + table_handle.reset(); + ObDataMacroBlockMeta data_macro_meta; + ObMacroBlockHandle block_handle; + ObBlockInfo block_info; + ObMacroBlocksWriteCtx write_ctx; + ObSSTableMergeRes res; + const int64_t column_count = old_sstable.get_meta().get_basic_meta().column_cnt_; + + if (OB_FAIL(parse_merge_type(old_sstable, merge_type))) { + LOG_WARN("fail to parse merge type from old_sstable", K(ret)); + } else if (OB_FAIL(prepare_data_desc( + tablet, merge_type, tablet.get_snapshot_version(), GET_MIN_CLUSTER_VERSION(), data_desc))) { + LOG_WARN("fail to prepare data desc", K(ret), K(merge_type), K(tablet.get_snapshot_version())); + } else if (OB_FAIL(sstable_index_builder.init(data_desc, nullptr, ObSSTableIndexBuilder::DISABLE))) { + LOG_WARN("fail to init sstable index builder", K(ret), K(data_desc)); + } else if (OB_FAIL(index_block_rebuilder.init(sstable_index_builder))) { + LOG_WARN("fail to init index block rebuilder", K(ret)); + } else if (OB_FAIL(read_sstable_block(old_sstable, block_handle))) { + LOG_WARN("fail to read old_sstable's block", K(ret)); + } else if (OB_FAIL(write_block( + block_handle.get_buffer(), block_handle.get_data_size(), block_info, write_ctx))) { + LOG_WARN("fail to write old_sstable's buf to new block", K(ret)); + } else if (OB_FAIL(index_block_rebuilder.append_macro_row( + block_handle.get_buffer(), block_handle.get_data_size(), block_info.macro_id_))) { + LOG_WARN("fail to append macro row", K(ret), K(block_info)); + } else if (OB_FAIL(index_block_rebuilder.close())) { + LOG_WARN("fail to close index block rebuilder", K(ret)); + } else if (OB_FAIL(sstable_index_builder.close(column_count, res))) { + LOG_WARN("fail to close sstable index builder", K(ret), K(column_count)); 
+ } else if (OB_FAIL(create_new_sstable(res, tablet, old_sstable, block_info, table_handle))) { + LOG_WARN("fail to create new sstable", K(ret)); + } else { + ObSSTable *new_sstable = nullptr; + if (OB_FAIL(table_handle.get_sstable(new_sstable))) { + LOG_WARN("fail to get new sstable", K(table_handle)); + } else if (OB_FAIL(new_sstable->set_upper_trans_version(old_sstable.get_meta().get_basic_meta().upper_trans_version_))) { + LOG_WARN("fail to update upper trans version", K(ret), K(old_sstable.get_meta().get_basic_meta().upper_trans_version_)); + } else if (OB_UNLIKELY(new_sstable->get_key() != old_sstable.get_key()) + || OB_FAIL(ObSSTableMetaChecker::check_sstable_meta_strict_equality(old_sstable.get_meta(), new_sstable->get_meta()))) { + ret = OB_INVALID_DATA; + LOG_WARN("new sstable is not equal to old sstable", K(ret), K(new_sstable->get_key()), K(old_sstable.get_key())); + } + } + + return ret; +} + +int ObSharedMacroBlockMgr::create_new_sstable( + const ObSSTableMergeRes &res, + const ObTablet &tablet, + const ObSSTable &old_table, + const ObBlockInfo &block_info, + ObTableHandleV2 &table_handle) const +{ + int ret = OB_SUCCESS; + const ObStorageSchema &storage_schema = tablet.get_storage_schema(); + const ObSSTableBasicMeta &basic_meta = old_table.get_meta().get_basic_meta(); + table_handle.reset(); + ObTabletCreateSSTableParam param; + + param.filled_tx_scn_ = basic_meta.filled_tx_scn_; + param.ddl_scn_ = basic_meta.ddl_scn_; + param.table_key_ = old_table.get_key(); + param.table_mode_ = basic_meta.table_mode_; + param.index_type_ = static_cast(basic_meta.index_type_); + param.schema_version_ = basic_meta.schema_version_; + param.create_snapshot_version_ = basic_meta.create_snapshot_version_; + param.progressive_merge_round_ = basic_meta.progressive_merge_round_; + param.progressive_merge_step_ = basic_meta.progressive_merge_step_; + param.rowkey_column_cnt_ = basic_meta.rowkey_column_count_; + + ObSSTableMergeRes::fill_addr_and_data(res.root_desc_, + 
param.root_block_addr_, param.root_block_data_); + ObSSTableMergeRes::fill_addr_and_data(res.data_root_desc_, + param.data_block_macro_meta_addr_, param.data_block_macro_meta_); + param.root_row_store_type_ = res.root_desc_.row_type_; + param.data_index_tree_height_ = res.root_desc_.height_; + param.index_blocks_cnt_ = res.index_blocks_cnt_; + param.data_blocks_cnt_ = res.data_blocks_cnt_; + param.micro_block_cnt_ = res.micro_block_cnt_; + param.use_old_macro_block_count_ = res.use_old_macro_block_count_; + param.row_count_ = res.row_count_; + param.column_cnt_ = res.data_column_cnt_; + param.data_checksum_ = res.data_checksum_; + param.occupy_size_ = res.occupy_size_; + param.original_size_ = res.original_size_; + param.max_merged_trans_version_ = res.max_merged_trans_version_; + param.contain_uncommitted_row_ = res.contain_uncommitted_row_; + param.compressor_type_ = res.compressor_type_; + param.encrypt_id_ = res.encrypt_id_; + param.master_key_id_ = res.master_key_id_; + param.data_block_ids_ = res.data_block_ids_; + param.is_meta_root_ = res.data_root_desc_.is_meta_root_; + param.nested_offset_ = block_info.nested_offset_; + param.nested_size_ = block_info.nested_size_; + param.other_block_ids_ = res.other_block_ids_; + MEMCPY(param.encrypt_key_, res.encrypt_key_, share::OB_MAX_TABLESPACE_ENCRYPT_KEY_LENGTH); + + if (param.table_key_.is_major_sstable()) { + if (OB_FAIL(res.fill_column_checksum(&storage_schema, param.column_checksums_))) { + LOG_WARN("fail to fill column checksum", K(ret), K(res)); + } + } + if (OB_FAIL(ret)) { + // do nothing + } else if (OB_FAIL(ObTabletCreateDeleteHelper::create_sstable(param, table_handle))) { + LOG_WARN("fail to create sstable", K(ret), K(param)); + } + + return ret; +} + +int ObSharedMacroBlockMgr::prepare_data_desc( + const ObTablet &tablet, + const ObMergeType &merge_type, + const int64_t snapshot_version, + const int64_t cluster_version, + ObDataStoreDesc &data_desc) const +{ + int ret = OB_SUCCESS; + 
data_desc.reset(); + if (OB_FAIL(data_desc.init( + tablet.get_storage_schema(), + tablet.get_tablet_meta().ls_id_, + tablet.get_tablet_meta().tablet_id_, + merge_type, + snapshot_version, + cluster_version))) { + LOG_WARN("fail to init data store desc", K(ret), + K(tablet), K(merge_type), K(snapshot_version), K(cluster_version)); + } else { + data_desc.row_column_count_ = data_desc.rowkey_column_count_ + 1; + data_desc.col_desc_array_.reset(); + data_desc.need_prebuild_bloomfilter_ = false; + if (OB_FAIL(data_desc.col_desc_array_.init(data_desc.row_column_count_))) { + LOG_WARN("fail to reserve column desc array", K(ret)); + } else if (OB_FAIL(tablet.get_storage_schema().get_rowkey_column_ids(data_desc.col_desc_array_))) { + LOG_WARN("fail to get rowkey column ids", K(ret)); + } else if (OB_FAIL(storage::ObMultiVersionRowkeyHelpper::add_extra_rowkey_cols(data_desc.col_desc_array_))) { + LOG_WARN("fail to add extra rowkey cols", K(ret)); + } else { + ObObjMeta meta; + meta.set_varchar(); + meta.set_collation_type(CS_TYPE_BINARY); + share::schema::ObColDesc col; + col.col_id_ = static_cast(data_desc.row_column_count_ + OB_APP_MIN_COLUMN_ID); + col.col_type_ = meta; + col.col_order_ = DESC; + if (OB_FAIL(data_desc.col_desc_array_.push_back(col))) { + LOG_WARN("fail to push back last col for index", K(ret), K(col)); + } + } + } + return ret; +} + +int ObSharedMacroBlockMgr::parse_merge_type(const ObSSTable &sstable, ObMergeType &merge_type) const +{ + int ret = OB_SUCCESS; + merge_type = ObMergeType::INVALID_MERGE_TYPE; + + if (sstable.is_major_sstable()) { + merge_type = ObMergeType::MAJOR_MERGE; + } else if (sstable.is_minor_sstable()) { + merge_type = ObMergeType::MINI_MINOR_MERGE; + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("sstable type is unexpected", K(ret), K(sstable)); + } + return ret; +} + +int ObSharedMacroBlockMgr::alloc_for_tools( + ObIAllocator &allocator, + ObSSTableIndexBuilder *&sstable_index_builder, + ObIndexBlockRebuilder 
*&index_block_rebuilder) +{ + int ret = OB_SUCCESS; + void *buf = nullptr; + if (OB_ISNULL(buf = allocator.alloc(sizeof(ObSSTableIndexBuilder)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("fail to allocate memory for sstable index builder", K(ret)); + } else if (FALSE_IT(sstable_index_builder = new (buf) ObSSTableIndexBuilder)) { + } else if (OB_ISNULL(buf = allocator.alloc(sizeof(ObIndexBlockRebuilder)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("allocate memory failed", K(ret)); + } else if (FALSE_IT(index_block_rebuilder = new (buf) ObIndexBlockRebuilder)) { + LOG_WARN("fail to allocate memory for index rebuilder", K(ret)); + } + return ret; +} + +int ObSharedMacroBlockMgr::read_sstable_block( + const ObSSTable &sstable, + ObMacroBlockHandle &block_handle) +{ + int ret = OB_SUCCESS; + ObMacroBlockReadInfo read_info; + const ObSSTableMacroInfo ¯o_info = sstable.get_meta().get_macro_info(); + read_info.macro_block_id_ = macro_info.get_data_block_ids().at(0); + read_info.offset_ = macro_info.get_nested_offset(); + read_info.size_ = upper_align(macro_info.get_nested_size(), DIO_READ_ALIGN_SIZE); + read_info.io_desc_.set_category(ObIOCategory::SYS_IO); + read_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_COMPACT_READ); + + if (OB_FAIL(ObBlockManager::read_block(read_info, block_handle))) { + LOG_WARN("fail to read block", K(ret), K(read_info)); + } else if (OB_UNLIKELY(!block_handle.is_valid() + || macro_info.get_nested_size() != block_handle.get_data_size())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("block handle is invalid", K(ret), K(block_handle)); + } + return ret; +} + +/** + * ---------------------------------------ObBlockDefragmentationTask---------------------------------------- + */ +void ObSharedMacroBlockMgr::ObBlockDefragmentationTask::runTimerTask() +{ + int ret = OB_SUCCESS; + if (OB_FAIL(shared_mgr_.defragment())) { + LOG_WARN("fail to defragment small sstables", K(ret)); + } +} + +} // namespace blocksstable +} // namespace 
oceanbase \ No newline at end of file diff --git a/src/storage/blocksstable/ob_shared_macro_block_manager.h b/src/storage/blocksstable/ob_shared_macro_block_manager.h new file mode 100644 index 0000000000..bef58e0d67 --- /dev/null +++ b/src/storage/blocksstable/ob_shared_macro_block_manager.h @@ -0,0 +1,188 @@ +/** + * Copyright (c) 2021 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. + */ + +#ifndef OCEANBASE_BLOCKSSTABLE_OB_SHARED_MACRO_BLOCK_MANAGER +#define OCEANBASE_BLOCKSSTABLE_OB_SHARED_MACRO_BLOCK_MANAGER + +#include "storage/blocksstable/ob_macro_block_handle.h" +#include "lib/hash/ob_linear_hash_map.h" +#include "storage/blocksstable/ob_block_manager.h" +#include "lib/task/ob_timer.h" + +namespace oceanbase +{ +namespace share +{ +class ObLSID; +} +namespace blocksstable +{ +struct ObMacroBlocksWriteCtx; +class ObSSTableIndexBuilder; +class ObIndexBlockRebuilder; +class ObSSTableSecMetaIterator; +class ObSSTableMergeRes; +struct ObBlockInfo +{ +public: + ObBlockInfo() + : nested_size_(OB_DEFAULT_MACRO_BLOCK_SIZE), nested_offset_(0), macro_id_() + { + } + ~ObBlockInfo(); + void reset(); + bool is_valid() const; + bool is_small_sstable() const; + TO_STRING_KV(K_(nested_size), K_(nested_offset), K_(macro_id)); +public: + int64_t nested_size_; + int64_t nested_offset_; + MacroBlockId macro_id_; +}; + +// set it as a member variable of t3m in 4.1 +class ObSharedMacroBlockMgr final +{ +public: + ObSharedMacroBlockMgr(); + ~ObSharedMacroBlockMgr(); + void destroy(); + int init(); + int start(); + void stop(); + void 
wait(); + int write_block(const char* buf, const int64_t size, ObBlockInfo &block_info, ObMacroBlocksWriteCtx &write_ctx); + int add_block(const MacroBlockId &block_id, const int64_t block_size); + int free_block(const MacroBlockId &block_id, const int64_t block_size); + + TO_STRING_KV(K_(macro_handle), K_(offset), K_(header_size)); + + static int mtl_init(ObSharedMacroBlockMgr* &shared_block_mgr); + +private: + class ObBlockDefragmentationTask : public common::ObTimerTask + { + public: + ObBlockDefragmentationTask(ObSharedMacroBlockMgr &shared_mgr) + : shared_mgr_(shared_mgr) {} + virtual ~ObBlockDefragmentationTask() = default; + virtual void runTimerTask() override; + + private: + ObSharedMacroBlockMgr &shared_mgr_; + }; + + struct GetSmallBlockOp + { + public: + GetSmallBlockOp(ObIArray &block_ids, ObIArray &unused_block_ids) + : block_ids(block_ids), unused_block_ids(unused_block_ids), execution_ret_(OB_SUCCESS) {} + bool operator()(const MacroBlockId &id, const int32_t used_size) + { + int ret = OB_SUCCESS; + bool bool_ret = true; + bool is_free = false; + if (OB_UNLIKELY(MAX_RECYCLABLE_BLOCK_CNT == block_ids.count())) { + bool_ret = false; + execution_ret_ = OB_ITER_END; + } else if (OB_FAIL(OB_SERVER_BLOCK_MGR.check_macro_block_free(id, is_free))) { + STORAGE_LOG(WARN, "fail to check macro block free", K(ret), K(id)); + } else if (is_free && unused_block_ids.count() < MAX_RECYCLABLE_BLOCK_CNT) { + if (OB_FAIL(unused_block_ids.push_back(id))) { + STORAGE_LOG(WARN, "fail to push unused block id", K(ret), K(id)); + } else { + bool_ret = true; + } + } else if (used_size > 0 && used_size < RECYCLABLE_BLOCK_SIZE) { + if (OB_FAIL(block_ids.push_back(id))) { + STORAGE_LOG(WARN, "fail to get small block", K(ret), K(id)); + } else { + bool_ret = true; + } + } + if (OB_FAIL(ret)) { + bool_ret = false; + } + return bool_ret; + } + int64_t get_execution_ret() { return execution_ret_; } + private: + ObIArray &block_ids; + ObIArray &unused_block_ids; + int64_t 
execution_ret_; // if the number of recyclable blocks reaches 1000, set it to OB_ITER_END + }; + +private: + int defragment(); + int get_recyclable_blocks(ObIArray &block_ids, common::ObIAllocator &allocator); + int update_tablet( + const ObTabletHandle &tablet_handle, + const ObIArray ¯o_ids, + int64_t &rewrite_cnt, + ObSSTableIndexBuilder &sstable_index_builder, + ObIndexBlockRebuilder &index_block_rebuilder); + int rebuild_sstable( + const ObTablet &tablet, + const ObSSTable &old_sstable, + ObSSTableIndexBuilder &sstable_index_builder, + ObIndexBlockRebuilder &index_block_rebuilder, + ObTableHandleV2 &table_handle); + int prepare_data_desc( + const ObTablet &tablet, + const ObMergeType &merge_type, + const int64_t snapshot_version, + const int64_t cluster_version, + ObDataStoreDesc &data_desc) const; + int alloc_for_tools( + common::ObIAllocator &allocator, + ObSSTableIndexBuilder *&sstable_index_builder, + ObIndexBlockRebuilder *&index_block_rebuilder); + int read_sstable_block( + const ObSSTable &sstable, + ObMacroBlockHandle &block_handle); + int create_new_sstable( + const ObSSTableMergeRes &res, + const ObTablet &tablet, + const ObSSTable &old_table, + const ObBlockInfo &block_info, + ObTableHandleV2 &table_handle) const; + int parse_merge_type(const ObSSTable &sstable, ObMergeType &merge_type) const; + int try_switch_macro_block(); + int check_write_complete(const MacroBlockId ¯o_id, const int64_t macro_size); + int do_write_block(const ObMacroBlockWriteInfo &write_info, ObBlockInfo &block_info); + int init_block(); + DISALLOW_COPY_AND_ASSIGN(ObSharedMacroBlockMgr); + +private: + const static int64_t DEFRAGMENT_DELAY_US = 30 * 1000 * 1000; // 30s + const static int64_t SMALL_SSTABLE_STHRESHOLD_SIZE = 1L << 20; // 1MB + const static int64_t RECYCLABLE_BLOCK_SIZE = (2L << 20) * 3 / 10; // 2MB * 30% + const static int64_t MAX_RECYCLABLE_BLOCK_CNT = 1000; // return at most 1k blocks once to avoid timeout + const static int64_t FAILURE_COUNT_INTERVAL = 10; + 
+private: + ObMacroBlockHandle macro_handle_; + int64_t offset_; + char *common_header_buf_; + int64_t header_size_; + lib::ObMutex mutex_; + lib::ObMutex blocks_mutex_; // protect block_used_size_ + ObLinearHashMap block_used_size_; + ObBlockDefragmentationTask defragmentation_task_; + common::ObTimer timer_; + bool is_inited_; +}; + +} // namespace blocksstable +} // namespace oceanbase + +#endif \ No newline at end of file diff --git a/src/storage/blocksstable/ob_sstable.cpp b/src/storage/blocksstable/ob_sstable.cpp index 25b9c32565..37e447248a 100644 --- a/src/storage/blocksstable/ob_sstable.cpp +++ b/src/storage/blocksstable/ob_sstable.cpp @@ -29,6 +29,7 @@ #include "storage/tablet/ob_tablet_create_sstable_param.h" #include "storage/meta_mem/ob_tenant_meta_mem_mgr.h" #include "storage/compaction/ob_tenant_tablet_scheduler.h" +#include "storage/blocksstable/ob_shared_macro_block_manager.h" namespace oceanbase { @@ -94,6 +95,7 @@ void ObSSTable::reset() // dec ref first, then reset sstable meta if (hold_macro_ref_) { dec_macro_ref(); + dec_used_size(); } meta_.reset(); valid_for_reading_ = false; @@ -922,6 +924,9 @@ void ObSSTable::dec_macro_ref() LOG_ERROR("fail to dec other block ref cnt", K(ret), K(macro_id), K(idx)); } } + if (OB_FAIL(dec_used_size())) {// ignore ret + LOG_ERROR("fail to dec used size of shared block", K(ret)); + } hold_macro_ref_ = false; } @@ -946,6 +951,9 @@ int ObSSTable::add_macro_ref() ++j; } } + if (OB_SUCC(ret) && OB_FAIL(add_used_size())) { + LOG_ERROR("fail to add used size", K(ret)); + } if (OB_FAIL(ret)) { int tmp_ret = OB_SUCCESS; int64_t idx = i - 1; @@ -999,6 +1007,9 @@ int ObSSTable::add_disk_ref() ++k; } } + if (OB_SUCC(ret) && OB_FAIL(add_used_size())) { + LOG_ERROR("fail to add used size", K(ret)); + } if (OB_FAIL(ret)) { int tmp_ret = OB_SUCCESS; int64_t idx = i - 1; @@ -1056,6 +1067,9 @@ int ObSSTable::dec_disk_ref() ++k; } } + if (OB_SUCC(ret) && OB_FAIL(dec_used_size())) { + LOG_ERROR("fail to dec used size of shared 
block", K(ret)); + } if (OB_FAIL(ret)) { int tmp_ret = OB_SUCCESS; int64_t idx = i - 1; @@ -1083,6 +1097,44 @@ int ObSSTable::dec_disk_ref() return ret; } +int ObSSTable::add_used_size() +{ + int ret = OB_SUCCESS; + if (is_small_sstable()) { + const ObSSTableMacroInfo ¯o_info = meta_.get_macro_info(); + const ObIArray &data_block_ids = macro_info.get_data_block_ids(); + ObSharedMacroBlockMgr *shared_block_mgr = MTL(ObSharedMacroBlockMgr*); + if (data_block_ids.count() == 0) { // skip + } else if (data_block_ids.count() != 1) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected data block ids", K(ret), K(macro_info)); + } else if (OB_FAIL(shared_block_mgr->add_block( + data_block_ids.at(0), meta_.get_macro_info().get_nested_size()))) { + LOG_WARN("fail to add used size of shared block", K(ret), K_(meta)); + } + } + return ret; +} + +int ObSSTable::dec_used_size() +{ + int ret = OB_SUCCESS; + if (is_small_sstable()) { + const ObSSTableMacroInfo ¯o_info = meta_.get_macro_info(); + const ObIArray &data_block_ids = macro_info.get_data_block_ids(); + ObSharedMacroBlockMgr *shared_block_mgr = MTL(ObSharedMacroBlockMgr*); + if (data_block_ids.count() == 0) { // skip + } else if (data_block_ids.count() != 1) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected data block ids", K(ret), K(macro_info)); + } else if (OB_FAIL(shared_block_mgr->free_block( + data_block_ids.at(0), meta_.get_macro_info().get_nested_size()))) { + LOG_WARN("fail to dec used size of shared block", K(ret), K_(meta)); + } + } + return ret; +} + int ObSSTable::pre_transform_root_block(const ObTableReadInfo &index_read_info) { return meta_.transform_root_block_data(index_read_info); diff --git a/src/storage/blocksstable/ob_sstable.h b/src/storage/blocksstable/ob_sstable.h index 126a91f54e..bf4b692301 100644 --- a/src/storage/blocksstable/ob_sstable.h +++ b/src/storage/blocksstable/ob_sstable.h @@ -131,8 +131,13 @@ public: int dec_disk_ref(); int pre_transform_root_block(const ObTableReadInfo 
&index_read_info); int set_status_for_read(const ObSSTableStatus status); + OB_INLINE int64_t get_macro_offset() const { return meta_.get_macro_info().get_nested_offset(); } OB_INLINE bool is_valid() const { return valid_for_reading_; } OB_INLINE const blocksstable::ObSSTableMeta &get_meta() const { return meta_; } + OB_INLINE int64_t get_macro_read_size() const { return meta_.get_macro_info().get_nested_size(); } + OB_INLINE bool is_small_sstable() const { + return OB_DEFAULT_MACRO_BLOCK_SIZE != meta_.get_macro_info().get_nested_size() + && 0 < meta_.get_macro_info().get_nested_size(); } OB_INLINE int get_index_tree_root( const ObTableReadInfo &index_read_info, blocksstable::ObMicroBlockData &index_data, @@ -174,6 +179,8 @@ public: private: int check_valid_for_reading(); int add_macro_ref(); + int add_used_size(); + int dec_used_size(); int build_exist_iterator( const ObTableIterParam &iter_param, const ObDatumRowkey &rowkey, diff --git a/src/storage/blocksstable/ob_sstable_meta.cpp b/src/storage/blocksstable/ob_sstable_meta.cpp index b23bbff7b6..fdbf95e7f5 100644 --- a/src/storage/blocksstable/ob_sstable_meta.cpp +++ b/src/storage/blocksstable/ob_sstable_meta.cpp @@ -343,13 +343,13 @@ int ObSSTableBasicMeta::set_upper_trans_version(const int64_t upper_trans_versio //================================== ObSSTableMeta ================================== ObSSTableMeta::ObSSTableMeta() - : basic_meta_(), - column_checksums_(), - data_root_info_(), - macro_info_(), + : is_inited_(false), allocator_(nullptr), lock_(), - is_inited_(false) + basic_meta_(), + column_checksums_(), + data_root_info_(), + macro_info_() { } @@ -466,6 +466,7 @@ int ObSSTableMeta::init_base_meta( basic_meta_.encrypt_id_ = param.encrypt_id_; basic_meta_.master_key_id_ = param.master_key_id_; MEMCPY(basic_meta_.encrypt_key_, param.encrypt_key_, share::OB_MAX_TABLESPACE_ENCRYPT_KEY_LENGTH); + basic_meta_.length_ = basic_meta_.get_serialize_size(); if 
(OB_FAIL(prepare_column_checksum(param.column_checksums_))) { LOG_WARN("fail to prepare column checksum", K(ret), K(param)); } @@ -824,6 +825,23 @@ int64_t ObMigrationSSTableParam::get_serialize_size_() const return len; } +int ObSSTableMetaChecker::check_sstable_meta_strict_equality( + const ObSSTableMeta &old_sstable_meta, + const ObSSTableMeta &new_sstable_meta) +{ + int ret = OB_SUCCESS; + + if (!old_sstable_meta.is_valid() || !new_sstable_meta.is_valid()) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("old sstable meta or new sstable meta is invalid", K(ret)); + } else if (OB_UNLIKELY(old_sstable_meta.get_basic_meta() != new_sstable_meta.get_basic_meta() + || !is_array_equal(old_sstable_meta.get_col_checksum(), new_sstable_meta.get_col_checksum()))) { + ret = OB_INVALID_DATA; + LOG_WARN("new sstable meta is not equal to old sstable meta", K(ret)); + } + + return ret; +} int ObSSTableMetaChecker::check_sstable_meta( const ObSSTableMeta &old_sstable_meta, @@ -836,6 +854,7 @@ int ObSSTableMetaChecker::check_sstable_meta( LOG_WARN("old sstable meta or new sstable meta is invalid", K(ret), K(old_sstable_meta), K(new_sstable_meta)); } else if (OB_FAIL(check_sstable_basic_meta_(old_sstable_meta.get_basic_meta(), new_sstable_meta.get_basic_meta()))) { LOG_WARN("failed to check sstable basic meta", K(ret), K(old_sstable_meta), K(new_sstable_meta)); + // TODO replace check_sstable_column_checksum_ with is_array_equal } else if (OB_FAIL(check_sstable_column_checksum_(old_sstable_meta.get_col_checksum(), new_sstable_meta.get_col_checksum()))) { LOG_WARN("failed to check sstable column checksum", K(ret), K(old_sstable_meta), K(new_sstable_meta)); } diff --git a/src/storage/blocksstable/ob_sstable_meta.h b/src/storage/blocksstable/ob_sstable_meta.h index 3de1e06564..3df078f541 100644 --- a/src/storage/blocksstable/ob_sstable_meta.h +++ b/src/storage/blocksstable/ob_sstable_meta.h @@ -126,7 +126,9 @@ public: void reset(); OB_INLINE bool is_valid() const { return is_inited_; } 
OB_INLINE bool contain_uncommitted_row() const { return basic_meta_.contain_uncommitted_row_; } - OB_INLINE bool is_empty() const { return 0 == basic_meta_.data_macro_block_count_; } + OB_INLINE bool is_empty() const { + return 0 == basic_meta_.data_macro_block_count_; + } OB_INLINE ObSSTableBasicMeta &get_basic_meta() { return basic_meta_; } OB_INLINE const ObSSTableBasicMeta &get_basic_meta() const { return basic_meta_; } OB_INLINE const common::ObIArray &get_col_checksum() const { return column_checksums_; } @@ -176,14 +178,13 @@ private: static const int64_t SSTABLE_META_VERSION = 1; typedef common::ObFixedArray ColChecksumArray; private: + bool is_inited_; + common::ObIAllocator *allocator_; + common::TCRWLock lock_; ObSSTableBasicMeta basic_meta_; ColChecksumArray column_checksums_; ObRootBlockInfo data_root_info_; ObSSTableMacroInfo macro_info_; - // The following fields don't to persist - common::ObIAllocator *allocator_; - common::TCRWLock lock_; - bool is_inited_; DISALLOW_COPY_AND_ASSIGN(ObSSTableMeta); }; @@ -212,6 +213,9 @@ private: class ObSSTableMetaChecker { public: + static int check_sstable_meta_strict_equality( + const ObSSTableMeta &old_sstable_meta, + const ObSSTableMeta &new_sstable_meta); static int check_sstable_meta( const ObSSTableMeta &old_sstable_meta, const ObSSTableMeta &new_sstable_meta); diff --git a/src/storage/blocksstable/ob_sstable_meta_info.cpp b/src/storage/blocksstable/ob_sstable_meta_info.cpp index 1d05da7e3e..9bb3066f2f 100644 --- a/src/storage/blocksstable/ob_sstable_meta_info.cpp +++ b/src/storage/blocksstable/ob_sstable_meta_info.cpp @@ -371,7 +371,10 @@ ObSSTableMacroInfo::ObSSTableMacroInfo() data_block_ids_(), other_block_ids_(), linked_block_ids_(), - entry_id_() + entry_id_(), + is_meta_root_(false), + nested_offset_(0), + nested_size_(0) { } @@ -392,6 +395,9 @@ int ObSSTableMacroInfo::init_macro_info( param.data_block_macro_meta_addr_, param.data_block_macro_meta_))) { LOG_WARN("fail to init macro meta info", 
K(ret), K(param)); } else { + is_meta_root_ = param.is_meta_root_; + nested_offset_ = param.nested_offset_; + nested_size_ = 0 == param.nested_size_ ? OB_DEFAULT_MACRO_BLOCK_SIZE : param.nested_size_; data_block_ids_.set_allocator(allocator); other_block_ids_.set_allocator(allocator); linked_block_ids_.set_allocator(allocator); @@ -420,7 +426,10 @@ void ObSSTableMacroInfo::reset() data_block_ids_.reset(); other_block_ids_.reset(); entry_id_.reset(); + is_meta_root_ = false; reset_linked_block_list(); + nested_offset_ = 0; + nested_size_ = 0; } int ObSSTableMacroInfo::serialize(char *buf, const int64_t buf_len, int64_t &pos) const @@ -483,6 +492,15 @@ int ObSSTableMacroInfo::serialize_(char *buf, const int64_t buf_len, int64_t &po LOG_WARN("fail to serialize other id array", K(ret), K(other_block_ids_), K(buf_len), K(pos)); } } + if (OB_FAIL(ret)) { + // do nothing + } else if (OB_FAIL(serialization::encode_bool(buf, buf_len, pos, is_meta_root_))) { + LOG_WARN("fail to serialize is_meta_root_", K(ret), K(is_meta_root_), K(buf_len), K(pos)); + } else if (OB_FAIL(serialization::encode_i64(buf, buf_len, pos, nested_offset_))) { + LOG_WARN("fail to serialize nested_offset_", K(ret), K(buf_len), K(pos), K(nested_offset_)); + } else if (OB_FAIL(serialization::encode_i64(buf, buf_len, pos, nested_size_))) { + LOG_WARN("fail to serialize nested_size_", K(ret), K(buf_len), K(pos), K(nested_size_)); + } return ret; } @@ -596,6 +614,7 @@ int ObSSTableMacroInfo::deserialize_( int ret = OB_SUCCESS; data_block_ids_.set_allocator(allocator); other_block_ids_.set_allocator(allocator); + nested_size_ = OB_DEFAULT_MACRO_BLOCK_SIZE; if (OB_FAIL(macro_meta_info_.deserialize(allocator, des_meta, buf, data_len, pos))) { LOG_WARN("fail to deserialize macro meta info", K(ret), K(des_meta), K(data_len), K(pos)); @@ -611,6 +630,15 @@ int ObSSTableMacroInfo::deserialize_( LOG_WARN("fail to deserialize other block ids", K(ret), KP(buf), K(data_len), K(pos)); } } + if (OB_FAIL(ret)) { + // do 
nothing + } else if (pos < data_len && OB_FAIL(serialization::decode_bool(buf, data_len, pos, &is_meta_root_))) { + LOG_WARN("fail to deserialize is_meta_root_", K(ret)); + } else if (pos < data_len && OB_FAIL(serialization::decode_i64(buf, data_len, pos, &nested_offset_))) { + LOG_WARN("fail to deserialize nested_offset_", K(ret)); + } else if (pos < data_len && OB_FAIL(serialization::decode_i64(buf, data_len, pos, &nested_size_))) { + LOG_WARN("fail to deserialize nested_size_", K(ret)); + } return ret; } @@ -660,6 +688,9 @@ int64_t ObSSTableMacroInfo::get_serialize_size_() const len += data_block_ids_.get_serialize_size(); len += other_block_ids_.get_serialize_size(); } + len += serialization::encoded_length_bool(is_meta_root_); + len += serialization::encoded_length_i64(nested_offset_); + len += serialization::encoded_length_i64(nested_size_); return len; } @@ -670,7 +701,10 @@ DEF_TO_STRING(ObSSTableMacroInfo) J_KV(K_(macro_meta_info), K(data_block_ids_.count()), K(other_block_ids_.count()), - K(linked_block_ids_.count())); + K(linked_block_ids_.count()), + K(is_meta_root_), + K(nested_offset_), + K(nested_size_)); J_OBJ_END(); return pos; } diff --git a/src/storage/blocksstable/ob_sstable_meta_info.h b/src/storage/blocksstable/ob_sstable_meta_info.h index f495c6451a..91a87688ec 100644 --- a/src/storage/blocksstable/ob_sstable_meta_info.h +++ b/src/storage/blocksstable/ob_sstable_meta_info.h @@ -66,6 +66,7 @@ private: const int64_t data_len, int64_t &pos); int64_t get_serialize_size_() const; + protected: storage::ObMetaDiskAddr addr_; ObMicroBlockData block_data_; @@ -116,10 +117,22 @@ public: { return linked_block_ids_; } + OB_INLINE bool is_meta_root() const + { + return is_meta_root_; + } OB_INLINE int64_t get_total_block_cnt() const { return data_block_ids_.count() + other_block_ids_.count() + linked_block_ids_.count(); } + OB_INLINE int64_t get_nested_offset() const + { + return nested_offset_; + } + OB_INLINE int64_t get_nested_size() const + { + return 
nested_size_; + } DECLARE_TO_STRING; private: int serialize_(char *buf, const int64_t buf_len, int64_t &pos) const; @@ -153,6 +166,9 @@ private: MacroIdFixedList other_block_ids_; MacroIdFixedList linked_block_ids_; MacroBlockId entry_id_; + bool is_meta_root_; + int64_t nested_offset_; + int64_t nested_size_; DISALLOW_COPY_AND_ASSIGN(ObSSTableMacroInfo); }; diff --git a/src/storage/blocksstable/ob_sstable_sec_meta_iterator.cpp b/src/storage/blocksstable/ob_sstable_sec_meta_iterator.cpp index 56b85a0613..a34571c53c 100644 --- a/src/storage/blocksstable/ob_sstable_sec_meta_iterator.cpp +++ b/src/storage/blocksstable/ob_sstable_sec_meta_iterator.cpp @@ -15,6 +15,7 @@ #include "share/rc/ob_tenant_base.h" #include "share/schema/ob_table_param.h" #include "ob_sstable_sec_meta_iterator.h" +#include "storage/blocksstable/ob_shared_macro_block_manager.h" #include "storage/blocksstable/ob_logic_macro_id.h" namespace oceanbase @@ -67,6 +68,7 @@ int ObSSTableSecMetaIterator::open( const int64_t sample_step) { int ret = OB_SUCCESS; + bool is_meta_root = false; if (IS_INIT) { ret = OB_INIT_TWICE; LOG_WARN("Fail to open sstable secondary meta iterator", K(ret)); @@ -88,6 +90,7 @@ int ObSSTableSecMetaIterator::open( query_range_ = &query_range; is_reverse_scan_ = is_reverse_scan; block_cache_ = &ObStorageCacheSuite::get_instance().get_block_cache(); + is_meta_root = sstable_meta_->get_macro_info().is_meta_root(); } if (OB_FAIL(ret) || is_prefetch_end_) { @@ -103,6 +106,8 @@ int ObSSTableSecMetaIterator::open( - ObMultiVersionRowkeyHelpper::get_extra_rowkey_col_cnt(); is_range_end_key_multi_version_ = schema_rowkey_cnt < query_range.get_end_key().get_datum_cnt(); + } + if (OB_SUCC(ret) && !is_prefetch_end_ && !is_meta_root) { bool start_key_beyond_range = false; bool end_key_beyond_range = false; if (is_reverse_scan) { @@ -139,27 +144,31 @@ int ObSSTableSecMetaIterator::open( } else if (OB_UNLIKELY(start_key_beyond_range)) { ret = OB_BEYOND_THE_RANGE; set_iter_end(); - } else if 
(OB_FAIL(prefetch_micro_block(1 /* fetch first micro block */))) { - LOG_WARN("Fail to prefetch next micro block", K(ret), K_(is_prefetch_end)); - } else if (OB_FAIL(row_.init(allocator, index_read_info_->get_request_count()))) { - STORAGE_LOG(WARN, "Failed to init datum row", K(ret)); - } else { - if (sample_step != 0) { - // is sample scan - const int64_t start_offset = sample_step > 1 ? (sample_step / 2 - 1) : 0; - step_cnt_ = is_reverse_scan ? (-sample_step) : sample_step; - curr_block_idx_ = is_reverse_scan ? (-1 - start_offset) : start_offset; - } else { - step_cnt_ = is_reverse_scan ? -1 : 1; - curr_block_idx_ = is_reverse_scan ? -1 : 0; - } - curr_block_start_idx_ = 1; - curr_block_end_idx_ = -1; - is_inited_ = true; - LOG_DEBUG("Open secondary meta iterator", K(ret), K(meta_type), K(is_reverse_scan), - K(sample_step), K_(step_cnt), K_(curr_block_idx), K_(tenant_id), KPC_(query_range)); } } + + if (OB_FAIL(ret) || is_prefetch_end_) { + // do nothing + } else if (!is_meta_root && OB_FAIL(prefetch_micro_block(1 /* fetch first micro block */))) { + LOG_WARN("Fail to prefetch next micro block", K(ret), K_(is_prefetch_end)); + } else if (OB_FAIL(row_.init(allocator, index_read_info_->get_request_count()))) { + STORAGE_LOG(WARN, "Failed to init datum row", K(ret)); + } else { + if (sample_step != 0) { + // is sample scan + const int64_t start_offset = sample_step > 1 ? (sample_step / 2 - 1) : 0; + step_cnt_ = is_reverse_scan ? (-sample_step) : sample_step; + curr_block_idx_ = is_reverse_scan ? (-1 - start_offset) : start_offset; + } else { + step_cnt_ = is_reverse_scan ? -1 : 1; + curr_block_idx_ = is_reverse_scan ? 
-1 : 0; + } + curr_block_start_idx_ = 1; + curr_block_end_idx_ = -1; + is_inited_ = true; + LOG_DEBUG("Open secondary meta iterator", K(ret), K(meta_type), K(is_reverse_scan), + K(sample_step), K_(step_cnt), K_(curr_block_idx), K_(tenant_id), KPC_(query_range)); + } return ret; } @@ -172,9 +181,16 @@ int ObSSTableSecMetaIterator::get_next(ObDataMacroBlockMeta ¯o_meta) LOG_WARN("Secondary meta iterator not inited", K(ret)); } else { while (OB_SUCC(ret) && !is_target_row_in_curr_block()) { - if (OB_FAIL(open_next_micro_block())) { - if (OB_UNLIKELY(OB_ITER_END != ret)) { - LOG_WARN("Fail to open next micro block", K(ret)); + if (is_prefetch_end_ && is_handle_buffer_empty()) { + ret = OB_ITER_END; + } else { + const bool is_data_block = sstable_meta_->get_macro_info().is_meta_root(); + if (!is_data_block && OB_FAIL(open_next_micro_block())) { + if (OB_UNLIKELY(OB_ITER_END != ret)) { + LOG_WARN("Fail to open next micro block", K(ret)); + } + } else if (is_data_block && OB_FAIL(open_data_root_block())) { + LOG_WARN("Fail to open data root block", K(ret)); } } } @@ -184,7 +200,13 @@ int ObSSTableSecMetaIterator::get_next(ObDataMacroBlockMeta ¯o_meta) } else if (OB_FAIL(macro_meta.parse_row(row_))) { LOG_WARN("Fail to parse macro meta", K(ret)); } else { + const ObSSTableMacroInfo ¯o_info = sstable_meta_->get_macro_info(); + if (!macro_info.is_meta_root() && 0 == macro_info.get_other_block_ids().count()) { + macro_meta.val_.macro_id_ = macro_info.get_data_block_ids().at(0); + } curr_block_idx_ += step_cnt_; + macro_meta.nested_size_ = macro_info.get_nested_size(); + macro_meta.nested_offset_ = macro_info.get_nested_offset(); } } return ret; @@ -262,9 +284,7 @@ int ObSSTableSecMetaIterator::open_next_micro_block() int64_t end_idx = 0; ObMicroBlockData micro_data; ObMicroBlockDataHandle µ_handle = micro_handles_[curr_handle_idx_ % HANDLE_BUFFER_SIZE]; - if (is_prefetch_end_ && 0 == handle_buffer_count()) { - ret = OB_ITER_END; - } else if 
(OB_FAIL(prefetch_micro_block(HANDLE_BUFFER_SIZE - handle_buffer_count()))) { + if (OB_FAIL(prefetch_micro_block(HANDLE_BUFFER_SIZE - handle_buffer_count()))) { LOG_WARN("Fail to prefetch micro blocks", K(ret), K(handle_buffer_count())); } else if (OB_FAIL(micro_handle.get_data_block_data(macro_reader_, micro_data))) { LOG_WARN("Fail to get micro block data", K(ret), K_(curr_handle_idx), K(micro_handle)); @@ -301,41 +321,89 @@ int ObSSTableSecMetaIterator::open_next_micro_block() K(begin_idx), K(end_idx), K_(curr_block_idx), K(is_index_scan), K(block_id)); } - if (OB_SUCC(ret)) { - const int64_t curr_block_row_cnt = end_idx - begin_idx + 1; - if (is_reverse_scan_) { - if (OB_UNLIKELY(curr_block_idx_ >= 0)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("Invalid current block index on reverse scan", K(ret), K_(curr_block_idx), - K_(curr_block_start_idx), K_(curr_block_end_idx), K(begin_idx), K(end_idx)); - } else if (curr_block_idx_ + curr_block_row_cnt >= 0) { - // next row in this block - curr_block_idx_ = end_idx + curr_block_idx_ + 1; - } else { - curr_block_idx_ += curr_block_row_cnt; - } - } else { - if (OB_UNLIKELY(curr_block_idx_ < prev_block_row_cnt_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("Invalid current block index on sequential scan", K(ret), K_(curr_block_idx), - K_(curr_block_start_idx), K_(curr_block_end_idx), K(begin_idx), K(end_idx)); - } else if (curr_block_idx_ - prev_block_row_cnt_ < row_cnt) { - // First block in scan : begin_idx may larger than 0, update curr_block_idx_ - // Non-first block : next row in this block - curr_block_idx_ = begin_idx + (curr_block_idx_ - prev_block_row_cnt_); - } else { - curr_block_idx_ -= prev_block_row_cnt_; - } - } - prev_block_row_cnt_ = row_cnt; - curr_block_start_idx_ = begin_idx; - curr_block_end_idx_ = end_idx; + if (OB_SUCC(ret) && OB_FAIL(adjust_index(begin_idx, end_idx, row_cnt))) { + LOG_WARN("fail to move index", K(ret)); + } else { ++curr_handle_idx_; } return ret; } +int 
ObSSTableSecMetaIterator::open_data_root_block() +{ + int ret = OB_SUCCESS; + const ObMicroBlockData µ_data = sstable_meta_->get_macro_info().get_macro_meta_data(); + int64_t row_cnt = 0; + int64_t begin_idx = 0; + int64_t end_idx = 0; + if (OB_UNLIKELY(!micro_data.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("Invalid micro block data", K(ret), K(micro_data)); + } else if (OB_FAIL(micro_reader_->init(micro_data, *index_read_info_))) { + LOG_WARN("Fail to init micro block reader", K(ret)); + } else if (OB_FAIL(micro_reader_->get_row_count(row_cnt))) { + LOG_WARN("Fail to get end index", K(ret)); + } else { + end_idx = row_cnt; + const bool is_index_scan = !is_range_end_key_multi_version_ || !query_range_->get_border_flag().inclusive_end(); + if (OB_FAIL(micro_reader_->locate_range( + *query_range_, + true, + true, + begin_idx, + end_idx, + is_index_scan))) { + LOG_WARN("Fail to locate range", K(ret), KPC(query_range_)); + } + LOG_DEBUG("Open next micro block", K(ret), K(begin_idx), K(end_idx), K(is_index_scan)); + } + + if (OB_FAIL(ret)) { + // do nothing + } else if (OB_FAIL(adjust_index(begin_idx, end_idx, row_cnt))) { + LOG_WARN("fail to move index", K(ret)); + } else { + is_prefetch_end_ = true; + } + return ret; +} + +int ObSSTableSecMetaIterator::adjust_index(const int64_t begin_idx, const int64_t end_idx, const int64_t row_cnt) +{ + int ret = OB_SUCCESS; + const int64_t curr_block_row_cnt = end_idx - begin_idx + 1; + if (is_reverse_scan_) { + if (OB_UNLIKELY(curr_block_idx_ >= 0)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("Invalid current block index on reverse scan", K(ret), K_(curr_block_idx), + K_(curr_block_start_idx), K_(curr_block_end_idx), K(begin_idx), K(end_idx)); + } else if (curr_block_idx_ + curr_block_row_cnt >= 0) { + // next row in this block + curr_block_idx_ = end_idx + curr_block_idx_ + 1; + } else { + curr_block_idx_ += curr_block_row_cnt; + } + } else { + if (OB_UNLIKELY(curr_block_idx_ < prev_block_row_cnt_)) { + ret = 
OB_ERR_UNEXPECTED; + LOG_WARN("Invalid current block index on sequential scan", K(ret), K_(curr_block_idx), + K_(curr_block_start_idx), K_(curr_block_end_idx), K(begin_idx), K(end_idx)); + } else if (curr_block_idx_ - prev_block_row_cnt_ < curr_block_row_cnt) { + // next row in this block + curr_block_idx_ = begin_idx + (curr_block_idx_ - prev_block_row_cnt_); + } else { + curr_block_idx_ -= prev_block_row_cnt_; + } + } + if (OB_SUCC(ret)) { + prev_block_row_cnt_ = row_cnt; + curr_block_start_idx_ = begin_idx; + curr_block_end_idx_ = end_idx; + } + return ret; +} + int ObSSTableSecMetaIterator::prefetch_micro_block(int64_t prefetch_depth) { int ret = OB_SUCCESS; @@ -393,13 +461,18 @@ int ObSSTableSecMetaIterator::get_micro_block( int ret = OB_SUCCESS; data_handle.reset(); ObTabletHandle tablet_handle; + const int64_t nested_offset = sstable_meta_->get_macro_info().get_nested_offset(); if (OB_UNLIKELY(!macro_id.is_valid() || !idx_row_header.is_valid())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("Invalid parameters to locate micro block", K(ret), K(macro_id), K(idx_row_header)); + } + + if (OB_FAIL(ret)) { + // do nothing } else if (OB_FAIL(block_cache_->get_cache_block( tenant_id_, macro_id, - idx_row_header.get_block_offset(), + idx_row_header.get_block_offset() + nested_offset, idx_row_header.get_block_size(), data_handle.cache_handle_))) { if (OB_UNLIKELY(OB_ENTRY_NOT_EXIST != ret)) { @@ -408,6 +481,7 @@ int ObSSTableSecMetaIterator::get_micro_block( // Cache miss, async IO ObMicroIndexInfo idx_info; idx_info.row_header_ = &idx_row_header; + idx_info.nested_offset_ = nested_offset; // TODO: @saitong.zst not safe here, remove tablet_handle from SecMeta prefetch interface, disable cache decoders if (OB_FAIL(block_cache_->prefetch( tenant_id_, @@ -428,7 +502,7 @@ int ObSSTableSecMetaIterator::get_micro_block( if (OB_SUCC(ret)) { data_handle.macro_block_id_ = macro_id; - data_handle.micro_info_.offset_ = idx_row_header.get_block_offset(); + 
data_handle.micro_info_.offset_ = idx_row_header.get_block_offset() + nested_offset; data_handle.micro_info_.size_ = idx_row_header.get_block_size(); const bool deep_copy_key = true; if (OB_FAIL(idx_row_header.fill_micro_des_meta(deep_copy_key, data_handle.des_meta_))) { diff --git a/src/storage/blocksstable/ob_sstable_sec_meta_iterator.h b/src/storage/blocksstable/ob_sstable_sec_meta_iterator.h index c20e74a8a4..9a746168fc 100644 --- a/src/storage/blocksstable/ob_sstable_sec_meta_iterator.h +++ b/src/storage/blocksstable/ob_sstable_sec_meta_iterator.h @@ -46,6 +46,7 @@ private: const ObMacroBlockMetaType meta_type, const ObSSTableMeta &sstable_meta); void set_iter_end(); + int adjust_index(const int64_t begin_idx, const int64_t end_idx, const int64_t row_cnt); int init_micro_reader(const ObRowStoreType row_store_type, ObIAllocator &allocator); int init_by_type(const ObMacroBlockMetaType meta_type); OB_INLINE bool is_handle_buffer_empty() const { return curr_handle_idx_ == prefetch_handle_idx_; } @@ -66,6 +67,7 @@ private: bool &is_beyond_range); int prefetch_micro_block(int64_t prefetch_depth); int open_next_micro_block(); + int open_data_root_block(); // TODO: opt with prefetch int get_micro_block( diff --git a/src/storage/compaction/ob_index_block_micro_iterator.cpp b/src/storage/compaction/ob_index_block_micro_iterator.cpp index 9ddb648360..ace76b99ad 100644 --- a/src/storage/compaction/ob_index_block_micro_iterator.cpp +++ b/src/storage/compaction/ob_index_block_micro_iterator.cpp @@ -154,7 +154,8 @@ int ObIndexBlockMicroIterator::init( const blocksstable::MacroBlockId ¯o_id, const common::ObIArray µ_block_infos, const common::ObIArray &endkeys, - const ObRowStoreType row_store_type) + const ObRowStoreType row_store_type, + const ObSSTable *sstable) { int ret = OB_SUCCESS; if (IS_INIT) { @@ -170,8 +171,8 @@ int ObIndexBlockMicroIterator::init( ObMacroBlockReadInfo read_info; const int64_t io_timeout_ms = std::max(GCONF._data_storage_io_timeout / 1000, 
DEFAULT_IO_WAIT_TIME_MS); read_info.macro_block_id_ = macro_id; - read_info.offset_ = 0; - read_info.size_ = OB_SERVER_BLOCK_MGR.get_macro_block_size(); + read_info.offset_ = sstable->get_macro_offset(); + read_info.size_ = sstable->get_macro_read_size(); read_info.io_desc_.set_category(ObIOCategory::SYS_IO); read_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_COMPACT_READ); if (OB_FAIL(ObBlockManager::async_read_block(read_info, macro_handle_))) { diff --git a/src/storage/compaction/ob_index_block_micro_iterator.h b/src/storage/compaction/ob_index_block_micro_iterator.h index a4f309fc90..a57ee4aeb5 100644 --- a/src/storage/compaction/ob_index_block_micro_iterator.h +++ b/src/storage/compaction/ob_index_block_micro_iterator.h @@ -76,7 +76,8 @@ public: const blocksstable::MacroBlockId ¯o_id, const common::ObIArray µ_block_infos, const common::ObIArray &endkeys, - const ObRowStoreType row_store_type); + const ObRowStoreType row_store_type, + const blocksstable::ObSSTable *sstable); void reset(); int next(const blocksstable::ObMicroBlock *µ_block); diff --git a/src/storage/compaction/ob_partition_merge_iter.cpp b/src/storage/compaction/ob_partition_merge_iter.cpp index 449adc294a..6073882810 100644 --- a/src/storage/compaction/ob_partition_merge_iter.cpp +++ b/src/storage/compaction/ob_partition_merge_iter.cpp @@ -715,7 +715,8 @@ int ObPartitionMicroMergeIter::open_curr_range(const bool for_rewrite) curr_block_desc_.macro_block_id_, macro_block_iter_->get_micro_index_infos(), macro_block_iter_->get_micro_endkeys(), - static_cast(curr_block_desc_.row_store_type_)))) { + static_cast(curr_block_desc_.row_store_type_), + reinterpret_cast(table_)))) { LOG_WARN("Failed to init micro_block_iter", K(ret), KPC(column_ids_), K_(curr_block_desc)); } else { micro_block_opened_ = false; diff --git a/src/storage/compaction/ob_partition_merger.cpp b/src/storage/compaction/ob_partition_merger.cpp index 476bea2574..f57f11f92b 100644 --- 
a/src/storage/compaction/ob_partition_merger.cpp +++ b/src/storage/compaction/ob_partition_merger.cpp @@ -579,7 +579,7 @@ int ObPartitionMajorMerger::init_merge_iters(ObIPartitionMergeFuser &fuser, // do nothing. don't need to construct iter for empty sstable FLOG_INFO("table is empty, need not create iter", K(i), KPC(sstable), K(sstable->get_meta())); continue; - } else if (0 == i && !merge_param.is_full_merge_) { + } else if (0 == i && !merge_param.is_full_merge_ && !sstable->is_small_sstable()) { if (MICRO_BLOCK_MERGE_LEVEL == merge_param.merge_level_) { merge_iter = alloc_merge_helper(); } else { @@ -657,8 +657,6 @@ int ObPartitionMajorMerger::merge_partition(ObTabletMergeCtx &ctx, const int64_t } else if (is_reuse_base_sstable) { if (OB_FAIL(reuse_base_sstable(merge_iters)) && OB_ITER_END != ret) { STORAGE_LOG(WARN, "Failed to reuse base sstable", K(ret), K(merge_iters)); - } else { - FLOG_INFO("succeed to reuse base sstable", K(merge_iters)); } } @@ -1000,13 +998,17 @@ int ObPartitionMajorMerger::check_need_reuse_base_sstable(MERGE_ITER_ARRAY &merg if (is_full_merge || need_rewrite_count != 0) { is_need_reuse_sstable = false; } else { - for (int64_t i = 0; OB_SUCC(ret) && i < merge_iters.count(); ++i) { + for (int64_t i = 0; OB_SUCC(ret) && i < merge_iters.count() && is_need_reuse_sstable; ++i) { if (OB_ISNULL(iter = merge_iters.at(i))) { ret = OB_ERR_UNEXPECTED; STORAGE_LOG(WARN, "Unexpected null iter", K(ret), K(merge_iters)); + } else if (!iter->is_iter_end() && iter->is_base_sstable_iter()) { + const ObSSTable *sstable = static_cast(iter->get_table()); + if (sstable->is_small_sstable()) { + is_need_reuse_sstable = false; + } } else if (!iter->is_iter_end() && !iter->is_base_sstable_iter()) { is_need_reuse_sstable = false; - break; } } } @@ -1322,7 +1324,8 @@ int ObPartitionMinorMerger::init_merge_iters(ObIPartitionMergeFuser &fuser, } else if (storage::is_backfill_tx_merge(merge_param.merge_type_)) { merge_iter = alloc_merge_helper (); } else if 
(merge_param.is_multi_version_minor_merge()) { - if (!merge_param.is_mini_merge() && 0 == i && !merge_param.is_full_merge_) { + if (!merge_param.is_mini_merge() && 0 == i && !merge_param.is_full_merge_ && + !(static_cast(table))->is_small_sstable()) { merge_iter = alloc_merge_helper(); } else { merge_iter = alloc_merge_helper (); diff --git a/src/storage/compaction/ob_tablet_merge_ctx.cpp b/src/storage/compaction/ob_tablet_merge_ctx.cpp index e41bef438b..06f8c9f87e 100644 --- a/src/storage/compaction/ob_tablet_merge_ctx.cpp +++ b/src/storage/compaction/ob_tablet_merge_ctx.cpp @@ -210,6 +210,7 @@ int ObTabletMergeInfo::build_create_sstable_param(const ObTabletMergeCtx &ctx, param.root_block_addr_, param.root_block_data_); ObSSTableMergeRes::fill_addr_and_data(res.data_root_desc_, param.data_block_macro_meta_addr_, param.data_block_macro_meta_); + param.is_meta_root_ = res.data_root_desc_.is_meta_root_; param.root_row_store_type_ = res.root_desc_.row_type_; param.data_index_tree_height_ = res.root_desc_.height_; param.index_blocks_cnt_ = res.index_blocks_cnt_; @@ -231,6 +232,8 @@ int ObTabletMergeInfo::build_create_sstable_param(const ObTabletMergeCtx &ctx, param.compressor_type_ = res.compressor_type_; param.encrypt_id_ = res.encrypt_id_; param.master_key_id_ = res.master_key_id_; + param.nested_size_ = res.nested_size_; + param.nested_offset_ = res.nested_offset_; param.data_block_ids_ = res.data_block_ids_; param.other_block_ids_ = res.other_block_ids_; param.ddl_scn_.set_min(); diff --git a/src/storage/ddl/ob_ddl_merge_task.cpp b/src/storage/ddl/ob_ddl_merge_task.cpp index 571f812ccc..7488e2a9d2 100644 --- a/src/storage/ddl/ob_ddl_merge_task.cpp +++ b/src/storage/ddl/ob_ddl_merge_task.cpp @@ -542,6 +542,7 @@ int ObTabletDDLUtil::prepare_index_data_desc(const share::ObLSID &ls_id, int ObTabletDDLUtil::prepare_index_builder(const ObTabletDDLParam &ddl_param, ObIAllocator &allocator, + const ObSSTableIndexBuilder::ObSpaceOptimizationMode mode, ObSSTableIndexBuilder 
*&sstable_index_builder, ObIndexBlockRebuilder *&index_block_rebuilder) { @@ -560,7 +561,10 @@ int ObTabletDDLUtil::prepare_index_builder(const ObTabletDDLParam &ddl_param, ret = OB_ALLOCATE_MEMORY_FAILED; LOG_WARN("allocate memory failed", K(ret)); } else if (FALSE_IT(sstable_index_builder = new (buf) ObSSTableIndexBuilder)) { - } else if (OB_FAIL(sstable_index_builder->init(data_desc))) { + } else if (OB_FAIL(sstable_index_builder->init( + data_desc, + nullptr, // this param is flush macro call back, nullptr is default val + mode))) { LOG_WARN("init sstable index builder failed", K(ret)); } else if (OB_ISNULL(buf = allocator.alloc(sizeof(ObIndexBlockRebuilder)))) { ret = OB_ALLOCATE_MEMORY_FAILED; @@ -627,6 +631,7 @@ int ObTabletDDLUtil::create_ddl_sstable(ObSSTableIndexBuilder *sstable_index_bui param.root_block_addr_, param.root_block_data_); ObSSTableMergeRes::fill_addr_and_data(res.data_root_desc_, param.data_block_macro_meta_addr_, param.data_block_macro_meta_); + param.is_meta_root_ = res.data_root_desc_.is_meta_root_; param.root_row_store_type_ = res.root_desc_.row_type_; param.data_index_tree_height_ = res.root_desc_.height_; param.index_blocks_cnt_ = res.index_blocks_cnt_; @@ -643,6 +648,8 @@ int ObTabletDDLUtil::create_ddl_sstable(ObSSTableIndexBuilder *sstable_index_bui param.compressor_type_ = res.compressor_type_; param.encrypt_id_ = res.encrypt_id_; param.master_key_id_ = res.master_key_id_; + param.nested_size_ = res.nested_size_; + param.nested_offset_ = res.nested_offset_; param.data_block_ids_ = res.data_block_ids_; param.other_block_ids_ = res.other_block_ids_; MEMCPY(param.encrypt_key_, res.encrypt_key_, share::OB_MAX_TABLESPACE_ENCRYPT_KEY_LENGTH); @@ -683,10 +690,13 @@ int ObTabletDDLUtil::compact_ddl_sstable(const ObIArray &ddl_sstable ObArenaAllocator arena; ObSSTableIndexBuilder *sstable_index_builder = nullptr; ObIndexBlockRebuilder *index_block_rebuilder = nullptr; + const ObSSTableIndexBuilder::ObSpaceOptimizationMode mode = 
ddl_param.table_key_.is_ddl_sstable() + ? ObSSTableIndexBuilder::DISABLE : ObSSTableIndexBuilder::AUTO; + if (OB_UNLIKELY(!ddl_param.is_valid() || ddl_sstables.empty())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(ddl_param), K(ddl_sstables.count())); - } else if (OB_FAIL(prepare_index_builder(ddl_param, arena, sstable_index_builder, index_block_rebuilder))) { + } else if (OB_FAIL(prepare_index_builder(ddl_param, arena, mode, sstable_index_builder, index_block_rebuilder))) { LOG_WARN("prepare sstable index builder failed", K(ret)); } else { // campact diff --git a/src/storage/ddl/ob_ddl_merge_task.h b/src/storage/ddl/ob_ddl_merge_task.h index d7e55fb26c..f3bd61bcaf 100644 --- a/src/storage/ddl/ob_ddl_merge_task.h +++ b/src/storage/ddl/ob_ddl_merge_task.h @@ -154,6 +154,7 @@ public: static int prepare_index_builder(const ObTabletDDLParam &ddl_param, ObIAllocator &allocator, + const blocksstable::ObSSTableIndexBuilder::ObSpaceOptimizationMode mode, blocksstable::ObSSTableIndexBuilder *&sstable_index_builder, blocksstable::ObIndexBlockRebuilder *&index_block_rebuilder); diff --git a/src/storage/ddl/ob_ddl_struct.cpp b/src/storage/ddl/ob_ddl_struct.cpp index 1f965eaabb..4fb1b47da8 100644 --- a/src/storage/ddl/ob_ddl_struct.cpp +++ b/src/storage/ddl/ob_ddl_struct.cpp @@ -174,7 +174,7 @@ int ObDDLKV::init(const share::ObLSID &ls_id, ddl_param.start_scn_ = ddl_start_scn; ddl_param.snapshot_version_ = snapshot_version; ddl_param.cluster_version_ = cluster_version; - if (OB_FAIL(ObTabletDDLUtil::prepare_index_builder(ddl_param, allocator_, sstable_index_builder_, index_block_rebuilder_))) { + if (OB_FAIL(ObTabletDDLUtil::prepare_index_builder(ddl_param, allocator_, ObSSTableIndexBuilder::DISABLE, sstable_index_builder_, index_block_rebuilder_))) { LOG_WARN("prepare index builder failed", K(ret)); } else { ls_id_ = ls_id; diff --git a/src/storage/high_availability/ob_physical_copy_task.cpp b/src/storage/high_availability/ob_physical_copy_task.cpp 
index 064f114f0b..a58acac796 100644 --- a/src/storage/high_availability/ob_physical_copy_task.cpp +++ b/src/storage/high_availability/ob_physical_copy_task.cpp @@ -799,6 +799,8 @@ int ObPhysicalCopyFinishTask::prepare_sstable_index_builder_( { int ret = OB_SUCCESS; ObDataStoreDesc desc; + const ObSSTableIndexBuilder::ObSpaceOptimizationMode mode = sstable_param->table_key_.is_ddl_sstable() + ? ObSSTableIndexBuilder::DISABLE : ObSSTableIndexBuilder::AUTO; if (!tablet_id.is_valid() || OB_ISNULL(sstable_param)) { ret = OB_INVALID_ARGUMENT; @@ -807,7 +809,10 @@ int ObPhysicalCopyFinishTask::prepare_sstable_index_builder_( LOG_INFO("sstable is empty, no need build sstable index builder", K(ret), K(tablet_id), KPC(sstable_param)); } else if (OB_FAIL(prepare_data_store_desc_(ls_id, tablet_id, sstable_param, cluster_version, desc))) { LOG_WARN("failed to prepare data store desc", K(ret), K(tablet_id), K(cluster_version)); - } else if (OB_FAIL(sstable_index_builder_.init(desc))) { + } else if (OB_FAIL(sstable_index_builder_.init( + desc, + nullptr, // macro block flush callback, default value is nullptr + mode))) { LOG_WARN("failed to init sstable index builder", K(ret), K(desc)); } return ret; @@ -939,6 +944,7 @@ int ObPhysicalCopyFinishTask::build_create_sstable_param_( param.root_block_addr_, param.root_block_data_); ObSSTableMergeRes::fill_addr_and_data(res.data_root_desc_, param.data_block_macro_meta_addr_, param.data_block_macro_meta_); + param.is_meta_root_ = res.data_root_desc_.is_meta_root_; param.root_row_store_type_ = res.root_desc_.row_type_; param.data_index_tree_height_ = res.root_desc_.height_; param.index_blocks_cnt_ = res.index_blocks_cnt_; @@ -955,6 +961,8 @@ int ObPhysicalCopyFinishTask::build_create_sstable_param_( param.compressor_type_ = res.compressor_type_; param.encrypt_id_ = res.encrypt_id_; param.master_key_id_ = res.master_key_id_; + param.nested_size_ = res.nested_size_; + param.nested_offset_ = res.nested_offset_; param.data_block_ids_ = 
res.data_block_ids_; param.other_block_ids_ = res.other_block_ids_; param.rowkey_column_cnt_ = sstable_param_->basic_meta_.rowkey_column_count_; diff --git a/src/storage/high_availability/ob_storage_ha_reader.cpp b/src/storage/high_availability/ob_storage_ha_reader.cpp index a1c0b4fdae..c9f0ed1ede 100644 --- a/src/storage/high_availability/ob_storage_ha_reader.cpp +++ b/src/storage/high_availability/ob_storage_ha_reader.cpp @@ -729,8 +729,8 @@ int ObCopyMacroBlockObProducer::prefetch_() } else { copy_macro_block_handle_[handle_idx_].is_reuse_macro_block_ = false; read_info.macro_block_id_ = macro_meta.get_macro_id(); - read_info.offset_ = 0; - read_info.size_ = OB_DEFAULT_MACRO_BLOCK_SIZE; + read_info.offset_ = sstable_->get_macro_offset(); + read_info.size_ = sstable_->get_macro_read_size(); read_info.io_desc_.set_category(ObIOCategory::SYS_IO); read_info.io_desc_.set_wait_event(ObWaitEventIds::DB_FILE_MIGRATE_READ); if (OB_FAIL(ObBlockManager::async_read_block(read_info, copy_macro_block_handle_[handle_idx_].read_handle_))) { diff --git a/src/storage/ls/ob_ls.cpp b/src/storage/ls/ob_ls.cpp index 8a2534dd55..dca1d7cde3 100644 --- a/src/storage/ls/ob_ls.cpp +++ b/src/storage/ls/ob_ls.cpp @@ -953,13 +953,13 @@ int ObLS::update_tablet_table_store( ObTabletHandle &handle) { int ret = OB_SUCCESS; - int64_t read_lock = LSLOCKLOGMETA; - int64_t write_lock = 0; + const int64_t read_lock = LSLOCKLOGMETA; + const int64_t write_lock = 0; ObLSLockGuard lock_myself(lock_, read_lock, write_lock); if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("ls is not inited", K(ret)); - } else if (!tablet_id.is_valid() || !param.is_valid()) { + } else if (OB_UNLIKELY(!tablet_id.is_valid() || !param.is_valid())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("update tablet table store get invalid argument", K(ret), K(tablet_id), K(param)); } else { @@ -975,6 +975,33 @@ int ObLS::update_tablet_table_store( return ret; } +int ObLS::update_tablet_table_store( + const int64_t rebuild_seq, + const 
ObTabletHandle &old_tablet_handle, + const ObIArray &table_handles) +{ + int ret = OB_SUCCESS; + const int64_t read_lock = LSLOCKLOGMETA; + const int64_t write_lock = 0; + ObLSLockGuard lock_myself(lock_, read_lock, write_lock); + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("ls hasn't been inited", K(ret)); + } else if (OB_UNLIKELY(!old_tablet_handle.is_valid() || 0 == table_handles.count())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", K(old_tablet_handle), K(table_handles)); + } else { + const int64_t seq = ls_meta_.get_rebuild_seq(); + if (rebuild_seq != seq) { + ret = OB_EAGAIN; + LOG_WARN("rebuild seq has changed, retry", K(ret), K(seq), K(rebuild_seq)); + } else if (OB_FAIL(ls_tablet_svr_.update_tablet_table_store(old_tablet_handle, table_handles))) { + LOG_WARN("fail to replace small sstables in the tablet", K(ret), K(old_tablet_handle), K(table_handles)); + } + } + return ret; +} + int ObLS::build_ha_tablet_new_table_store( const ObTabletID &tablet_id, const ObBatchUpdateTableStoreParam ¶m) diff --git a/src/storage/ls/ob_ls.h b/src/storage/ls/ob_ls.h index 3dd4aa8d4d..8a05ff488b 100644 --- a/src/storage/ls/ob_ls.h +++ b/src/storage/ls/ob_ls.h @@ -618,6 +618,10 @@ public: const ObTabletID &tablet_id, const ObUpdateTableStoreParam ¶m, ObTabletHandle &handle); + int update_tablet_table_store( + const int64_t rebuild_seq, + const ObTabletHandle &old_tablet_handle, + const ObIArray &table_handles); int build_ha_tablet_new_table_store( const ObTabletID &tablet_id, const ObBatchUpdateTableStoreParam ¶m); diff --git a/src/storage/ls/ob_ls_tablet_service.cpp b/src/storage/ls/ob_ls_tablet_service.cpp index edd4be1048..2aaaeacd0e 100644 --- a/src/storage/ls/ob_ls_tablet_service.cpp +++ b/src/storage/ls/ob_ls_tablet_service.cpp @@ -1032,6 +1032,69 @@ int ObLSTabletService::migrate_create_tablet( return ret; } +int ObLSTabletService::update_tablet_table_store( + const ObTabletHandle &old_tablet_handle, + const ObIArray &table_handles) +{ + int 
ret = OB_SUCCESS; + + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("ls tablet svr hasn't been inited", K(ret)); + } else if (OB_UNLIKELY(!old_tablet_handle.is_valid() || 0 == table_handles.count())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("old tablet handle is invalid", K(ret), K(old_tablet_handle), K(table_handles.count())); + } else { + ObTablet *old_tablet = old_tablet_handle.get_obj(); + ObTimeGuard time_guard("ReplaceSSTable", 3000000/*3 seconds*/); + const common::ObTabletID &tablet_id = old_tablet->get_tablet_meta().tablet_id_; + ObBucketHashWLockGuard lock_guard(bucket_lock_, tablet_id.hash()); + time_guard.click("Lock"); + + ObTenantMetaMemMgr *t3m = MTL(ObTenantMetaMemMgr*); + ObTabletHandle tablet_handle; + if (OB_FAIL(direct_get_tablet(tablet_id, tablet_handle))) { + if (OB_TABLET_NOT_EXIST == ret) { + ret = OB_EAGAIN; + LOG_WARN("this tablet has been deleted, skip it", K(ret), K(tablet_id)); + } else { + LOG_WARN("fail to get tablet", K(ret)); + } + } else if (tablet_handle.get_obj() != old_tablet) { + ret = OB_EAGAIN; + LOG_WARN("tablet has changed, skip it", K(ret), K(tablet_handle), K(old_tablet_handle)); + } else { + ObTabletHandle new_tablet_handle; + ObTablet *new_tablet = nullptr; + const share::ObLSID &ls_id = ls_->get_ls_id(); + const ObTabletMapKey key(ls_id, tablet_id); + const ObTabletTxMultiSourceDataUnit &tx_data = old_tablet->tablet_meta_.tx_data_; + const ObTabletBindingInfo &ddl_data = old_tablet->tablet_meta_.ddl_data_; + const ObTabletAutoincSeq &autoinc_seq = old_tablet->tablet_meta_.autoinc_seq_; + ObMetaDiskAddr disk_addr; + + if (OB_FAIL(ObTabletCreateDeleteHelper::acquire_tablet(key, new_tablet_handle))) { + LOG_WARN("fail to acquire tablet", K(ret), K(key)); + } else if (FALSE_IT(new_tablet = new_tablet_handle.get_obj())) { + } else if (OB_FAIL(new_tablet->init(table_handles, *old_tablet, tx_data, ddl_data, autoinc_seq))) { + LOG_WARN("fail to init new tablet", K(ret), KPC(old_tablet), K(tx_data), K(ddl_data), 
K(autoinc_seq)); + } else if (FALSE_IT(time_guard.click("InitTablet"))) { + } else if (OB_FAIL(ObTabletSlogHelper::write_create_tablet_slog(new_tablet_handle, disk_addr))) { + LOG_WARN("fail to write update tablet slog", K(ret), K(new_tablet_handle), K(disk_addr)); + } else if (FALSE_IT(time_guard.click("WrSlog"))) { + } else if (OB_FAIL(t3m->compare_and_swap_tablet(key, disk_addr, old_tablet_handle, new_tablet_handle))) { + LOG_ERROR("failed to compare and swap tablet", K(ret), K(key), K(disk_addr), K(old_tablet_handle), K(lbt())); + ob_usleep(1000 * 1000); + ob_abort(); + } else { + time_guard.click("CASwap"); + LOG_INFO("succeeded to build new tablet", K(ret), K(new_tablet_handle), KPC(new_tablet_handle.get_obj())); + } + } + } + return ret; +} + int ObLSTabletService::update_tablet_table_store( const common::ObTabletID &tablet_id, const ObUpdateTableStoreParam ¶m, @@ -1392,7 +1455,7 @@ int ObLSTabletService::replay_create_tablet( new_tablet->get_tablet_meta().max_sync_storage_schema_version_, freezer))) { LOG_WARN("failed to init shared params", K(ret), K(ls_id), K(tablet_id)); } else if (OB_FAIL(refresh_tablet_addr(ls_id, tablet_id, disk_addr, new_tablet_handle))) { - LOG_WARN("failed to refresh tablet addr", K(key), K(ls_id), K(tablet_id), K(disk_addr)); + LOG_WARN("failed to refresh tablet addr", K(ret), K(ls_id), K(tablet_id), K(disk_addr)); } else if (OB_FAIL(new_tablet->start_ddl_if_need())) { LOG_WARN("start ddl if need failed", K(ret)); } else if (OB_FAIL(try_pin_tablet_if_needed(new_tablet_handle))) { diff --git a/src/storage/ls/ob_ls_tablet_service.h b/src/storage/ls/ob_ls_tablet_service.h index 886b4cfc21..17c48db2be 100644 --- a/src/storage/ls/ob_ls_tablet_service.h +++ b/src/storage/ls/ob_ls_tablet_service.h @@ -244,6 +244,9 @@ public: const common::ObTabletID &tablet_id, const ObUpdateTableStoreParam ¶m, ObTabletHandle &handle); + int update_tablet_table_store( // only for small sstables defragmentation + const ObTabletHandle &old_tablet_handle, + 
const ObIArray &table_handles); int update_tablet_report_status(const common::ObTabletID &tablet_id); int update_tablet_restore_status( const common::ObTabletID &tablet_id, diff --git a/src/storage/ob_micro_block_handle_mgr.cpp b/src/storage/ob_micro_block_handle_mgr.cpp index e8941de05a..31b360de4f 100644 --- a/src/storage/ob_micro_block_handle_mgr.cpp +++ b/src/storage/ob_micro_block_handle_mgr.cpp @@ -105,7 +105,7 @@ int ObMicroBlockDataHandle::get_index_block_data( nullptr, loaded_index_block_data_, allocator_))) { - LOG_WARN("Fail to load index micro block", K(ret), K_(macro_block_id), K(read_info)); + LOG_WARN("Fail to load index micro block", K(ret), K_(macro_block_id), K(read_info), K(micro_block_id)); try_release_loaded_index_block(); } else { index_block = loaded_index_block_data_; @@ -206,21 +206,25 @@ int ObMicroBlockHandleMgr::init( int ObMicroBlockHandleMgr::get_micro_block_handle( const uint64_t tenant_id, - const MacroBlockId macro_id, - const ObIndexBlockRowHeader &idx_header, + const ObMicroIndexInfo &index_block_info, const bool is_data_block, ObMicroBlockDataHandle &micro_block_handle) { int ret = OB_SUCCESS; const bool deep_copy_key = true; bool found = false; - int64_t offset = idx_header.get_block_offset(); - int64_t size = idx_header.get_block_size(); + const MacroBlockId &macro_id = index_block_info.get_macro_id(); + const int64_t offset = index_block_info.get_block_offset(); + const int64_t size = index_block_info.get_block_size(); + const ObIndexBlockRowHeader *idx_header = index_block_info.row_header_; micro_block_handle.reset(); micro_block_handle.allocator_ = &allocator_; if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("Block handle manager is not inited", K(ret)); + } else if (OB_ISNULL(idx_header)) { + ret = OB_INVALID_ARGUMENT; /* fail explicitly: without an error code the caller would see OB_SUCCESS and a reset handle */ + LOG_WARN("invalid argument", K(ret)); } else { if (is_multi_) { if (is_ordered_) { @@ -245,7 +249,7 @@ int ObMicroBlockHandleMgr::get_micro_block_handle( } if (OB_FAIL(ret) || found) { - } else if
(OB_FAIL(idx_header.fill_micro_des_meta(deep_copy_key, micro_block_handle.des_meta_))) { + } else if (OB_FAIL(idx_header->fill_micro_des_meta(deep_copy_key, micro_block_handle.des_meta_))) { LOG_WARN("Fail to fill micro block deserialize meta", K(ret)); } else { micro_block_handle.tenant_id_ = tenant_id; diff --git a/src/storage/ob_micro_block_handle_mgr.h b/src/storage/ob_micro_block_handle_mgr.h index 6bac9b67d8..181d57f844 100644 --- a/src/storage/ob_micro_block_handle_mgr.h +++ b/src/storage/ob_micro_block_handle_mgr.h @@ -76,8 +76,7 @@ public: int init(const bool is_multi, const bool is_ordered, common::ObIAllocator &allocator); int get_micro_block_handle( const uint64_t tenant_id, - const blocksstable::MacroBlockId macro_id, - const blocksstable::ObIndexBlockRowHeader &idx_header, + const blocksstable::ObMicroIndexInfo &index_block_info, const bool is_data_block, ObMicroBlockDataHandle µ_block_handle); private: diff --git a/src/storage/tablet/ob_table_store_util.cpp b/src/storage/tablet/ob_table_store_util.cpp index 5a30fff0fa..b576da9859 100644 --- a/src/storage/tablet/ob_table_store_util.cpp +++ b/src/storage/tablet/ob_table_store_util.cpp @@ -74,7 +74,12 @@ void ObITableArray::reset_table(const int64_t pos) LOG_ERROR("[MEMORY LEAK] TenantMetaMemMgr is unexpected not equal!!!", K(meta_mem_mgr_), KP(allocator_), KPC(array_[pos])); } else if (0 == array_[pos]->dec_ref()) { if (meta_mem_mgr_->is_used_obj_pool(allocator_)) { - if (array_[pos]->is_sstable()) { + if (OB_UNLIKELY(OB_INVALID_TENANT_ID == MTL_ID() + && array_[pos]->is_sstable() + && reinterpret_cast(array_[pos])->is_small_sstable())) { + FLOG_INFO("this thread doesn't have MTL ctx, push sstable into gc queue", KP(array_[pos]), K(array_[pos]->get_key())); + meta_mem_mgr_->push_table_into_gc_queue(array_[pos], array_[pos]->get_key().table_type_); + } else if (array_[pos]->is_sstable()) { meta_mem_mgr_->gc_sstable(reinterpret_cast(array_[pos])); } else { 
meta_mem_mgr_->push_table_into_gc_queue(array_[pos], array_[pos]->get_key().table_type_); diff --git a/src/storage/tablet/ob_tablet.cpp b/src/storage/tablet/ob_tablet.cpp index bd43abca7c..19d2888f31 100644 --- a/src/storage/tablet/ob_tablet.cpp +++ b/src/storage/tablet/ob_tablet.cpp @@ -312,6 +312,54 @@ int ObTablet::init( return ret; } +int ObTablet::init( + const ObIArray &table_handles, + const ObTablet &old_tablet, + const ObTabletTxMultiSourceDataUnit &tx_data, + const ObTabletBindingInfo &ddl_data, + const share::ObTabletAutoincSeq &autoinc_seq) +{ + int ret = OB_SUCCESS; + allocator_ = &(MTL(ObTenantMetaMemMgr*)->get_tenant_allocator()); + + if (OB_UNLIKELY(is_inited_)) { + ret = OB_INIT_TWICE; + LOG_WARN("tablet has been inited", K(ret)); + } else if (OB_UNLIKELY(!old_tablet.is_valid() || 0 == table_handles.count())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("old tablet is invalid", K(ret), K(old_tablet)); + } else if (OB_UNLIKELY(!pointer_hdl_.is_valid()) + || OB_ISNULL(memtable_mgr_) + || OB_ISNULL(log_handler_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("tablet pointer handle is invalid", K(ret), K_(pointer_hdl), K_(memtable_mgr), K_(log_handler)); + } else if (OB_FAIL(tablet_meta_.init(*allocator_, old_tablet.tablet_meta_, old_tablet.get_snapshot_version(), + old_tablet.get_multi_version_start(), tx_data, ddl_data, autoinc_seq, old_tablet.tablet_meta_.max_sync_storage_schema_version_))) { + LOG_WARN("fail to init tablet_meta", K(ret), K(old_tablet.tablet_meta_), K(tx_data), K(ddl_data), K(autoinc_seq)); + } else if (OB_FAIL(table_store_.batch_replace_sstables(*allocator_, this, table_handles, old_tablet.table_store_))) { + LOG_WARN("fail to init table store", K(ret), K(old_tablet), K(table_handles)); + } else if (OB_FAIL(storage_schema_.init(*allocator_, old_tablet.storage_schema_))) { + LOG_WARN("fail to init storage schema", K(ret), K(old_tablet.storage_schema_)); + } else if (OB_FAIL(medium_info_list_.init(*allocator_, 
&(old_tablet.get_medium_compaction_info_list())))) { + LOG_WARN("fail to init medium info list", K(ret)); + } else if (OB_FAIL(build_read_info(*allocator_))) { + LOG_WARN("fail to build read info", K(ret)); + } else if (OB_FAIL(pre_transform_sstable_root_block(*full_read_info_.get_index_read_info()))) { + LOG_WARN("failed to pre-transform sstable root block", K(ret), K(full_read_info_)); + } else { + if (old_tablet.get_tablet_meta().has_next_tablet_) { + set_next_tablet_guard(old_tablet.next_tablet_guard_); + } + is_inited_ = true; + LOG_INFO("succeeded to init tablet", K(ret), K(old_tablet), KPC(this)); + } + + if (OB_UNLIKELY(!is_inited_)) { + reset(); + } + return ret; +} + int ObTablet::init( const ObBatchUpdateTableStoreParam ¶m, const ObTablet &old_tablet, diff --git a/src/storage/tablet/ob_tablet.h b/src/storage/tablet/ob_tablet.h index 874dfe12d6..17d9ff5cac 100644 --- a/src/storage/tablet/ob_tablet.h +++ b/src/storage/tablet/ob_tablet.h @@ -147,6 +147,13 @@ public: const ObTabletTxMultiSourceDataUnit &tx_data, const ObTabletBindingInfo &ddl_data, const share::ObTabletAutoincSeq &autoinc_seq); + // batch replace sstables without data modification + int init( + const ObIArray &table_handles, + const ObTablet &old_tablet, + const ObTabletTxMultiSourceDataUnit &tx_data, + const ObTabletBindingInfo &ddl_data, + const share::ObTabletAutoincSeq &autoinc_seq); bool is_valid() const; diff --git a/src/storage/tablet/ob_tablet_create_sstable_param.cpp b/src/storage/tablet/ob_tablet_create_sstable_param.cpp index f0ba7959c7..a5ec4b9659 100644 --- a/src/storage/tablet/ob_tablet_create_sstable_param.cpp +++ b/src/storage/tablet/ob_tablet_create_sstable_param.cpp @@ -54,9 +54,12 @@ ObTabletCreateSSTableParam::ObTabletCreateSSTableParam() ddl_scn_(SCN::min_scn()), filled_tx_scn_(SCN::min_scn()), contain_uncommitted_row_(false), + is_meta_root_(false), compressor_type_(ObCompressorType::INVALID_COMPRESSOR), encrypt_id_(0), master_key_id_(0), + nested_offset_(0), + 
nested_size_(0), data_block_ids_(), other_block_ids_() { diff --git a/src/storage/tablet/ob_tablet_create_sstable_param.h b/src/storage/tablet/ob_tablet_create_sstable_param.h index 9005db07db..e41c2439a4 100644 --- a/src/storage/tablet/ob_tablet_create_sstable_param.h +++ b/src/storage/tablet/ob_tablet_create_sstable_param.h @@ -64,9 +64,12 @@ public: K_(max_merged_trans_version), K_(ddl_scn), K_(contain_uncommitted_row), + K_(is_meta_root), K_(compressor_type), K_(encrypt_id), K_(master_key_id), + K_(nested_offset), + K_(nested_size), KPHEX_(encrypt_key, sizeof(encrypt_key_))); private: static const int64_t DEFAULT_MACRO_BLOCK_CNT = 64; @@ -100,9 +103,12 @@ public: share::SCN ddl_scn_; share::SCN filled_tx_scn_; bool contain_uncommitted_row_; + bool is_meta_root_; common::ObCompressorType compressor_type_; int64_t encrypt_id_; int64_t master_key_id_; + int64_t nested_offset_; + int64_t nested_size_; char encrypt_key_[share::OB_MAX_TABLESPACE_ENCRYPT_KEY_LENGTH]; common::ObSEArray data_block_ids_; common::ObSEArray other_block_ids_; diff --git a/src/storage/tablet/ob_tablet_table_store.cpp b/src/storage/tablet/ob_tablet_table_store.cpp index 6a359d0be2..b79ec0c9c4 100644 --- a/src/storage/tablet/ob_tablet_table_store.cpp +++ b/src/storage/tablet/ob_tablet_table_store.cpp @@ -242,6 +242,102 @@ int ObTabletTableStore::build_ha_new_table_store( return ret; } +int ObTabletTableStore::batch_replace_sstables( + common::ObIAllocator &allocator, + ObTablet *tablet, + const ObIArray &table_handles, + const ObTabletTableStore &old_store) +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(tablet) || !old_store.is_valid()) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("init tablet table store get invalid argument", K(ret), KP(tablet), K(old_store)); + } else if (OB_FAIL(init(allocator, tablet))) { + LOG_WARN("failed to init a new empty table store", K(ret)); + } else if (OB_FAIL(inner_replace_sstables(allocator, table_handles, old_store))) { + LOG_WARN("failed to build new table store 
with old store", K(ret)); + } + return ret; +} + +int ObTabletTableStore::inner_replace_sstables( + common::ObIAllocator &allocator, + const ObIArray &table_handles, + const ObTabletTableStore &old_store) +{ + int ret = OB_SUCCESS; + const ObITableArray &old_extend = old_store.extend_tables_; + // check table key first + ObTableHandleV2 tmp_handle; + for (int64_t i = 0; OB_SUCC(ret) && i < table_handles.count(); ++i) { + const ObITable *table = table_handles.at(i).get_table(); + if (OB_UNLIKELY(nullptr == table || !table->is_sstable())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("table must be sstable", K(ret), KPC(table)); + } else if (OB_FAIL(old_store.get_table(table->get_key(), tmp_handle))) { + LOG_WARN("failed to get the same key sstable in old store", K(ret), KPC(table), K(old_store)); + } + } + ObSEArray major_tables; + ObSEArray minor_tables; + ObSEArray ddl_tables; + if (OB_FAIL(ret)) { + } else if (OB_FAIL(get_replaced_tables(table_handles, old_store.major_tables_, major_tables))) { + LOG_WARN("failed to get replaced major tables", K(ret)); + } else if (OB_FAIL(get_replaced_tables(table_handles, old_store.minor_tables_, minor_tables))) { + LOG_WARN("failed to get replaced minor tables", K(ret)); + } else if (OB_FAIL(get_replaced_tables(table_handles, old_store.ddl_sstables_, ddl_tables))) { + LOG_WARN("failed to get replaced ddl tables", K(ret)); + } else if (!major_tables.empty() && OB_FAIL(major_tables_.init_and_copy(allocator, major_tables))) { + LOG_WARN("failed to init major tables", K(ret)); + } else if (!minor_tables.empty() && OB_FAIL(minor_tables_.init_and_copy(allocator, minor_tables))) { + LOG_WARN("failed to init minor tables", K(ret)); + } else if (!ddl_tables.empty() && OB_FAIL(ddl_sstables_.init_and_copy(allocator, ddl_tables))) { + LOG_WARN("failed to init ddl tables", K(ret)); + } else if (nullptr != old_extend[BUF_MINOR] && OB_FAIL(extend_tables_.assign(BUF_MINOR, old_extend[BUF_MINOR]))) { + LOG_WARN("failed to build buf minor table", 
K(ret), K(old_extend)); + } else if (OB_FAIL(pull_memtables())) { + LOG_WARN("failed to pull memtable from memtable_mgr", K(ret)); + } else if (OB_FAIL(check_ready_for_read())) { + LOG_WARN("failed to check ready for read", K(ret)); + } else { + int tmp_ret = OB_SUCCESS; + if (OB_SUCCESS != (tmp_ret = init_read_cache())) { + if (OB_SNAPSHOT_DISCARDED != tmp_ret) { + LOG_WARN("failed to cache read iterator", K(tmp_ret)); + } + } + FLOG_INFO("succeed to batch replace table store", K(major_tables_), K(minor_tables_), K(memtables_), K(PRINT_TS(*this))); + } + return ret; +} + +int ObTabletTableStore::get_replaced_tables( + const ObIArray &table_handles, + const ObITableArray &old_tables, + ObSEArray &replaced_tables) const +{ + int ret = OB_SUCCESS; + replaced_tables.reset(); /* output starts as a copy of old_tables; same-key entries are then swapped for the tables in table_handles */ + if (OB_FAIL(old_tables.get_all_tables(replaced_tables))) { + LOG_WARN("failed to get all table from old tables", K(ret), K(old_tables)); + } + ObITable *table = nullptr; + for (int64_t idx = 0; OB_SUCC(ret) && idx < replaced_tables.count(); ++idx) { + if (OB_ISNULL(table = replaced_tables.at(idx))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get unexpected null table", K(ret), K(replaced_tables)); + } + for (int64_t pos = 0; OB_SUCC(ret) && pos < table_handles.count(); ++pos) { /* scan every replacement handle with pos; the outer idx must not move here */ + if (table->get_key() == table_handles.at(pos).get_table()->get_key()) { + replaced_tables[idx] = const_cast(table_handles.at(pos).get_table()); + break; + } + } + } + return ret; +} + int ObTabletTableStore::get_table(const ObITable::TableKey &table_key, ObTableHandleV2 &handle) const { int ret = OB_SUCCESS; diff --git a/src/storage/tablet/ob_tablet_table_store.h b/src/storage/tablet/ob_tablet_table_store.h index 8abebcd559..e7f2dd2ddd 100644 --- a/src/storage/tablet/ob_tablet_table_store.h +++ b/src/storage/tablet/ob_tablet_table_store.h @@ -114,6 +114,11 @@ public: ObTablet *tablet, const ObBatchUpdateTableStoreParam &param, const ObTabletTableStore &old_store); + int batch_replace_sstables( + common::ObIAllocator
&allocator, + ObTablet *tablet, + const ObIArray &table_handles, + const ObTabletTableStore &old_store); private: int build_new_table_store( @@ -202,6 +207,14 @@ private: common::ObIAllocator &allocator, const ObBatchUpdateTableStoreParam ¶m, const ObTabletTableStore &old_store); + int inner_replace_sstables( + common::ObIAllocator &allocator, + const ObIArray &table_handles, + const ObTabletTableStore &old_store); + int get_replaced_tables( + const ObIArray &table_handles, + const ObITableArray &old_tables, + ObSEArray &replaced_tables) const; public: static const int64_t TABLE_STORE_VERSION = 0x0100; diff --git a/unittest/storage/backup/test_backup_utils.cpp b/unittest/storage/backup/test_backup_utils.cpp index d725184c88..8e61324cda 100644 --- a/unittest/storage/backup/test_backup_utils.cpp +++ b/unittest/storage/backup/test_backup_utils.cpp @@ -123,7 +123,10 @@ int ObFakeBackupTabletProvider::get_next_batch_items(common::ObIArray