From aba843bb2b06a851b44f3ea7ef8a5e93d6d89747 Mon Sep 17 00:00:00 2001 From: Kang Date: Sat, 11 Feb 2023 13:38:58 +0800 Subject: [PATCH] [Improvement](inverted index) inverted index query match bitmap cache (#16578) Add cache for inverted index query match bitmap to accelerate common query keyword, especially for keyword matching many rows. Tests result: - large result: matching 99% out of 247 million rows shows 8x speed up. - small result: matching 0.1% out of 247 million rows shows 2x speed up. --- be/src/common/config.h | 3 + be/src/olap/olap_common.h | 8 + .../olap/rowset/segment_v2/column_reader.cpp | 3 +- be/src/olap/rowset/segment_v2/column_reader.h | 2 +- .../segment_v2/inverted_index_cache.cpp | 24 +- .../rowset/segment_v2/inverted_index_cache.h | 100 ++++++- .../segment_v2/inverted_index_reader.cpp | 276 +++++++++++------- .../rowset/segment_v2/inverted_index_reader.h | 68 +++-- be/src/olap/rowset/segment_v2/segment.cpp | 4 +- be/src/olap/rowset/segment_v2/segment.h | 3 +- .../rowset/segment_v2/segment_iterator.cpp | 2 +- be/src/runtime/exec_env_init.cpp | 13 + be/src/vec/exec/scan/new_olap_scan_node.cpp | 17 +- be/src/vec/exec/scan/new_olap_scan_node.h | 8 + be/src/vec/exec/scan/new_olap_scanner.cpp | 15 + .../inverted_index_searcher_cache_test.cpp | 20 +- 16 files changed, 419 insertions(+), 147 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index 7dd3345a70..100664fb97 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -894,6 +894,9 @@ CONF_String(inverted_index_searcher_cache_limit, "10%"); CONF_Bool(enable_write_index_searcher_cache, "true"); CONF_Bool(enable_inverted_index_cache_check_timestamp, "true"); +// inverted index match bitmap cache size +CONF_String(inverted_index_query_cache_limit, "10%"); + // inverted index CONF_mDouble(inverted_index_ram_buffer_size, "512"); CONF_Int32(query_bkd_inverted_index_limit_percent, "5"); // 5% diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index 
07b0b28983..35179caf6d 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -344,6 +344,14 @@ struct OlapReaderStatistics { int64_t rows_inverted_index_filtered = 0; int64_t inverted_index_filter_timer = 0; + int64_t inverted_index_query_timer = 0; + int64_t inverted_index_query_cache_hit = 0; + int64_t inverted_index_query_cache_miss = 0; + int64_t inverted_index_query_bitmap_copy_timer = 0; + int64_t inverted_index_query_bitmap_op_timer = 0; + int64_t inverted_index_searcher_open_timer = 0; + int64_t inverted_index_searcher_search_timer = 0; + int64_t inverted_index_searcher_bitmap_timer = 0; int64_t output_index_result_column_timer = 0; // number of segment filtered by column stat when creating seg iterator diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 95e83ee049..abc2726e2a 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -187,10 +187,11 @@ Status ColumnReader::new_bitmap_index_iterator(BitmapIndexIterator** iterator) { } Status ColumnReader::new_inverted_index_iterator(const TabletIndex* index_meta, + OlapReaderStatistics* stats, InvertedIndexIterator** iterator) { RETURN_IF_ERROR(_ensure_inverted_index_loaded(index_meta)); if (_inverted_index) { - RETURN_IF_ERROR(_inverted_index->new_iterator(index_meta, iterator)); + RETURN_IF_ERROR(_inverted_index->new_iterator(index_meta, stats, iterator)); } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index b093e7938e..e462077d65 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -105,7 +105,7 @@ public: // Client should delete returned iterator Status new_bitmap_index_iterator(BitmapIndexIterator** iterator); - Status new_inverted_index_iterator(const TabletIndex* index_meta, + Status 
new_inverted_index_iterator(const TabletIndex* index_meta, OlapReaderStatistics* stats, InvertedIndexIterator** iterator); // Seek to the first entry in the column. diff --git a/be/src/olap/rowset/segment_v2/inverted_index_cache.cpp b/be/src/olap/rowset/segment_v2/inverted_index_cache.cpp index 08375739ab..0dc14bcf67 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_cache.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_cache.cpp @@ -70,7 +70,7 @@ Status InvertedIndexSearcherCache::get_index_searcher(const io::FileSystemSPtr& const std::string& index_dir, const std::string& file_name, InvertedIndexCacheHandle* cache_handle, - bool use_cache) { + OlapReaderStatistics* stats, bool use_cache) { auto file_path = index_dir + "/" + file_name; using namespace std::chrono; @@ -85,12 +85,14 @@ Status InvertedIndexSearcherCache::get_index_searcher(const io::FileSystemSPtr& cache_handle->owned = false; return Status::OK(); } + cache_handle->owned = !use_cache; IndexSearcherPtr index_searcher = nullptr; auto mem_tracker = std::unique_ptr(new MemTracker("InvertedIndexSearcherCacheWithRead")); #ifndef BE_TEST { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_open_timer); SCOPED_CONSUME_MEM_TRACKER(mem_tracker.get()); index_searcher = build_index_searcher(fs, index_dir, file_name); } @@ -169,5 +171,25 @@ Cache::Handle* InvertedIndexSearcherCache::_insert(const InvertedIndexSearcherCa return lru_handle; } +InvertedIndexQueryCache* InvertedIndexQueryCache::_s_instance = nullptr; + +bool InvertedIndexQueryCache::lookup(const CacheKey& key, InvertedIndexQueryCacheHandle* handle) { + auto lru_handle = _cache->lookup(key.encode()); + if (lru_handle == nullptr) { + return false; + } + *handle = InvertedIndexQueryCacheHandle(_cache.get(), lru_handle); + return true; +} + +void InvertedIndexQueryCache::insert(const CacheKey& key, roaring::Roaring* bitmap, + InvertedIndexQueryCacheHandle* handle) { + auto deleter = [](const doris::CacheKey& key, void* value) { delete 
(roaring::Roaring*)value; }; + + auto lru_handle = _cache->insert(key.encode(), (void*)bitmap, bitmap->getSizeInBytes(), deleter, + CachePriority::NORMAL); + *handle = InvertedIndexQueryCacheHandle(_cache.get(), lru_handle); +} + } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_cache.h b/be/src/olap/rowset/segment_v2/inverted_index_cache.h index 60c733e9cb..cf88c17b02 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_cache.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_cache.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "io/fs/file_system.h" @@ -71,7 +72,7 @@ public: Status get_index_searcher(const io::FileSystemSPtr& fs, const std::string& index_dir, const std::string& file_name, InvertedIndexCacheHandle* cache_handle, - bool use_cache = true); + OlapReaderStatistics* stats, bool use_cache = true); // function `insert` called after inverted index writer close Status insert(const io::FileSystemSPtr& fs, const std::string& index_dir, @@ -163,5 +164,102 @@ private: DISALLOW_COPY_AND_ASSIGN(InvertedIndexCacheHandle); }; +enum class InvertedIndexQueryType; + +class InvertedIndexQueryCacheHandle; + +class InvertedIndexQueryCache { +public: + // cache key + struct CacheKey { + io::Path index_path; // index file path + std::string column_name; // column name + InvertedIndexQueryType query_type; // query type + std::wstring value; // query value + + // Encode to a flat binary which can be used as LRUCache's key + std::string encode() const { + std::string key_buf(index_path.string()); + key_buf.append("/"); + key_buf.append(column_name); + key_buf.append("/"); + key_buf.append(1, static_cast(query_type)); + key_buf.append("/"); + key_buf.append(lucene::util::Misc::toString(value.c_str())); + return key_buf; + } + }; + + using CacheValue = roaring::Roaring; + + // Create global instance of this class + static void create_global_cache(size_t capacity, int32_t 
index_cache_percentage, + uint32_t num_shards = 16) { + DCHECK(_s_instance == nullptr); + static InvertedIndexQueryCache instance(capacity, index_cache_percentage, num_shards); + _s_instance = &instance; + } + + // Return global instance. + // Client should call create_global_cache before. + static InvertedIndexQueryCache* instance() { return _s_instance; } + + InvertedIndexQueryCache() = delete; + + InvertedIndexQueryCache(size_t capacity, int32_t index_cache_percentage, uint32_t num_shards) { + _cache = std::unique_ptr( + new_lru_cache("InvertedIndexQueryCache", capacity, LRUCacheType::SIZE, num_shards)); + } + + bool lookup(const CacheKey& key, InvertedIndexQueryCacheHandle* handle); + + void insert(const CacheKey& key, roaring::Roaring* bitmap, + InvertedIndexQueryCacheHandle* handle); + +private: + static InvertedIndexQueryCache* _s_instance; + std::unique_ptr _cache {nullptr}; +}; + +class InvertedIndexQueryCacheHandle { +public: + InvertedIndexQueryCacheHandle() {} + + InvertedIndexQueryCacheHandle(Cache* cache, Cache::Handle* handle) + : _cache(cache), _handle(handle) {} + + ~InvertedIndexQueryCacheHandle() { + if (_handle != nullptr) { + _cache->release(_handle); + } + } + + InvertedIndexQueryCacheHandle(InvertedIndexQueryCacheHandle&& other) noexcept { + // we can use std::exchange if we switch c++14 on + std::swap(_cache, other._cache); + std::swap(_handle, other._handle); + } + + InvertedIndexQueryCacheHandle& operator=(InvertedIndexQueryCacheHandle&& other) noexcept { + std::swap(_cache, other._cache); + std::swap(_handle, other._handle); + return *this; + } + + Cache* cache() const { return _cache; } + Slice data() const { return _cache->value_slice(_handle); } + + InvertedIndexQueryCache::CacheValue* match_bitmap() const { + return ((InvertedIndexQueryCache::CacheValue*)_cache->value(_handle)); + } + +private: + Cache* _cache = nullptr; + Cache::Handle* _handle = nullptr; + + // Don't allow copy and assign + 
DISALLOW_COPY_AND_ASSIGN(InvertedIndexQueryCacheHandle); +}; + } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 402a88f2c5..5ed46c6ec4 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -100,19 +100,27 @@ std::vector FullTextIndexReader::get_analyse_result( return analyse_result; } -Status FullTextIndexReader::new_iterator(const TabletIndex* index_meta, +Status FullTextIndexReader::new_iterator(const TabletIndex* index_meta, OlapReaderStatistics* stats, InvertedIndexIterator** iterator) { - *iterator = new InvertedIndexIterator(index_meta, this); + *iterator = new InvertedIndexIterator(index_meta, stats, this); return Status::OK(); } -Status FullTextIndexReader::query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, +Status FullTextIndexReader::query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, roaring::Roaring* bit_map) { + SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); + std::string search_str = reinterpret_cast(query_value)->to_string(); - VLOG_DEBUG << column_name - << " begin to load the fulltext index from clucene, query_str=" << search_str; + LOG(INFO) << column_name << " begin to search the fulltext index from clucene, query_str [" + << search_str << "]"; + + io::Path path(_path); + auto index_dir = path.parent_path(); + auto index_file_name = InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id); + auto index_file_path = index_dir / index_file_name; + std::unique_ptr query; std::wstring field_ws = std::wstring(column_name.begin(), column_name.end()); try { @@ -125,75 +133,93 @@ Status FullTextIndexReader::query(const std::string& column_name, const void* qu 
return Status::Error(); } - switch (query_type) { - case InvertedIndexQueryType::MATCH_ANY_QUERY: { - query.reset(_CLNEW lucene::search::BooleanQuery()); - for (auto token_ws : analyse_result) { - lucene::index::Term* term = - _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()); - static_cast(query.get()) - ->add(_CLNEW lucene::search::TermQuery(term), true, - lucene::search::BooleanClause::SHOULD); - _CLDECDELETE(term); + roaring::Roaring query_match_bitmap; + bool first = true; + for (auto token : analyse_result) { + roaring::Roaring* term_match_bitmap = nullptr; + + // try to get term bitmap match result from cache to avoid query index on cache hit + auto cache = InvertedIndexQueryCache::instance(); + // use EQUAL_QUERY type here since cache is for each term/token + InvertedIndexQueryCache::CacheKey cache_key { + index_file_path, column_name, InvertedIndexQueryType::EQUAL_QUERY, token}; + InvertedIndexQueryCacheHandle cache_handle; + if (cache->lookup(cache_key, &cache_handle)) { + stats->inverted_index_query_cache_hit++; + term_match_bitmap = cache_handle.match_bitmap(); + } else { + stats->inverted_index_query_cache_miss++; + term_match_bitmap = new roaring::Roaring(); + std::wstring token_ws = std::wstring(token.begin(), token.end()); + // unique_ptr with custom deleter + std::unique_ptr term { + _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()), + [](lucene::index::Term* term) { _CLDECDELETE(term); }}; + query.reset(new lucene::search::TermQuery(term.get())); + + InvertedIndexCacheHandle inverted_index_cache_handle; + InvertedIndexSearcherCache::instance()->get_index_searcher( + _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, + stats); + auto index_searcher = inverted_index_cache_handle.get_index_searcher(); + + try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + index_searcher->_search( + query.get(), [&term_match_bitmap, stats](const int32_t docid, + const float_t /*score*/) { + 
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_bitmap_timer); + // docid equal to rowid in segment + term_match_bitmap->add(docid); + }); + } catch (const CLuceneError& e) { + LOG(WARNING) << "CLuceneError occured: " << e.what(); + return Status::Error(); + } + + { + // add to cache + term_match_bitmap->runOptimize(); + cache->insert(cache_key, term_match_bitmap, &cache_handle); + } + } + + // add to query_match_bitmap + if (first) { + SCOPED_RAW_TIMER(&stats->inverted_index_query_bitmap_copy_timer); + query_match_bitmap = *term_match_bitmap; + first = false; + continue; + } + + switch (query_type) { + case InvertedIndexQueryType::MATCH_ANY_QUERY: { + SCOPED_RAW_TIMER(&stats->inverted_index_query_bitmap_op_timer); + query_match_bitmap |= *term_match_bitmap; + break; + } + case InvertedIndexQueryType::MATCH_ALL_QUERY: { + SCOPED_RAW_TIMER(&stats->inverted_index_query_bitmap_op_timer); + query_match_bitmap &= *term_match_bitmap; + break; + } + case InvertedIndexQueryType::MATCH_PHRASE_QUERY: { + return Status::Error(); + break; + } + default: { + LOG(ERROR) << "fulltext query do not support query type other than match."; + return Status::Error(); } - break; - } - case InvertedIndexQueryType::MATCH_ALL_QUERY: { - query.reset(_CLNEW lucene::search::BooleanQuery()); - for (auto token_ws : analyse_result) { - lucene::index::Term* term = - _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()); - static_cast(query.get()) - ->add(_CLNEW lucene::search::TermQuery(term), true, - lucene::search::BooleanClause::MUST); - _CLDECDELETE(term); } - break; - } - case InvertedIndexQueryType::MATCH_PHRASE_QUERY: { - LOG(WARNING) << "match phrase of fulltext is not supported"; - return Status::Error(); - } - default: - LOG(ERROR) << "fulltext query do not support query type other than match, column: " - << column_name; - return Status::Error(); } + bit_map->swap(query_match_bitmap); + return Status::OK(); } catch (const CLuceneError& e) { LOG(WARNING) << "CLuceneError 
occured, error msg: " << e.what(); return Status::Error(); } - - io::Path path(_path); - auto index_dir = path.parent_path(); - auto index_file_name = InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id); - - // check index file existence - auto index_file_path = index_dir / index_file_name; - if (!indexExists(index_file_path)) { - LOG(WARNING) << "inverted index path: " << index_file_path.string() << " not exist."; - return Status::Error(); - } - - roaring::Roaring result; - InvertedIndexCacheHandle inverted_index_cache_handle; - InvertedIndexSearcherCache::instance()->get_index_searcher( - _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle); - auto index_searcher = inverted_index_cache_handle.get_index_searcher(); - - try { - index_searcher->_search(query.get(), - [&result](const int32_t docid, const float_t /*score*/) { - // docid equal to rowid in segment - result.add(docid); - }); - } catch (const CLuceneError& e) { - LOG(WARNING) << "CLuceneError occured, error msg: " << e.what(); - return Status::Error(); - } - bit_map->swap(result); - return Status::OK(); } InvertedIndexReaderType FullTextIndexReader::type() { @@ -201,32 +227,53 @@ InvertedIndexReaderType FullTextIndexReader::type() { } Status StringTypeInvertedIndexReader::new_iterator(const TabletIndex* index_meta, + OlapReaderStatistics* stats, InvertedIndexIterator** iterator) { - *iterator = new InvertedIndexIterator(index_meta, this); + *iterator = new InvertedIndexIterator(index_meta, stats, this); return Status::OK(); } -Status StringTypeInvertedIndexReader::query(const std::string& column_name, const void* query_value, +Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, roaring::Roaring* bit_map) { + SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); + const StringRef* search_query = 
reinterpret_cast(query_value); auto act_len = strnlen(search_query->data, search_query->size); std::string search_str(search_query->data, act_len); + // std::string search_str = reinterpret_cast(query_value)->to_string(); VLOG_DEBUG << "begin to query the inverted index from clucene" << ", column_name: " << column_name << ", search_str: " << search_str; std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end()); std::wstring search_str_ws = std::wstring(search_str.begin(), search_str.end()); - lucene::index::Term* term = - _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()); + // unique_ptr with custom deleter + std::unique_ptr term { + _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()), + [](lucene::index::Term* term) { _CLDECDELETE(term); }}; std::unique_ptr query; io::Path path(_path); auto index_dir = path.parent_path(); auto index_file_name = InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id); + auto index_file_path = index_dir / index_file_name; + + // try to get query bitmap result from cache and return immediately on cache hit + InvertedIndexQueryCache::CacheKey cache_key {index_file_path, column_name, query_type, + search_str_ws}; + auto cache = InvertedIndexQueryCache::instance(); + InvertedIndexQueryCacheHandle cache_handle; + if (cache->lookup(cache_key, &cache_handle)) { + stats->inverted_index_query_cache_hit++; + SCOPED_RAW_TIMER(&stats->inverted_index_query_bitmap_copy_timer); + *bit_map = *cache_handle.match_bitmap(); + return Status::OK(); + } else { + stats->inverted_index_query_cache_miss++; + } // check index file existence - auto index_file_path = index_dir / index_file_name; if (!indexExists(index_file_path)) { LOG(WARNING) << "inverted index path: " << index_file_path.string() << " not exist."; return Status::Error(); @@ -234,28 +281,23 @@ Status StringTypeInvertedIndexReader::query(const std::string& column_name, cons switch (query_type) { case 
InvertedIndexQueryType::EQUAL_QUERY: { - query.reset(new lucene::search::TermQuery(term)); - _CLDECDELETE(term); + query.reset(new lucene::search::TermQuery(term.get())); break; } case InvertedIndexQueryType::LESS_THAN_QUERY: { - query.reset(new lucene::search::RangeQuery(nullptr, term, false)); - _CLDECDELETE(term); + query.reset(new lucene::search::RangeQuery(nullptr, term.get(), false)); break; } case InvertedIndexQueryType::LESS_EQUAL_QUERY: { - query.reset(new lucene::search::RangeQuery(nullptr, term, true)); - _CLDECDELETE(term); + query.reset(new lucene::search::RangeQuery(nullptr, term.get(), true)); break; } case InvertedIndexQueryType::GREATER_THAN_QUERY: { - query.reset(new lucene::search::RangeQuery(term, nullptr, false)); - _CLDECDELETE(term); + query.reset(new lucene::search::RangeQuery(term.get(), nullptr, false)); break; } case InvertedIndexQueryType::GREATER_EQUAL_QUERY: { - query.reset(new lucene::search::RangeQuery(term, nullptr, true)); - _CLDECDELETE(term); + query.reset(new lucene::search::RangeQuery(term.get(), nullptr, true)); break; } default: @@ -272,13 +314,15 @@ Status StringTypeInvertedIndexReader::query(const std::string& column_name, cons roaring::Roaring result; InvertedIndexCacheHandle inverted_index_cache_handle; InvertedIndexSearcherCache::instance()->get_index_searcher( - _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle); + _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, stats); auto index_searcher = inverted_index_cache_handle.get_index_searcher(); try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); index_searcher->_search(query.get(), - [&result](const int32_t docid, const float_t /*score*/) { + [&result, stats](const int32_t docid, const float_t /*score*/) { // docid equal to rowid in segment + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_bitmap_timer); result.add(docid); }); } catch (const CLuceneError& e) { @@ -286,6 +330,11 @@ Status 
StringTypeInvertedIndexReader::query(const std::string& column_name, cons return Status::Error(); } + // add to cache + roaring::Roaring* term_match_bitmap = new roaring::Roaring(result); + term_match_bitmap->runOptimize(); + cache->insert(cache_key, term_match_bitmap, &cache_handle); + bit_map->swap(result); return Status::OK(); } @@ -312,14 +361,14 @@ BkdIndexReader::BkdIndexReader(io::FileSystemSPtr fs, const std::string& path, DorisCompoundDirectory::getDirectory(fs, index_dir.c_str()), index_file_name.c_str()); } -Status BkdIndexReader::new_iterator(const TabletIndex* index_meta, +Status BkdIndexReader::new_iterator(const TabletIndex* index_meta, OlapReaderStatistics* stats, InvertedIndexIterator** iterator) { - *iterator = new InvertedIndexIterator(index_meta, this); + *iterator = new InvertedIndexIterator(index_meta, stats, this); return Status::OK(); } -Status BkdIndexReader::bkd_query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, +Status BkdIndexReader::bkd_query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr&& r, InvertedIndexVisitor* visitor) { lucene::util::bkd::bkd_reader* tmp_reader; @@ -359,15 +408,15 @@ Status BkdIndexReader::bkd_query(const std::string& column_name, const void* que return Status::OK(); } -Status BkdIndexReader::try_query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, +Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, uint32_t* count) { uint64_t start = UnixMillis(); auto visitor = std::make_unique(nullptr, query_type, true); std::shared_ptr r; try { - RETURN_IF_ERROR( - bkd_query(column_name, query_value, query_type, std::move(r), visitor.get())); + RETURN_IF_ERROR(bkd_query(stats, column_name, 
query_value, query_type, std::move(r), + visitor.get())); *count = r->estimate_point_count(visitor.get()); } catch (const CLuceneError& e) { LOG(WARNING) << "BKD Query CLuceneError Occurred, error msg: " << e.what(); @@ -379,21 +428,49 @@ Status BkdIndexReader::try_query(const std::string& column_name, const void* que return Status::OK(); } -Status BkdIndexReader::query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, +Status BkdIndexReader::query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, roaring::Roaring* bit_map) { + SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); + + io::Path path(_path); + auto index_dir = path.parent_path(); + auto index_file_name = InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id); + auto index_file_path = index_dir / index_file_name; + // std::string query_str {(const char *)query_value}; + + // // try to get query bitmap result from cache and return immediately on cache hit + // InvertedIndexQueryCache::CacheKey cache_key + // {index_file_path, column_name, query_type, std::wstring(query_str.begin(), query_str.end())}; + // auto cache = InvertedIndexQueryCache::instance(); + // InvertedIndexQueryCacheHandle cache_handle; + // if (cache->lookup(cache_key, &cache_handle)) { + // stats->inverted_index_query_cache_hit++; + // SCOPED_RAW_TIMER(&stats->inverted_index_query_bitmap_copy_timer); + // *bit_map = *cache_handle.match_bitmap(); + // return Status::OK(); + // } else { + // stats->inverted_index_query_cache_miss++; + // } + uint64_t start = UnixMillis(); auto visitor = std::make_unique(bit_map, query_type); std::shared_ptr r; try { - RETURN_IF_ERROR( - bkd_query(column_name, query_value, query_type, std::move(r), visitor.get())); + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + RETURN_IF_ERROR(bkd_query(stats, column_name, 
query_value, query_type, std::move(r), + visitor.get())); r->intersect(visitor.get()); } catch (const CLuceneError& e) { LOG(WARNING) << "BKD Query CLuceneError Occurred, error msg: " << e.what(); return Status::Error(); } + // // add to cache + // roaring::Roaring* term_match_bitmap = new roaring::Roaring(*bit_map); + // term_match_bitmap->runOptimize(); + // cache->insert(cache_key, term_match_bitmap, &cache_handle); + LOG(INFO) << "BKD index search time taken: " << UnixMillis() - start << "ms " << " column: " << column_name << " result: " << bit_map->cardinality() << " reader stats: " << r->stats.to_string(); @@ -631,7 +708,8 @@ Status InvertedIndexIterator::read_from_inverted_index(const std::string& column } } - RETURN_IF_ERROR(_reader->query(column_name, query_value, query_type, _analyser_type, bit_map)); + RETURN_IF_ERROR( + _reader->query(_stats, column_name, query_value, query_type, _analyser_type, bit_map)); return Status::OK(); } @@ -645,8 +723,8 @@ Status InvertedIndexIterator::try_read_from_inverted_index(const std::string& co query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY || query_type == InvertedIndexQueryType::LESS_THAN_QUERY || query_type == InvertedIndexQueryType::EQUAL_QUERY) { - RETURN_IF_ERROR( - _reader->try_query(column_name, query_value, query_type, _analyser_type, count)); + RETURN_IF_ERROR(_reader->try_query(_stats, column_name, query_value, query_type, + _analyser_type, count)); } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 1d0bfb5015..c8bdba41e9 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -69,13 +69,13 @@ public: virtual ~InvertedIndexReader() = default; // create a new column iterator. 
Client should delete returned iterator - virtual Status new_iterator(const TabletIndex* index_meta, + virtual Status new_iterator(const TabletIndex* index_meta, OlapReaderStatistics* stats, InvertedIndexIterator** iterator) = 0; - virtual Status query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, - roaring::Roaring* bit_map) = 0; - virtual Status try_query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + virtual Status query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + InvertedIndexParserType analyser_type, roaring::Roaring* bit_map) = 0; + virtual Status try_query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, uint32_t* count) = 0; virtual InvertedIndexReaderType type() = 0; @@ -98,13 +98,14 @@ public: : InvertedIndexReader(std::move(fs), path, uniq_id) {} ~FullTextIndexReader() override = default; - Status new_iterator(const TabletIndex* index_meta, InvertedIndexIterator** iterator) override; - Status query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, - roaring::Roaring* bit_map) override; - Status try_query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, - uint32_t* count) override { + Status new_iterator(const TabletIndex* index_meta, OlapReaderStatistics* stats, + InvertedIndexIterator** iterator) override; + Status query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + InvertedIndexParserType analyser_type, roaring::Roaring* bit_map) override; + Status try_query(OlapReaderStatistics* stats, 
const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + InvertedIndexParserType analyser_type, uint32_t* count) override { return Status::Error(); } @@ -122,13 +123,14 @@ public: : InvertedIndexReader(std::move(fs), path, uniq_id) {} ~StringTypeInvertedIndexReader() override = default; - Status new_iterator(const TabletIndex* index_meta, InvertedIndexIterator** iterator) override; - Status query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, - roaring::Roaring* bit_map) override; - Status try_query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, - uint32_t* count) override { + Status new_iterator(const TabletIndex* index_meta, OlapReaderStatistics* stats, + InvertedIndexIterator** iterator) override; + Status query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + InvertedIndexParserType analyser_type, roaring::Roaring* bit_map) override; + Status try_query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + InvertedIndexParserType analyser_type, uint32_t* count) override { return Status::Error(); } InvertedIndexReaderType type() override; @@ -179,16 +181,17 @@ public: } } - Status new_iterator(const TabletIndex* index_meta, InvertedIndexIterator** iterator) override; + Status new_iterator(const TabletIndex* index_meta, OlapReaderStatistics* stats, + InvertedIndexIterator** iterator) override; - Status query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type, - roaring::Roaring* bit_map) override; - Status try_query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, InvertedIndexParserType 
analyser_type, - uint32_t* count) override; - Status bkd_query(const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + Status query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + InvertedIndexParserType analyser_type, roaring::Roaring* bit_map) override; + Status try_query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, + InvertedIndexParserType analyser_type, uint32_t* count) override; + Status bkd_query(OlapReaderStatistics* stats, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr&& r, InvertedIndexVisitor* visitor); @@ -203,8 +206,10 @@ private: class InvertedIndexIterator { public: - InvertedIndexIterator(const TabletIndex* index_meta, InvertedIndexReader* reader) - : _index_meta(index_meta), _reader(reader) { + InvertedIndexIterator(const TabletIndex* index_meta, OlapReaderStatistics* stats, + InvertedIndexReader* reader) + : _index_meta(index_meta), _stats(stats), _reader(reader) { + // TODO xk maybe change interface to use index _analyser_type = get_inverted_index_parser_type_from_string( get_parser_string_from_properties(_index_meta->properties())); } @@ -221,6 +226,7 @@ public: private: const TabletIndex* _index_meta; + OlapReaderStatistics* _stats; InvertedIndexReader* _reader; InvertedIndexParserType _analyser_type; }; diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 7eeb15a717..1e81c86e08 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -302,10 +302,12 @@ Status Segment::new_bitmap_index_iterator(const TabletColumn& tablet_column, Status Segment::new_inverted_index_iterator(const TabletColumn& tablet_column, const TabletIndex* index_meta, + OlapReaderStatistics* stats, InvertedIndexIterator** 
iter) { auto col_unique_id = tablet_column.unique_id(); if (_column_readers.count(col_unique_id) > 0 && index_meta) { - return _column_readers.at(col_unique_id)->new_inverted_index_iterator(index_meta, iter); + return _column_readers.at(col_unique_id) + ->new_inverted_index_iterator(index_meta, stats, iter); } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index ba111def1f..078105eefb 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -81,7 +81,8 @@ public: Status new_bitmap_index_iterator(const TabletColumn& tablet_column, BitmapIndexIterator** iter); Status new_inverted_index_iterator(const TabletColumn& tablet_column, - const TabletIndex* index_meta, InvertedIndexIterator** iter); + const TabletIndex* index_meta, OlapReaderStatistics* stats, + InvertedIndexIterator** iter); const ShortKeyIndexDecoder* get_short_key_index() const { DCHECK(_load_index_once.has_called() && _load_index_once.stored_result().ok()); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index c199038813..7e8e254276 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -745,7 +745,7 @@ Status SegmentIterator::_init_inverted_index_iterators() { if (_inverted_index_iterators.count(unique_id) < 1) { RETURN_IF_ERROR(_segment->new_inverted_index_iterator( _opts.tablet_schema->column(cid), _opts.tablet_schema->get_inverted_index(cid), - &_inverted_index_iterators[unique_id])); + _opts.stats, &_inverted_index_iterators[unique_id])); } } return Status::OK(); diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 8b3146cdc2..a2770ce6f5 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -233,6 +233,19 @@ Status ExecEnv::_init_mem_env() { << 
<< PrettyPrinter::print(inverted_index_cache_limit, TUnit::BYTES) << ", origin config value: " << config::inverted_index_searcher_cache_limit; + // use memory limit + int64_t inverted_index_query_cache_limit = + ParseUtil::parse_mem_spec(config::inverted_index_query_cache_limit, + MemInfo::mem_limit(), MemInfo::physical_mem(), &is_percent); + while (!is_percent && inverted_index_query_cache_limit > MemInfo::mem_limit() / 2) { + // Reason same as buffer_pool_limit + inverted_index_query_cache_limit = inverted_index_query_cache_limit / 2; + } + InvertedIndexQueryCache::create_global_cache(inverted_index_query_cache_limit, 10); + LOG(INFO) << "Inverted index query match cache memory limit: " + << PrettyPrinter::print(inverted_index_query_cache_limit, TUnit::BYTES) + << ", origin config value: " << config::inverted_index_query_cache_limit; + // 4. init other managers RETURN_IF_ERROR(_tmp_file_mgr->init()); RETURN_IF_ERROR(_block_spill_mgr->init()); diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp b/be/src/vec/exec/scan/new_olap_scan_node.cpp index d50226b47b..154cab3acf 100644 --- a/be/src/vec/exec/scan/new_olap_scan_node.cpp +++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp @@ -110,7 +110,22 @@ Status NewOlapScanNode::_init_profile() { _inverted_index_filter_counter = ADD_COUNTER(_segment_profile, "RowsInvertedIndexFiltered", TUnit::UNIT); - _inverted_index_filter_timer = ADD_TIMER(_segment_profile, "InvertedIndexFilterTimer"); + _inverted_index_filter_timer = ADD_TIMER(_segment_profile, "InvertedIndexFilterTime"); + _inverted_index_query_cache_hit_counter = + ADD_COUNTER(_segment_profile, "InvertedIndexQueryCacheHit", TUnit::UNIT); + _inverted_index_query_cache_miss_counter = + ADD_COUNTER(_segment_profile, "InvertedIndexQueryCacheMiss", TUnit::UNIT); + _inverted_index_query_timer = ADD_TIMER(_segment_profile, "InvertedIndexQueryTime"); + _inverted_index_query_bitmap_copy_timer = + ADD_TIMER(_segment_profile, "InvertedIndexQueryBitmapCopyTime"); + 
_inverted_index_query_bitmap_op_timer = + ADD_TIMER(_segment_profile, "InvertedIndexQueryBitmapOpTime"); + _inverted_index_searcher_open_timer = + ADD_TIMER(_segment_profile, "InvertedIndexSearcherOpenTime"); + _inverted_index_searcher_search_timer = + ADD_TIMER(_segment_profile, "InvertedIndexSearcherSearchTime"); + _inverted_index_searcher_bitmap_timer = + ADD_TIMER(_segment_profile, "InvertedIndexSearcherGenBitmapTime"); _output_index_result_column_timer = ADD_TIMER(_segment_profile, "OutputIndexResultColumnTimer"); diff --git a/be/src/vec/exec/scan/new_olap_scan_node.h b/be/src/vec/exec/scan/new_olap_scan_node.h index af3a7d0ea5..c7a346b6db 100644 --- a/be/src/vec/exec/scan/new_olap_scan_node.h +++ b/be/src/vec/exec/scan/new_olap_scan_node.h @@ -129,6 +129,14 @@ private: RuntimeProfile::Counter* _inverted_index_filter_counter = nullptr; RuntimeProfile::Counter* _inverted_index_filter_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_query_cache_hit_counter = nullptr; + RuntimeProfile::Counter* _inverted_index_query_cache_miss_counter = nullptr; + RuntimeProfile::Counter* _inverted_index_query_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_query_bitmap_copy_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_query_bitmap_op_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_searcher_open_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_searcher_search_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_searcher_bitmap_timer = nullptr; RuntimeProfile::Counter* _output_index_result_column_timer = nullptr; diff --git a/be/src/vec/exec/scan/new_olap_scanner.cpp b/be/src/vec/exec/scan/new_olap_scanner.cpp index a80b9fd529..ba7e2ded37 100644 --- a/be/src/vec/exec/scan/new_olap_scanner.cpp +++ b/be/src/vec/exec/scan/new_olap_scanner.cpp @@ -496,6 +496,21 @@ void NewOlapScanner::_update_counters_before_close() { COUNTER_UPDATE(olap_parent->_inverted_index_filter_counter, stats.rows_inverted_index_filtered); 
COUNTER_UPDATE(olap_parent->_inverted_index_filter_timer, stats.inverted_index_filter_timer); + COUNTER_UPDATE(olap_parent->_inverted_index_query_cache_hit_counter, + stats.inverted_index_query_cache_hit); + COUNTER_UPDATE(olap_parent->_inverted_index_query_cache_miss_counter, + stats.inverted_index_query_cache_miss); + COUNTER_UPDATE(olap_parent->_inverted_index_query_timer, stats.inverted_index_query_timer); + COUNTER_UPDATE(olap_parent->_inverted_index_query_bitmap_copy_timer, + stats.inverted_index_query_bitmap_copy_timer); + COUNTER_UPDATE(olap_parent->_inverted_index_query_bitmap_op_timer, + stats.inverted_index_query_bitmap_op_timer); + COUNTER_UPDATE(olap_parent->_inverted_index_searcher_open_timer, + stats.inverted_index_searcher_open_timer); + COUNTER_UPDATE(olap_parent->_inverted_index_searcher_search_timer, + stats.inverted_index_searcher_search_timer); + COUNTER_UPDATE(olap_parent->_inverted_index_searcher_bitmap_timer, + stats.inverted_index_searcher_bitmap_timer); COUNTER_UPDATE(olap_parent->_output_index_result_column_timer, stats.output_index_result_column_timer); diff --git a/be/test/olap/rowset/segment_v2/inverted_index_searcher_cache_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_searcher_cache_test.cpp index b2ad7d58ab..38dd7d0699 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index_searcher_cache_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index_searcher_cache_test.cpp @@ -62,12 +62,14 @@ TEST_F(InvertedIndexSearcherCacheTest, insert_lookup) { status = index_searcher_cache->insert(fs, kTestDir, file_name_2); EXPECT_EQ(Status::OK(), status); + OlapReaderStatistics stats; + // lookup after insert { // case 1: lookup exist entry InvertedIndexCacheHandle inverted_index_cache_handle; status = index_searcher_cache->get_index_searcher(fs, kTestDir, file_name_1, - &inverted_index_cache_handle); + &inverted_index_cache_handle, &stats); EXPECT_EQ(Status::OK(), status); auto cache_value_1 = @@ -76,7 +78,7 @@ 
TEST_F(InvertedIndexSearcherCacheTest, insert_lookup) { EXPECT_GE(UnixMillis(), cache_value_1->last_visit_time); status = index_searcher_cache->get_index_searcher(fs, kTestDir, file_name_2, - &inverted_index_cache_handle); + &inverted_index_cache_handle, &stats); EXPECT_EQ(Status::OK(), status); auto cache_value_2 = @@ -92,8 +94,8 @@ TEST_F(InvertedIndexSearcherCacheTest, insert_lookup) { // use cache { InvertedIndexCacheHandle inverted_index_cache_handle_1; - status = index_searcher_cache->get_index_searcher(fs, kTestDir, file_name_not_exist_1, - &inverted_index_cache_handle_1); + status = index_searcher_cache->get_index_searcher( + fs, kTestDir, file_name_not_exist_1, &inverted_index_cache_handle_1, &stats); EXPECT_EQ(Status::OK(), status); EXPECT_FALSE(inverted_index_cache_handle_1.owned); } @@ -101,8 +103,8 @@ TEST_F(InvertedIndexSearcherCacheTest, insert_lookup) { // lookup again { InvertedIndexCacheHandle inverted_index_cache_handle_1; - status = index_searcher_cache->get_index_searcher(fs, kTestDir, file_name_not_exist_1, - &inverted_index_cache_handle_1); + status = index_searcher_cache->get_index_searcher( + fs, kTestDir, file_name_not_exist_1, &inverted_index_cache_handle_1, &stats); EXPECT_EQ(Status::OK(), status); EXPECT_FALSE(inverted_index_cache_handle_1.owned); @@ -114,15 +116,15 @@ TEST_F(InvertedIndexSearcherCacheTest, insert_lookup) { // not use cache InvertedIndexCacheHandle inverted_index_cache_handle_2; - status = index_searcher_cache->get_index_searcher(fs, kTestDir, file_name_not_exist_2, - &inverted_index_cache_handle_2, false); + status = index_searcher_cache->get_index_searcher( + fs, kTestDir, file_name_not_exist_2, &inverted_index_cache_handle_2, &stats, false); EXPECT_EQ(Status::OK(), status); EXPECT_TRUE(inverted_index_cache_handle_2.owned); EXPECT_EQ(nullptr, inverted_index_cache_handle_2._cache); EXPECT_EQ(nullptr, inverted_index_cache_handle_2._handle); status = index_searcher_cache->get_index_searcher(fs, kTestDir, 
file_name_not_exist_2, - &inverted_index_cache_handle_2); + &inverted_index_cache_handle_2, &stats); EXPECT_EQ(Status::OK(), status); EXPECT_FALSE(inverted_index_cache_handle_2.owned); auto cache_value_use_cache_2 =