[Fix](inverted index) fix inverted query cache for chinese tokenizer (#21106)
1. query cache for chinese tokenizer is confusing when just converting w_char to char. 2. seperate query_type from inverted_index_reader to clean code.
This commit is contained in:
@ -238,6 +238,9 @@ Cache::Handle* InvertedIndexSearcherCache::_insert(const InvertedIndexSearcherCa
|
||||
InvertedIndexQueryCache* InvertedIndexQueryCache::_s_instance = nullptr;
|
||||
|
||||
bool InvertedIndexQueryCache::lookup(const CacheKey& key, InvertedIndexQueryCacheHandle* handle) {
|
||||
if (key.encode().empty()) {
|
||||
return false;
|
||||
}
|
||||
auto lru_handle = _cache->lookup(key.encode());
|
||||
if (lru_handle == nullptr) {
|
||||
return false;
|
||||
@ -257,6 +260,9 @@ void InvertedIndexQueryCache::insert(const CacheKey& key, std::shared_ptr<roarin
|
||||
cache_value_ptr->last_visit_time = UnixMillis();
|
||||
cache_value_ptr->bitmap = bitmap;
|
||||
cache_value_ptr->size = bitmap->getSizeInBytes();
|
||||
if (key.encode().empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto lru_handle = _cache->insert(key.encode(), (void*)cache_value_ptr.release(),
|
||||
bitmap->getSizeInBytes(), deleter, CachePriority::NORMAL);
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <CLucene.h> // IWYU pragma: keep
|
||||
#include <CLucene/config/repl_wchar.h>
|
||||
#include <CLucene/util/Misc.h>
|
||||
#include <butil/macros.h>
|
||||
#include <glog/logging.h>
|
||||
@ -35,6 +36,7 @@
|
||||
#include "io/fs/file_system.h"
|
||||
#include "io/fs/path.h"
|
||||
#include "olap/lru_cache.h"
|
||||
#include "olap/rowset/segment_v2/inverted_index_query_type.h"
|
||||
#include "runtime/memory/mem_tracker.h"
|
||||
#include "util/slice.h"
|
||||
#include "util/time.h"
|
||||
@ -183,7 +185,6 @@ private:
|
||||
DISALLOW_COPY_AND_ASSIGN(InvertedIndexCacheHandle);
|
||||
};
|
||||
|
||||
enum class InvertedIndexQueryType;
|
||||
class InvertedIndexQueryCacheHandle;
|
||||
|
||||
class InvertedIndexQueryCache {
|
||||
@ -201,9 +202,14 @@ public:
|
||||
key_buf.append("/");
|
||||
key_buf.append(column_name);
|
||||
key_buf.append("/");
|
||||
key_buf.append(1, static_cast<char>(query_type));
|
||||
auto query_type_str = InvertedIndexQueryType_toString(query_type);
|
||||
if (query_type_str.empty()) {
|
||||
return "";
|
||||
}
|
||||
key_buf.append(query_type_str);
|
||||
key_buf.append("/");
|
||||
key_buf.append(lucene::util::Misc::toString(value.c_str()));
|
||||
auto str = lucene_wcstoutf8string(value.c_str(), value.length());
|
||||
key_buf.append(str);
|
||||
return key_buf;
|
||||
}
|
||||
};
|
||||
|
||||
71
be/src/olap/rowset/segment_v2/inverted_index_query_type.h
Normal file
71
be/src/olap/rowset/segment_v2/inverted_index_query_type.h
Normal file
@ -0,0 +1,71 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace doris {
|
||||
namespace segment_v2 {
|
||||
|
||||
enum class InvertedIndexQueryType {
|
||||
UNKNOWN_QUERY = -1,
|
||||
EQUAL_QUERY = 0,
|
||||
LESS_THAN_QUERY = 1,
|
||||
LESS_EQUAL_QUERY = 2,
|
||||
GREATER_THAN_QUERY = 3,
|
||||
GREATER_EQUAL_QUERY = 4,
|
||||
MATCH_ANY_QUERY = 5,
|
||||
MATCH_ALL_QUERY = 6,
|
||||
MATCH_PHRASE_QUERY = 7,
|
||||
};
|
||||
|
||||
inline std::string InvertedIndexQueryType_toString(InvertedIndexQueryType query_type) {
|
||||
switch (query_type) {
|
||||
case InvertedIndexQueryType::UNKNOWN_QUERY: {
|
||||
return "UNKNOWN";
|
||||
}
|
||||
case InvertedIndexQueryType::EQUAL_QUERY: {
|
||||
return "EQ";
|
||||
}
|
||||
case InvertedIndexQueryType::LESS_THAN_QUERY: {
|
||||
return "LT";
|
||||
}
|
||||
case InvertedIndexQueryType::LESS_EQUAL_QUERY: {
|
||||
return "LE";
|
||||
}
|
||||
case InvertedIndexQueryType::GREATER_THAN_QUERY: {
|
||||
return "GT";
|
||||
}
|
||||
case InvertedIndexQueryType::GREATER_EQUAL_QUERY: {
|
||||
return "GE";
|
||||
}
|
||||
case InvertedIndexQueryType::MATCH_ANY_QUERY: {
|
||||
return "MANY";
|
||||
}
|
||||
case InvertedIndexQueryType::MATCH_ALL_QUERY: {
|
||||
return "MALL";
|
||||
}
|
||||
case InvertedIndexQueryType::MATCH_PHRASE_QUERY: {
|
||||
return "MPHRASE";
|
||||
}
|
||||
default:
|
||||
return "";
|
||||
}
|
||||
}
|
||||
} // namespace segment_v2
|
||||
} // namespace doris
|
||||
@ -344,6 +344,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, const std::string
|
||||
InvertedIndexQueryCache::CacheKey cache_key {index_file_path, column_name,
|
||||
InvertedIndexQueryType::EQUAL_QUERY,
|
||||
token_ws};
|
||||
VLOG_DEBUG << "cache_key:" << cache_key.encode();
|
||||
InvertedIndexQueryCacheHandle cache_handle;
|
||||
if (cache->lookup(cache_key, &cache_handle)) {
|
||||
stats->inverted_index_query_cache_hit++;
|
||||
|
||||
@ -30,6 +30,7 @@
|
||||
#include "io/fs/path.h"
|
||||
#include "olap/inverted_index_parser.h"
|
||||
#include "olap/rowset/segment_v2/inverted_index_compound_reader.h"
|
||||
#include "olap/rowset/segment_v2/inverted_index_query_type.h"
|
||||
#include "olap/tablet_schema.h"
|
||||
|
||||
namespace lucene {
|
||||
@ -63,18 +64,6 @@ enum class InvertedIndexReaderType {
|
||||
BKD = 2,
|
||||
};
|
||||
|
||||
enum class InvertedIndexQueryType {
|
||||
UNKNOWN_QUERY = -1,
|
||||
EQUAL_QUERY = 0,
|
||||
LESS_THAN_QUERY = 1,
|
||||
LESS_EQUAL_QUERY = 2,
|
||||
GREATER_THAN_QUERY = 3,
|
||||
GREATER_EQUAL_QUERY = 4,
|
||||
MATCH_ANY_QUERY = 5,
|
||||
MATCH_ALL_QUERY = 6,
|
||||
MATCH_PHRASE_QUERY = 7,
|
||||
};
|
||||
|
||||
class InvertedIndexReader {
|
||||
public:
|
||||
explicit InvertedIndexReader(io::FileSystemSPtr fs, const std::string& path,
|
||||
|
||||
@ -27,3 +27,41 @@
|
||||
-- !sql --
|
||||
3 人民可以得到更多实惠
|
||||
|
||||
-- !sql --
|
||||
2 我爱你中国
|
||||
|
||||
-- !sql --
|
||||
2 我爱你中国
|
||||
|
||||
-- !sql --
|
||||
1 我来到北京清华大学
|
||||
|
||||
-- !sql --
|
||||
1 我来到北京清华大学
|
||||
|
||||
-- !sql --
|
||||
1 我来到北京清华大学
|
||||
|
||||
-- !sql --
|
||||
3 人民可以得到更多实惠
|
||||
|
||||
-- !sql --
|
||||
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
|
||||
|
||||
-- !sql --
|
||||
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
|
||||
|
||||
-- !sql --
|
||||
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
|
||||
|
||||
-- !sql --
|
||||
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
|
||||
|
||||
-- !sql --
|
||||
|
||||
-- !sql --
|
||||
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
|
||||
|
||||
-- !sql --
|
||||
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
|
||||
|
||||
|
||||
@ -77,4 +77,37 @@ suite("test_chinese_analyzer"){
|
||||
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '大学' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华大学' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '人民' ORDER BY id;"
|
||||
|
||||
def indexTblName3 = "chinese_analyzer_test3"
|
||||
|
||||
sql "DROP TABLE IF EXISTS ${indexTblName3}"
|
||||
// create 1 replica table
|
||||
sql """
|
||||
CREATE TABLE IF NOT EXISTS ${indexTblName3}(
|
||||
`id`int(11)NULL,
|
||||
`c` text NULL,
|
||||
INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode", "support_phrase"="true") COMMENT ''
|
||||
) ENGINE=OLAP
|
||||
DUPLICATE KEY(`id`)
|
||||
COMMENT 'OLAP'
|
||||
DISTRIBUTED BY HASH(`id`) BUCKETS 1
|
||||
PROPERTIES(
|
||||
"replication_allocation" = "tag.location.default: 1"
|
||||
);
|
||||
"""
|
||||
|
||||
sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.');"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '我爱你' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL'我爱你' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '清华' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '大学' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '清华大学' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '人民' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '陕西' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '12345678901' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '12345678' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '1.1.1.1' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '陕西西安' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '陕西省西安市' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH 'information' ORDER BY id;"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user