[Fix](inverted index) fix inverted query cache for chinese tokenizer (#21106)

1. The query cache key for the Chinese tokenizer was confusing because the value was just converted from w_char to char; it is now encoded as UTF-8.
2. Separate query_type from inverted_index_reader to clean up the code.
This commit is contained in:
airborne12
2023-06-25 22:04:02 +08:00
committed by GitHub
parent 64790a3a86
commit 1ac8cdec7e
7 changed files with 159 additions and 15 deletions

View File

@ -238,6 +238,9 @@ Cache::Handle* InvertedIndexSearcherCache::_insert(const InvertedIndexSearcherCa
InvertedIndexQueryCache* InvertedIndexQueryCache::_s_instance = nullptr;
bool InvertedIndexQueryCache::lookup(const CacheKey& key, InvertedIndexQueryCacheHandle* handle) {
if (key.encode().empty()) {
return false;
}
auto lru_handle = _cache->lookup(key.encode());
if (lru_handle == nullptr) {
return false;
@ -257,6 +260,9 @@ void InvertedIndexQueryCache::insert(const CacheKey& key, std::shared_ptr<roarin
cache_value_ptr->last_visit_time = UnixMillis();
cache_value_ptr->bitmap = bitmap;
cache_value_ptr->size = bitmap->getSizeInBytes();
if (key.encode().empty()) {
return;
}
auto lru_handle = _cache->insert(key.encode(), (void*)cache_value_ptr.release(),
bitmap->getSizeInBytes(), deleter, CachePriority::NORMAL);

View File

@ -18,6 +18,7 @@
#pragma once
#include <CLucene.h> // IWYU pragma: keep
#include <CLucene/config/repl_wchar.h>
#include <CLucene/util/Misc.h>
#include <butil/macros.h>
#include <glog/logging.h>
@ -35,6 +36,7 @@
#include "io/fs/file_system.h"
#include "io/fs/path.h"
#include "olap/lru_cache.h"
#include "olap/rowset/segment_v2/inverted_index_query_type.h"
#include "runtime/memory/mem_tracker.h"
#include "util/slice.h"
#include "util/time.h"
@ -183,7 +185,6 @@ private:
DISALLOW_COPY_AND_ASSIGN(InvertedIndexCacheHandle);
};
enum class InvertedIndexQueryType;
class InvertedIndexQueryCacheHandle;
class InvertedIndexQueryCache {
@ -201,9 +202,14 @@ public:
key_buf.append("/");
key_buf.append(column_name);
key_buf.append("/");
key_buf.append(1, static_cast<char>(query_type));
auto query_type_str = InvertedIndexQueryType_toString(query_type);
if (query_type_str.empty()) {
return "";
}
key_buf.append(query_type_str);
key_buf.append("/");
key_buf.append(lucene::util::Misc::toString(value.c_str()));
auto str = lucene_wcstoutf8string(value.c_str(), value.length());
key_buf.append(str);
return key_buf;
}
};

View File

@ -0,0 +1,71 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
namespace doris {
namespace segment_v2 {

// Kinds of query that can be issued against an inverted index.
// Every enumerator carries an explicitly assigned numeric value.
enum class InvertedIndexQueryType {
    UNKNOWN_QUERY = -1,
    EQUAL_QUERY = 0,
    LESS_THAN_QUERY = 1,
    LESS_EQUAL_QUERY = 2,
    GREATER_THAN_QUERY = 3,
    GREATER_EQUAL_QUERY = 4,
    MATCH_ANY_QUERY = 5,
    MATCH_ALL_QUERY = 6,
    MATCH_PHRASE_QUERY = 7,
};

// Maps a query type to its short textual tag ("EQ", "LT", "MALL", ...).
//
// @param query_type the query type to describe.
// @return the tag for a known enumerator ("UNKNOWN" for UNKNOWN_QUERY), or an
//         empty string when query_type is not one of the declared enumerators.
inline std::string InvertedIndexQueryType_toString(InvertedIndexQueryType query_type) {
    using QueryType = InvertedIndexQueryType;

    // Stays empty unless one of the declared enumerators matches below.
    std::string tag;
    switch (query_type) {
    case QueryType::UNKNOWN_QUERY:
        tag = "UNKNOWN";
        break;
    case QueryType::EQUAL_QUERY:
        tag = "EQ";
        break;
    case QueryType::LESS_THAN_QUERY:
        tag = "LT";
        break;
    case QueryType::LESS_EQUAL_QUERY:
        tag = "LE";
        break;
    case QueryType::GREATER_THAN_QUERY:
        tag = "GT";
        break;
    case QueryType::GREATER_EQUAL_QUERY:
        tag = "GE";
        break;
    case QueryType::MATCH_ANY_QUERY:
        tag = "MANY";
        break;
    case QueryType::MATCH_ALL_QUERY:
        tag = "MALL";
        break;
    case QueryType::MATCH_PHRASE_QUERY:
        tag = "MPHRASE";
        break;
    default:
        break;
    }
    return tag;
}

} // namespace segment_v2
} // namespace doris

View File

@ -344,6 +344,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, const std::string
InvertedIndexQueryCache::CacheKey cache_key {index_file_path, column_name,
InvertedIndexQueryType::EQUAL_QUERY,
token_ws};
VLOG_DEBUG << "cache_key:" << cache_key.encode();
InvertedIndexQueryCacheHandle cache_handle;
if (cache->lookup(cache_key, &cache_handle)) {
stats->inverted_index_query_cache_hit++;

View File

@ -30,6 +30,7 @@
#include "io/fs/path.h"
#include "olap/inverted_index_parser.h"
#include "olap/rowset/segment_v2/inverted_index_compound_reader.h"
#include "olap/rowset/segment_v2/inverted_index_query_type.h"
#include "olap/tablet_schema.h"
namespace lucene {
@ -63,18 +64,6 @@ enum class InvertedIndexReaderType {
BKD = 2,
};
enum class InvertedIndexQueryType {
UNKNOWN_QUERY = -1,
EQUAL_QUERY = 0,
LESS_THAN_QUERY = 1,
LESS_EQUAL_QUERY = 2,
GREATER_THAN_QUERY = 3,
GREATER_EQUAL_QUERY = 4,
MATCH_ANY_QUERY = 5,
MATCH_ALL_QUERY = 6,
MATCH_PHRASE_QUERY = 7,
};
class InvertedIndexReader {
public:
explicit InvertedIndexReader(io::FileSystemSPtr fs, const std::string& path,

View File

@ -27,3 +27,41 @@
-- !sql --
3 人民可以得到更多实惠
-- !sql --
2 我爱你中国
-- !sql --
2 我爱你中国
-- !sql --
1 我来到北京清华大学
-- !sql --
1 我来到北京清华大学
-- !sql --
1 我来到北京清华大学
-- !sql --
3 人民可以得到更多实惠
-- !sql --
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
-- !sql --
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
-- !sql --
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
-- !sql --
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
-- !sql --
-- !sql --
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.
-- !sql --
4 陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.

View File

@ -77,4 +77,37 @@ suite("test_chinese_analyzer"){
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '大学' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华大学' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '人民' ORDER BY id;"
def indexTblName3 = "chinese_analyzer_test3"
sql "DROP TABLE IF EXISTS ${indexTblName3}"
// create 1 replica table
sql """
CREATE TABLE IF NOT EXISTS ${indexTblName3}(
`id`int(11)NULL,
`c` text NULL,
INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode", "support_phrase"="true") COMMENT ''
) ENGINE=OLAP
DUPLICATE KEY(`id`)
COMMENT 'OLAP'
DISTRIBUTED BY HASH(`id`) BUCKETS 1
PROPERTIES(
"replication_allocation" = "tag.location.default: 1"
);
"""
sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.');"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '我爱你' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL'我爱你' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '清华' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '大学' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '清华大学' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '人民' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '陕西' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '12345678901' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '12345678' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '1.1.1.1' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '陕西西安' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '陕西省西安市' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH 'information' ORDER BY id;"
}