Files
doris/be/test/olap/itoken_extractor_test.cpp
Jet He 75aa00d3d0 [Feature](NGram BloomFilter Index) add new ngram bloom filter index to speed up like query (#11579)
This PR implements a new bloom filter index, the NGram bloom filter index, which was proposed in #10733.
The new index can greatly improve LIKE query performance; in some of our test cases it yields an order-of-magnitude improvement.
For how to use it, see the docs in this PR. The index relies on ```enable_function_pushdown```;
you need to set it to ```true``` for the index to take effect on LIKE queries.
2022-12-28 18:01:50 +08:00

79 lines
2.8 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/itoken_extractor.h"
#include <gtest/gtest.h>
#include <string>
#include "common/logging.h"
#include "util/utf8_check.h"
namespace doris {
class TestITokenExtractor : public testing::Test {
public:
void SetUp() {}
void TearDown() {}
};
// Drives extractor.next_in_string() over `statement` until it returns false
// and asserts that the reported substrings, in order, equal `expect`.
//
// extractor: token extractor under test.
// statement: input text; the helper first asserts that it passes
//            validate_utf8().
// expect:    tokens expected from the extractor, in order.
//
// Both read-only arguments are taken by const reference so a call does not
// copy the string or the token list.
void runNextInString(const ITokenExtractor& extractor, const std::string& statement,
                     const std::vector<std::string>& expect) {
    ASSERT_TRUE(validate_utf8(statement.c_str(), statement.length()));

    std::vector<std::string> actual;
    actual.reserve(expect.size());

    // `pos` is the cursor handed back to the extractor on every call;
    // `token_start`/`token_length` receive the span of the token it reports.
    size_t pos = 0;
    size_t token_start = 0;
    size_t token_length = 0;
    while (extractor.next_in_string(statement.c_str(), statement.size(), &pos, &token_start,
                                    &token_length)) {
        actual.push_back(statement.substr(token_start, token_length));
    }

    ASSERT_EQ(expect, actual);
}
// Drives extractor.next_in_string_like() over the pattern `statement` until it
// returns false and asserts that the collected tokens, in order, equal
// `expect`.
//
// extractor: token extractor under test.
// statement: pattern text handed to the extractor.
// expect:    tokens expected from the extractor, in order.
//
// Both read-only arguments are taken by const reference so a call does not
// copy the string or the token list.
void runNextInStringLike(const ITokenExtractor& extractor, const std::string& statement,
                         const std::vector<std::string>& expect) {
    std::vector<std::string> actual;
    actual.reserve(expect.size());

    // `pos` is the cursor handed back to the extractor on every call; `str`
    // is the output buffer the extractor fills with the current token.
    size_t pos = 0;
    std::string str;
    while (extractor.next_in_string_like(statement.c_str(), statement.length(), &pos, str)) {
        actual.push_back(str);
    }

    ASSERT_EQ(expect, actual);
}
// Checks NgramTokenExtractor(2) on text mixing multi-byte and ASCII
// characters: every expected token covers two adjacent characters of the
// input, and consecutive tokens overlap by one character.
TEST_F(TestITokenExtractor, ngram_extractor) {
    NgramTokenExtractor bigram_extractor(2);

    const std::string input = u8"预计09发布i13手机。";
    const std::vector<std::string> expected_tokens = {
            u8"预计", u8"计0", u8"09", u8"9发", u8"发布", u8"布i",
            u8"i1", u8"13", u8"3手", u8"手机", u8"机。"};

    runNextInString(bigram_extractor, input, expected_tokens);
}
// Checks which tokens NgramTokenExtractor(2) reports for several LIKE-style
// patterns through next_in_string_like().
TEST_F(TestITokenExtractor, ngram_like_extractor) {
    NgramTokenExtractor bigram_extractor(2);

    // Two characters between the '%' signs: exactly one token is expected.
    runNextInStringLike(bigram_extractor, u8"%手机%", {u8"手机"});

    // A single character between the '%' signs: no token is expected.
    runNextInStringLike(bigram_extractor, u8"%机%", {});

    // Only "手机" is expected; nothing is expected from the leading "i_".
    runNextInStringLike(bigram_extractor, u8"i_%手机%", {u8"手机"});

    // With the backslash in front of '_', the underscore is expected to show
    // up as a literal character inside the first token.
    runNextInStringLike(bigram_extractor, u8"\\_手机%", {u8"_手", u8"手机"});
}
} // namespace doris