[FTS.BUGFIX] fix compatibility of MySQL default fulltext parser

This commit is contained in:
Tyshawn
2024-04-28 03:58:17 +00:00
committed by ob-robot
parent 1442b06e77
commit c4ed3f10af
16 changed files with 541 additions and 296 deletions

View File

@ -81,16 +81,23 @@ public:
static const char *TEST_FULLTEXT;
static const int64_t TEST_WORD_COUNT = 5;
static const int64_t TEST_WORD_COUNT_WITHOUT_STOPWORD = 4;
static const int64_t FT_MIN_WORD_LEN = 3;
static const int64_t FT_MAX_WORD_LEN = 84;
public:
ObTestAddWord();
ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator);
virtual ~ObTestAddWord() = default;
virtual int operator()(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len) override;
const int64_t word_len,
const int64_t char_cnt) override;
virtual int64_t get_add_word_count() const override { return ith_word_; }
VIRTUAL_TO_STRING_KV(K_(ith_word));
private:
bool is_min_max_word(const int64_t c_len) const;
int casedown_word(const ObFTWord &src, ObFTWord &dst);
ObCollationType collation_type_;
common::ObIAllocator &allocator_;
const char *words_[TEST_WORD_COUNT];
const char *words_without_stopword_[TEST_WORD_COUNT_WITHOUT_STOPWORD];
int64_t ith_word_;
@ -98,26 +105,57 @@ private:
const char *ObTestAddWord::TEST_FULLTEXT = "OceanBase fulltext search is No.1 in the world.";
ObTestAddWord::ObTestAddWord()
: words_{"oceanbase", "fulltext", "search", "the", "world"},
ObTestAddWord::ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator)
: collation_type_(type),
allocator_(allocator),
words_{"oceanbase", "fulltext", "search", "the", "world"},
words_without_stopword_{"oceanbase", "fulltext", "search", "world"},
ith_word_(0)
{
}
// Tells whether a word of length c_len falls outside the accepted range,
// i.e. is shorter than FT_MIN_WORD_LEN or longer than FT_MAX_WORD_LEN.
// Returns true for such out-of-range lengths, false otherwise.
bool ObTestAddWord::is_min_max_word(const int64_t c_len) const
{
  const bool too_short = c_len < FT_MIN_WORD_LEN;
  const bool too_long = c_len > FT_MAX_WORD_LEN;
  return too_short || too_long;
}
int ObTestAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(src.empty())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid src ft word", K(ret), K(src));
} else {
ObString dst_str;
if (OB_FAIL(ObCharset::tolower(collation_type_, src.get_word(), dst_str, allocator_))) {
LOG_WARN("fail to tolower", K(ret), K(src), K(collation_type_));
} else {
ObFTWord tmp(dst_str.length(), dst_str.ptr(), collation_type_);
dst = tmp;
}
}
return ret;
}
int ObTestAddWord::operator()(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len)
const int64_t word_len,
const int64_t char_cnt)
{
int ret = OB_SUCCESS;
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
ObFTWord src_word(word_len, word, collation_type_);
ObFTWord dst_word;
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len));
} else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], word, word_len))) {
LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len), K(char_cnt));
} else if (is_min_max_word(char_cnt)) {
// skip min/max word
} else if (OB_FAIL(casedown_word(src_word, dst_word))) {
LOG_WARN("fail to casedown word", K(ret), K(src_word));
} else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], dst_word.get_word().ptr(), dst_word.get_word().length()))) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]),
KCSTRING(word), K(word_len));
LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]), K(dst_word));
} else {
++ith_word_;
}
@ -136,17 +174,17 @@ public:
private:
lib::ObPluginParam plugin_param_;
lib::ObFTParserParam ft_parser_param_;
ObTestAddWord add_word_;
ObWhiteSpaceFTParserDesc desc_;
common::ObArenaAllocator allocator_;
ObTestAddWord add_word_;
};
TestDefaultFTParser::TestDefaultFTParser()
: plugin_param_(),
ft_parser_param_(),
add_word_(),
desc_(),
allocator_()
allocator_(),
add_word_(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_)
{
plugin_param_.desc_ = &desc_;
}
@ -190,7 +228,8 @@ TEST_F(TestDefaultFTParser, test_space_ft_parser_segment)
TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268)
{
common::ObArray<ObFTWord> words;
ObNoStopWordAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, allocator_, words);
ObAddWordFlag flag;
ObAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, flag, allocator_, words);
const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 ";
const int64_t ft_len = strlen(fulltext);
@ -291,7 +330,7 @@ void ObTestFTPluginHelper::TearDown()
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc));
// ASSERT_TRUE(nullptr != desc);
//
// ObTestAddWord test_add_word;
// ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_);
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_version*/, desc, cs_, TEST_FULLTEXT,
// strlen(TEST_FULLTEXT), allocator_, test_add_word));
//}
@ -326,7 +365,7 @@ void ObTestFTPluginHelper::TearDown()
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc));
// ASSERT_TRUE(nullptr != desc);
//
// ObTestAddWord test_add_word;
// ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_);
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_version*/, desc, cs_, TEST_FULLTEXT,
// strlen(TEST_FULLTEXT), allocator_, test_add_word));
//
@ -408,7 +447,7 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext)
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
ObTestAddWord test_add_word;
ObTestAddWord test_add_word(cs_type_, allocator_);
for (int64_t i = 0; i < words.count(); ++i) {
ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length()));
}
@ -514,7 +553,7 @@ const char *ObTestNgramFTParseHelper::name_ = "ngram.1";
ObTestNgramFTParseHelper::ObTestNgramFTParseHelper()
: plugin_name_(STRLEN(name_), name_),
ngram_words_{"Oc", "ce", "ea", "an", "nB", "Ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "No", "in", "th", "he", "wo", "or", "rl", "ld"},
ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"},
cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN),
allocator_()
{