[FTS.BUGFIX] fix compatibility of MySQL default fulltext parser
This commit is contained in:
@ -81,16 +81,23 @@ public:
|
||||
static const char *TEST_FULLTEXT;
|
||||
static const int64_t TEST_WORD_COUNT = 5;
|
||||
static const int64_t TEST_WORD_COUNT_WITHOUT_STOPWORD = 4;
|
||||
static const int64_t FT_MIN_WORD_LEN = 3;
|
||||
static const int64_t FT_MAX_WORD_LEN = 84;
|
||||
public:
|
||||
ObTestAddWord();
|
||||
ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator);
|
||||
virtual ~ObTestAddWord() = default;
|
||||
virtual int operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len) override;
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt) override;
|
||||
virtual int64_t get_add_word_count() const override { return ith_word_; }
|
||||
VIRTUAL_TO_STRING_KV(K_(ith_word));
|
||||
private:
|
||||
bool is_min_max_word(const int64_t c_len) const;
|
||||
int casedown_word(const ObFTWord &src, ObFTWord &dst);
|
||||
ObCollationType collation_type_;
|
||||
common::ObIAllocator &allocator_;
|
||||
const char *words_[TEST_WORD_COUNT];
|
||||
const char *words_without_stopword_[TEST_WORD_COUNT_WITHOUT_STOPWORD];
|
||||
int64_t ith_word_;
|
||||
@ -98,26 +105,57 @@ private:
|
||||
|
||||
const char *ObTestAddWord::TEST_FULLTEXT = "OceanBase fulltext search is No.1 in the world.";
|
||||
|
||||
ObTestAddWord::ObTestAddWord()
|
||||
: words_{"oceanbase", "fulltext", "search", "the", "world"},
|
||||
ObTestAddWord::ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator)
|
||||
: collation_type_(type),
|
||||
allocator_(allocator),
|
||||
words_{"oceanbase", "fulltext", "search", "the", "world"},
|
||||
words_without_stopword_{"oceanbase", "fulltext", "search", "world"},
|
||||
ith_word_(0)
|
||||
{
|
||||
}
|
||||
|
||||
bool ObTestAddWord::is_min_max_word(const int64_t c_len) const
|
||||
{
|
||||
return c_len < FT_MIN_WORD_LEN || c_len > FT_MAX_WORD_LEN;
|
||||
}
|
||||
|
||||
int ObTestAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_UNLIKELY(src.empty())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid src ft word", K(ret), K(src));
|
||||
} else {
|
||||
ObString dst_str;
|
||||
if (OB_FAIL(ObCharset::tolower(collation_type_, src.get_word(), dst_str, allocator_))) {
|
||||
LOG_WARN("fail to tolower", K(ret), K(src), K(collation_type_));
|
||||
} else {
|
||||
ObFTWord tmp(dst_str.length(), dst_str.ptr(), collation_type_);
|
||||
dst = tmp;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObTestAddWord::operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len)
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
|
||||
ObFTWord src_word(word_len, word, collation_type_);
|
||||
ObFTWord dst_word;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len));
|
||||
} else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], word, word_len))) {
|
||||
LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len), K(char_cnt));
|
||||
} else if (is_min_max_word(char_cnt)) {
|
||||
// skip min/max word
|
||||
} else if (OB_FAIL(casedown_word(src_word, dst_word))) {
|
||||
LOG_WARN("fail to casedown word", K(ret), K(src_word));
|
||||
} else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], dst_word.get_word().ptr(), dst_word.get_word().length()))) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]),
|
||||
KCSTRING(word), K(word_len));
|
||||
LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]), K(dst_word));
|
||||
} else {
|
||||
++ith_word_;
|
||||
}
|
||||
@ -136,17 +174,17 @@ public:
|
||||
private:
|
||||
lib::ObPluginParam plugin_param_;
|
||||
lib::ObFTParserParam ft_parser_param_;
|
||||
ObTestAddWord add_word_;
|
||||
ObWhiteSpaceFTParserDesc desc_;
|
||||
common::ObArenaAllocator allocator_;
|
||||
ObTestAddWord add_word_;
|
||||
};
|
||||
|
||||
TestDefaultFTParser::TestDefaultFTParser()
|
||||
: plugin_param_(),
|
||||
ft_parser_param_(),
|
||||
add_word_(),
|
||||
desc_(),
|
||||
allocator_()
|
||||
allocator_(),
|
||||
add_word_(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_)
|
||||
{
|
||||
plugin_param_.desc_ = &desc_;
|
||||
}
|
||||
@ -190,7 +228,8 @@ TEST_F(TestDefaultFTParser, test_space_ft_parser_segment)
|
||||
TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268)
|
||||
{
|
||||
common::ObArray<ObFTWord> words;
|
||||
ObNoStopWordAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, allocator_, words);
|
||||
ObAddWordFlag flag;
|
||||
ObAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, flag, allocator_, words);
|
||||
const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 ";
|
||||
const int64_t ft_len = strlen(fulltext);
|
||||
|
||||
@ -291,7 +330,7 @@ void ObTestFTPluginHelper::TearDown()
|
||||
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc));
|
||||
// ASSERT_TRUE(nullptr != desc);
|
||||
//
|
||||
// ObTestAddWord test_add_word;
|
||||
// ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_);
|
||||
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_vserion*/, desc, cs_, TEST_FULLTEXT,
|
||||
// strlen(TEST_FULLTEXT), allocator_, test_add_word));
|
||||
//}
|
||||
@ -326,7 +365,7 @@ void ObTestFTPluginHelper::TearDown()
|
||||
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc));
|
||||
// ASSERT_TRUE(nullptr != desc);
|
||||
//
|
||||
// ObTestAddWord test_add_word;
|
||||
// ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_);
|
||||
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_vserion*/, desc, cs_, TEST_FULLTEXT,
|
||||
// strlen(TEST_FULLTEXT), allocator_, test_add_word));
|
||||
//
|
||||
@ -408,7 +447,7 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext)
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
|
||||
ObTestAddWord test_add_word;
|
||||
ObTestAddWord test_add_word(cs_type_, allocator_);
|
||||
for (int64_t i = 0; i < words.count(); ++i) {
|
||||
ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length()));
|
||||
}
|
||||
@ -514,7 +553,7 @@ const char *ObTestNgramFTParseHelper::name_ = "ngram.1";
|
||||
|
||||
ObTestNgramFTParseHelper::ObTestNgramFTParseHelper()
|
||||
: plugin_name_(STRLEN(name_), name_),
|
||||
ngram_words_{"Oc", "ce", "ea", "an", "nB", "Ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "No", "in", "th", "he", "wo", "or", "rl", "ld"},
|
||||
ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"},
|
||||
cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN),
|
||||
allocator_()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user