[FTS.BUGFIX] fix compatibility of MySQL default fulltext parser

2024-04-28 03:58:17 +00:00
parent 1442b06e77
commit c4ed3f10af
16 changed files with 541 additions and 296 deletions
--- a/unittest/storage/test_fts_plugin.cpp
+++ b/unittest/storage/test_fts_plugin.cpp
@ -81,16 +81,23 @@ public:
  static const char *TEST_FULLTEXT;
  static const int64_t TEST_WORD_COUNT = 5;
  static const int64_t TEST_WORD_COUNT_WITHOUT_STOPWORD = 4;
+  static const int64_t FT_MIN_WORD_LEN = 3;
+  static const int64_t FT_MAX_WORD_LEN = 84;
 public:
-  ObTestAddWord();
+  ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator);
  virtual ~ObTestAddWord() = default;
  virtual int operator()(
      lib::ObFTParserParam *param,
      const char *word,
-      const int64_t word_len) override;
+      const int64_t word_len,
+      const int64_t char_cnt) override;
  virtual int64_t get_add_word_count() const override { return ith_word_; }
  VIRTUAL_TO_STRING_KV(K_(ith_word));
 private:
+  bool is_min_max_word(const int64_t c_len) const;
+  int casedown_word(const ObFTWord &src, ObFTWord &dst);
+  ObCollationType collation_type_;
+  common::ObIAllocator &allocator_;
  const char *words_[TEST_WORD_COUNT];
  const char *words_without_stopword_[TEST_WORD_COUNT_WITHOUT_STOPWORD];
  int64_t ith_word_;
@ -98,26 +105,57 @@ private:

 const char *ObTestAddWord::TEST_FULLTEXT = "OceanBase fulltext search is No.1 in the world.";

-ObTestAddWord::ObTestAddWord()
-  : words_{"oceanbase", "fulltext", "search", "the", "world"},
+ObTestAddWord::ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator)
+  : collation_type_(type),
+    allocator_(allocator),
+    words_{"oceanbase", "fulltext", "search", "the", "world"},
    words_without_stopword_{"oceanbase", "fulltext", "search", "world"},
    ith_word_(0)
 {
 }

+bool ObTestAddWord::is_min_max_word(const int64_t c_len) const
+{
+  return c_len < FT_MIN_WORD_LEN || c_len > FT_MAX_WORD_LEN;
+}
+
+int ObTestAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst)
+{
+  int ret = OB_SUCCESS;
+  if (OB_UNLIKELY(src.empty())) {
+    ret = OB_INVALID_ARGUMENT;
+    LOG_WARN("invalid src ft word", K(ret), K(src));
+  } else {
+    ObString dst_str;
+    if (OB_FAIL(ObCharset::tolower(collation_type_, src.get_word(), dst_str, allocator_))) {
+      LOG_WARN("fail to tolower", K(ret), K(src), K(collation_type_));
+    } else {
+      ObFTWord tmp(dst_str.length(), dst_str.ptr(), collation_type_);
+      dst = tmp;
+    }
+  }
+  return ret;
+}
+
 int ObTestAddWord::operator()(
      lib::ObFTParserParam *param,
      const char *word,
-      const int64_t word_len)
+      const int64_t word_len,
+      const int64_t char_cnt)
 {
  int ret = OB_SUCCESS;
-  if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
+  ObFTWord src_word(word_len, word, collation_type_);
+  ObFTWord dst_word;
+  if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) {
    ret = OB_INVALID_ARGUMENT;
-    LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len));
-  } else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], word, word_len))) {
+    LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len), K(char_cnt));
+  } else if (is_min_max_word(char_cnt)) {
+    // skip min/max word
+  } else if (OB_FAIL(casedown_word(src_word, dst_word))) {
+    LOG_WARN("fail to casedown word", K(ret), K(src_word));
+  } else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], dst_word.get_word().ptr(), dst_word.get_word().length()))) {
    ret = OB_ERR_UNEXPECTED;
-    LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]),
-        KCSTRING(word), K(word_len));
+    LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]), K(dst_word));
  } else {
    ++ith_word_;
  }
@ -136,17 +174,17 @@ public:
 private:
  lib::ObPluginParam plugin_param_;
  lib::ObFTParserParam ft_parser_param_;
-  ObTestAddWord add_word_;
  ObWhiteSpaceFTParserDesc desc_;
  common::ObArenaAllocator allocator_;
+  ObTestAddWord add_word_;
 };

 TestDefaultFTParser::TestDefaultFTParser()
  : plugin_param_(),
    ft_parser_param_(),
-    add_word_(),
    desc_(),
-    allocator_()
+    allocator_(),
+    add_word_(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_)
 {
  plugin_param_.desc_ = &desc_;
 }
@ -190,7 +228,8 @@ TEST_F(TestDefaultFTParser, test_space_ft_parser_segment)
 TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268)
 {
  common::ObArray<ObFTWord> words;
-  ObNoStopWordAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, allocator_, words);
+  ObAddWordFlag flag;
+  ObAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, flag, allocator_, words);
  const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 ";
  const int64_t ft_len = strlen(fulltext);

@ -291,7 +330,7 @@ void ObTestFTPluginHelper::TearDown()
 //  ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc));
 //  ASSERT_TRUE(nullptr != desc);
 //
-//  ObTestAddWord test_add_word;
+//  ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_);
 //  ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_vserion*/, desc, cs_, TEST_FULLTEXT,
 //        strlen(TEST_FULLTEXT), allocator_, test_add_word));
 //}
@ -326,7 +365,7 @@ void ObTestFTPluginHelper::TearDown()
 //  ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc));
 //  ASSERT_TRUE(nullptr != desc);
 //
-//  ObTestAddWord test_add_word;
+//  ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_);
 //  ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_vserion*/, desc, cs_, TEST_FULLTEXT,
 //        strlen(TEST_FULLTEXT), allocator_, test_add_word));
 //
@ -408,7 +447,7 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext)
  ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
        std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));

-  ObTestAddWord test_add_word;
+  ObTestAddWord test_add_word(cs_type_, allocator_);
  for (int64_t i = 0; i < words.count(); ++i) {
    ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length()));
  }
@ -514,7 +553,7 @@ const char *ObTestNgramFTParseHelper::name_ = "ngram.1";

 ObTestNgramFTParseHelper::ObTestNgramFTParseHelper()
  : plugin_name_(STRLEN(name_), name_),
-    ngram_words_{"Oc", "ce", "ea", "an", "nB", "Ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "No", "in", "th", "he", "wo", "or", "rl", "ld"},
+    ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"},
    cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN),
    allocator_()
 {