diff --git a/deps/oblib/src/lib/ob_plugin.h b/deps/oblib/src/lib/ob_plugin.h index c80944f1e..676b31b33 100644 --- a/deps/oblib/src/lib/ob_plugin.h +++ b/deps/oblib/src/lib/ob_plugin.h @@ -223,7 +223,11 @@ public: public: ObIAddWord() = default; virtual ~ObIAddWord() = default; - virtual int operator()(ObFTParserParam *param, const char *word, const int64_t word_len) = 0; + virtual int operator()( + ObFTParserParam *param, + const char *word, + const int64_t word_len, + const int64_t char_cnt) = 0; virtual int64_t get_add_word_count() const = 0; DECLARE_PURE_VIRTUAL_TO_STRING; }; @@ -247,9 +251,9 @@ public: && 0 < ft_length_ && 0 <= parser_version_; } - inline int add_word(ObFTParserParam *param, const char *word, int64_t word_len) + inline int add_word(ObFTParserParam *param, const char *word, const int64_t word_len, const int64_t char_cnt) { - return (*add_word_)(param, word, word_len); + return (*add_word_)(param, word, word_len, char_cnt); } inline void reset() { diff --git a/src/storage/CMakeLists.txt b/src/storage/CMakeLists.txt index 82727852b..60c4a4f2e 100644 --- a/src/storage/CMakeLists.txt +++ b/src/storage/CMakeLists.txt @@ -170,6 +170,7 @@ ob_set_subtarget(ob_storage ckpt ) ob_set_subtarget(ob_storage fts + fts/ob_beng_ft_parser.cpp fts/ob_fts_plugin_mgr.cpp fts/ob_fts_plugin_helper.cpp fts/ob_fts_stop_word.cpp diff --git a/src/storage/fts/ob_beng_ft_parser.cpp b/src/storage/fts/ob_beng_ft_parser.cpp new file mode 100644 index 000000000..446915b73 --- /dev/null +++ b/src/storage/fts/ob_beng_ft_parser.cpp @@ -0,0 +1,179 @@ +/** + * Copyright (c) 2023 OceanBase + * OceanBase is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. 
+ * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. + */ + +#define USING_LOG_PREFIX STORAGE_FTS + +#include "lib/string/ob_string.h" +#include "storage/fts/ob_beng_ft_parser.h" + +using namespace oceanbase::common; + +namespace oceanbase +{ +namespace storage +{ + +/*static*/ int ObBEngFTParser::segment( + lib::ObFTParserParam *param, + const char *ft, + const int64_t ft_len) +{ + int ret = OB_SUCCESS; + ObDatum doc; + doc.set_string(ft, ft_len); + ObBEngFTParser parser; + share::ObITokenStream *token_stream = nullptr; + if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len)); + } else if (OB_FAIL(parser.init(param))) { + LOG_WARN("fail to initialize basic english parser", K(ret), KPC(param)); + } else if (FALSE_IT(doc.set_string(ft, ft_len))) { + } else if (OB_FAIL(parser.segment(doc, token_stream))) { + LOG_WARN("fail to segment fulltext by parser", K(ret), KP(ft), K(ft_len)); + } else if (OB_ISNULL(token_stream)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("token stream is nullptr", K(ret), KP(token_stream)); + } else { + ObDatum token; + int64_t token_freq = 0; + while (OB_SUCC(ret)) { + if (OB_FAIL(token_stream->get_next(token, token_freq))) { + if (OB_ITER_END != ret) { + LOG_WARN("fail to get next token", K(ret), KPC(token_stream)); + } + } else if (OB_FAIL(add_word(param, param->allocator_, token.ptr_, token.len_))) { + LOG_WARN("fail to add word", K(ret), K(token), KPC(param)); + } + } + if (OB_ITER_END == ret) { + ret = OB_SUCCESS; + } + } + return ret; +} + +/*static*/ int ObBEngFTParser::add_word( + lib::ObFTParserParam *param, 
+ common::ObIAllocator *allocator, + const char *word, + int64_t word_len) +{ + int ret = OB_SUCCESS; + char *buf = nullptr; + if (OB_ISNULL(param) + || OB_ISNULL(allocator) + || OB_ISNULL(word) + || OB_UNLIKELY(0 >= word_len)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arguments", K(ret), KPC(param), KP(allocator), KP(word), K(word_len)); + } else if (word_len < FT_MIN_WORD_LEN || word_len > FT_MAX_WORD_LEN) { + LOG_DEBUG("skip too small or large word", K(ret), K(word_len)); + } else if (OB_ISNULL(buf = static_cast<char *>(allocator->alloc(word_len)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("fail to allocate word memory", K(ret), K(word_len)); + } else if (FALSE_IT(MEMCPY(buf, word, word_len))) { + } else if (OB_FAIL(param->add_word(param, buf, word_len, word_len))) { + LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, buf)), K(ObString(word_len, word))); + } else { + LOG_DEBUG("succeed to add word", K(ObString(word_len, word))); + } + return ret; +} + +int ObBEngFTParser::init(lib::ObFTParserParam *param) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(is_inited_)) { + ret = OB_INIT_TWICE; + LOG_WARN("init twice", K(ret), K(is_inited_)); + } else if (OB_ISNULL(param) || OB_UNLIKELY(!param->is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("param is nullptr", K(ret), KPC(param)); + } else if (OB_UNLIKELY(UINT32_MAX < param->ft_length_)) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_)); + } else { + analysis_ctx_.cs_ = param->cs_; + analysis_ctx_.filter_stopword_ = false; + analysis_ctx_.need_grouping_ = false; + if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) { + LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_)); + } else { + is_inited_ = true; + } + } + if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) { + reset(); + } + return ret; +} + +int ObBEngFTParser::segment( + const common::ObDatum &doc, + 
share::ObITokenStream *&token_stream) +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(doc.ptr_) || OB_UNLIKELY(0 >= doc.len_)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arguments", K(ret), KP(doc.ptr_), K(doc.len_)); + } else if (OB_UNLIKELY(UINT32_MAX < doc.len_)) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(doc.len_)); + } else if (OB_FAIL(english_analyzer_.analyze(doc, token_stream))) { + LOG_WARN("fail to analyze document", K(ret), K(english_analyzer_), KP(doc.ptr_), K(doc.len_)); + } + return ret; +} + +void ObBEngFTParser::reset() +{ + analysis_ctx_.reset(); + english_analyzer_.reset(); + is_inited_ = false; +} + +ObBasicEnglishFTParserDesc::ObBasicEnglishFTParserDesc() + : is_inited_(false) +{ +} + +int ObBasicEnglishFTParserDesc::init(lib::ObPluginParam *param) +{ + is_inited_ = true; + return OB_SUCCESS; +} + +int ObBasicEnglishFTParserDesc::deinit(lib::ObPluginParam *param) +{ + reset(); + return OB_SUCCESS; +} + +int ObBasicEnglishFTParserDesc::segment(lib::ObFTParserParam *param) const +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_)); + } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", K(ret), KPC(param)); + } else if (OB_FAIL(ObBEngFTParser::segment(param, param->fulltext_, param->ft_length_))) { + LOG_WARN("fail to segment words for fulltext by beng", K(ret), KPC(param), + K(param->fulltext_), K(param->ft_length_)); + } + return ret; +} + +} // end namespace storage +} // end namespace oceanbase diff --git a/src/storage/fts/ob_beng_ft_parser.h b/src/storage/fts/ob_beng_ft_parser.h new file mode 100644 index 000000000..1f83d1d86 --- /dev/null +++ b/src/storage/fts/ob_beng_ft_parser.h @@ -0,0 +1,83 @@ +/** + * Copyright (c) 2023 OceanBase + * OceanBase is licensed 
under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. + */ + +#ifndef OB_BENG_FT_PARSER_H_ +#define OB_BENG_FT_PARSER_H_ + +#include "lib/ob_plugin.h" +#include "lib/utility/ob_macro_utils.h" +#include "lib/utility/ob_print_utils.h" +#include "share/text_analysis/ob_text_analyzer.h" + +namespace oceanbase +{ +namespace storage +{ + +class ObBEngFTParser final +{ +public: + static const int64_t FT_MIN_WORD_LEN = 3; + static const int64_t FT_MAX_WORD_LEN = 84; +public: + static int segment( + lib::ObFTParserParam *param, + const char *fulltext, + const int64_t ft_len); + +private: + ObBEngFTParser() + : analysis_ctx_(), + english_analyzer_(), + is_inited_(false) + {} + ~ObBEngFTParser() = default; + + static int add_word( + lib::ObFTParserParam *param, + common::ObIAllocator *allocator, + const char *word, + int64_t word_len); + int init(lib::ObFTParserParam *param); + void reset(); + int segment( + const common::ObDatum &doc, + share::ObITokenStream *&token_stream); + TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), K_(is_inited)); + +private: + share::ObTextAnalysisCtx analysis_ctx_; + share::ObEnglishTextAnalyzer english_analyzer_; + bool is_inited_; + + DISALLOW_COPY_AND_ASSIGN(ObBEngFTParser); +}; + +class ObBasicEnglishFTParserDesc final : public lib::ObIFTParserDesc +{ +public: + ObBasicEnglishFTParserDesc(); + virtual ~ObBasicEnglishFTParserDesc() = default; + virtual int init(lib::ObPluginParam *param) override; + virtual int deinit(lib::ObPluginParam *param) override; + virtual int segment(lib::ObFTParserParam *param) const override; + OB_INLINE void 
reset() { is_inited_ = false; } +private: + bool is_inited_; +}; + +static ObBasicEnglishFTParserDesc beng_parser; + +} // end namespace storage +} // end namespace oceanbase + +#endif // OB_BENG_FT_PARSER_H_ diff --git a/src/storage/fts/ob_fts_buildin_parser_register.ipp b/src/storage/fts/ob_fts_buildin_parser_register.ipp index 997e70cdc..8f38ba4bc 100644 --- a/src/storage/fts/ob_fts_buildin_parser_register.ipp +++ b/src/storage/fts/ob_fts_buildin_parser_register.ipp @@ -15,6 +15,7 @@ #include "storage/fts/ob_whitespace_ft_parser.h" #include "storage/fts/ob_ngram_ft_parser.h" +#include "storage/fts/ob_beng_ft_parser.h" ///////////////////////////////////// Default fulltext parser ////////////////////////////////////////// @@ -23,10 +24,10 @@ OB_DECLARE_PLUGIN(whitespace_parser) oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type "space", // name OB_PLUGIN_AUTHOR_OCEANBASE, // author - "This is a default space parser plugin.", // brief specification + "This is a default whitespace parser plugin.", // brief specification 0x00001, // version oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license - &oceanbase::storage::whitespace_parser, // default space parser plugin instance + &oceanbase::storage::whitespace_parser, // default space parser plugin instance }; OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInWhitespaceFTParser, whitespace_parser); @@ -46,4 +47,19 @@ OB_DECLARE_PLUGIN(ngram_parser) OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInNgramFTParser, ngram_parser); +///////////////////////////////////// Default fulltext parser ////////////////////////////////////////// + +OB_DECLARE_PLUGIN(beng_parser) +{ + oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type + "beng", // name + OB_PLUGIN_AUTHOR_OCEANBASE, // author + "This is a basic english parser plugin.", // brief specification + 0x00001, // version + oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license + 
+ &oceanbase::storage::beng_parser, // basic english parser plugin instance +}; + +OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInBEngFTParser, beng_parser); + #endif // OB_FTS_BUILD_IN_PARSER_REGISTER_H_ diff --git a/src/storage/fts/ob_fts_plugin_helper.cpp b/src/storage/fts/ob_fts_plugin_helper.cpp index eacf4bf12..e972758d1 100644 --- a/src/storage/fts/ob_fts_plugin_helper.cpp +++ b/src/storage/fts/ob_fts_plugin_helper.cpp @@ -145,7 +145,7 @@ ObFTParseHelper::ObFTParseHelper() allocator_(nullptr), parser_desc_(nullptr), parser_name_(), - filter_stopword_(false), + add_word_flag_(), is_inited_(false) { } @@ -178,7 +178,9 @@ int ObFTParseHelper::init( LOG_WARN("fail to get fulltext parser descriptor", K(ret), KPC(parse_handler)); } else { plugin_param_.desc_ = parser_desc_; - filter_stopword_ = need_stopword_list(parser_name_); + if (need_min_max_word(parser_name_)) { add_word_flag_.set_min_max_word(); } + if (need_castdn(parser_name_)) { add_word_flag_.set_casedown(); } + if (need_stopword_list(parser_name_)) { add_word_flag_.set_stop_word(); } allocator_ = allocator; is_inited_ = true; } @@ -193,7 +195,7 @@ void ObFTParseHelper::reset() parser_desc_ = nullptr; plugin_param_.reset(); allocator_ = nullptr; - filter_stopword_ = false; + add_word_flag_.clear(); is_inited_ = false; } @@ -220,17 +222,14 @@ int ObFTParseHelper::segment( LOG_WARN("unexpected error, charset info is nullptr", K(ret), K(type)); } else { words.reuse(); - lib::ObFTParserParam::ObIAddWord *add_word = nullptr; - if (OB_FAIL(alloc_add_word(type, words, add_word))) { - LOG_WARN("fail to allocate add word", K(ret), K(type)); - } else if (OB_FAIL(segment(parser_name_.get_parser_version(), parser_desc_, cs, fulltext, fulltext_len, *allocator_, - *add_word))) { + ObAddWord add_word(type, add_word_flag_, *allocator_, words); + if (OB_FAIL(segment(parser_name_.get_parser_version(), parser_desc_, cs, fulltext, fulltext_len, *allocator_, + add_word))) { LOG_WARN("fail to segment fulltext", K(ret), 
K(parser_name_), KP(parser_desc_), KP(cs), KP(fulltext), K(fulltext_len), KP(allocator_)); } else { - doc_length = add_word->get_add_word_count(); + doc_length = add_word.get_add_word_count(); } - free_add_word(add_word); } LOG_DEBUG("ft parse segment", K(ret), K(type), K(ObString(fulltext_len, fulltext)), K(words)); return ret; @@ -238,45 +237,23 @@ int ObFTParseHelper::segment( bool ObFTParseHelper::need_stopword_list(const ObFTParser &parser) { - share::ObPluginName name("space"); - return parser.get_parser_name() == name; + share::ObPluginName space("space"); + share::ObPluginName beng("beng"); + return parser.get_parser_name() == space || parser.get_parser_name() == beng; } -int ObFTParseHelper::alloc_add_word( - const ObCollationType &type, - common::ObIArray &words, - lib::ObFTParserParam::ObIAddWord *&add_word) const +bool ObFTParseHelper::need_min_max_word(const ObFTParser &parser) { - int ret = OB_SUCCESS; - common::ObMemAttr mem_attr(MTL_ID(), "FTAddWord"); - void *buf = nullptr; - const int64_t buf_size = filter_stopword_ ? 
sizeof(ObStopWordAddWord) : sizeof(ObNoStopWordAddWord); - if (OB_NOT_NULL(add_word)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("add word isn't nullptr", K(ret), KPC(add_word)); - } else if (OB_ISNULL(buf = ob_malloc(buf_size, mem_attr))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("fail to allocate memory", K(ret), K(buf_size)); - } else if (filter_stopword_) { - add_word = new (buf) ObStopWordAddWord(type, *allocator_, words); - } else { - add_word = new (buf) ObNoStopWordAddWord(type, *allocator_, words); - } - if (OB_FAIL(ret) && OB_NOT_NULL(buf)) { - ob_free(buf); - buf = nullptr; - add_word = nullptr; - } - return ret; + share::ObPluginName space("space"); + share::ObPluginName beng("beng"); + return parser.get_parser_name() == space || parser.get_parser_name() == beng; } -void ObFTParseHelper::free_add_word(lib::ObFTParserParam::ObIAddWord *&add_word) const +bool ObFTParseHelper::need_castdn(const ObFTParser &parser) { - if (OB_NOT_NULL(add_word)) { - add_word->~ObIAddWord(); - ob_free(static_cast(add_word)); - add_word = nullptr; - } + share::ObPluginName space("space"); + share::ObPluginName ngram("ngram"); + return parser.get_parser_name() == space || parser.get_parser_name() == ngram; } } // end namespace storage diff --git a/src/storage/fts/ob_fts_plugin_helper.h b/src/storage/fts/ob_fts_plugin_helper.h index 895c775c9..6dcb07306 100644 --- a/src/storage/fts/ob_fts_plugin_helper.h +++ b/src/storage/fts/ob_fts_plugin_helper.h @@ -107,6 +107,8 @@ private: common::ObIAllocator &allocator, lib::ObFTParserParam::ObIAddWord &add_word); static bool need_stopword_list(const ObFTParser &parser); + static bool need_castdn(const ObFTParser &parser); + static bool need_min_max_word(const ObFTParser &parser); int alloc_add_word( const ObCollationType &type, @@ -119,7 +121,7 @@ private: common::ObIAllocator *allocator_; lib::ObIFTParserDesc *parser_desc_; ObFTParser parser_name_; - bool filter_stopword_; + ObAddWordFlag add_word_flag_; bool is_inited_; private: 
diff --git a/src/storage/fts/ob_fts_plugin_mgr.cpp b/src/storage/fts/ob_fts_plugin_mgr.cpp index 9aca2e5f6..e3186dfb0 100644 --- a/src/storage/fts/ob_fts_plugin_mgr.cpp +++ b/src/storage/fts/ob_fts_plugin_mgr.cpp @@ -56,6 +56,8 @@ int ObTenantFTPluginMgr::register_plugins() LOG_WARN("fail to register default fulltext parser", K(ret)); } else if (OB_FAIL(register_plugin<ObBuildInNgramFTParser>())) { LOG_WARN("fail to register ngram fulltext parser", K(ret)); + } else if (OB_FAIL(register_plugin<ObBuildInBEngFTParser>())) { + LOG_WARN("fail to register basic english fulltext parser", K(ret)); } return ret; } @@ -67,6 +69,8 @@ void ObTenantFTPluginMgr::unregister_plugins() LOG_ERROR("fail to unregister default fulltext parser", K(ret)); } else if (OB_FAIL(unregister_plugin<ObBuildInNgramFTParser>())) { LOG_ERROR("fail to unregister ngram fulltext parser", K(ret)); + } else if (OB_FAIL(unregister_plugin<ObBuildInBEngFTParser>())) { + LOG_ERROR("fail to unregister basic english fulltext parser", K(ret)); } } @@ -133,6 +137,8 @@ int ObTenantFTPluginMgr::init_plugin_handler() LOG_WARN("fail to set default fulltext parser", K(ret)); } else if (OB_FAIL(set_plugin_handler<ObBuildInNgramFTParser>())) { LOG_WARN("fail to set ngram fulltext parser", K(ret)); + } else if (OB_FAIL(set_plugin_handler<ObBuildInBEngFTParser>())) { + LOG_WARN("fail to set basic english fulltext parser", K(ret)); } return ret; } diff --git a/src/storage/fts/ob_fts_stop_word.cpp b/src/storage/fts/ob_fts_stop_word.cpp index d57df7fc0..ae03a79d7 100644 --- a/src/storage/fts/ob_fts_stop_word.cpp +++ b/src/storage/fts/ob_fts_stop_word.cpp @@ -20,99 +20,87 @@ namespace oceanbase namespace storage { -ObNoStopWordAddWord::ObNoStopWordAddWord( +ObAddWord::ObAddWord( const ObCollationType &type, + const ObAddWordFlag &flag, common::ObIAllocator &allocator, common::ObIArray<ObFTWord> &word) : collation_type_(type), allocator_(allocator), words_(word), - word_count_(0) + min_max_word_cnt_(0), + non_stopword_cnt_(0), + stopword_cnt_(0), + flag_(flag) { } -int ObNoStopWordAddWord::operator()( +int ObAddWord::operator()( lib::ObFTParserParam *param, const char 
*word, - const int64_t word_len) -{ - int ret = OB_SUCCESS; - char *w_buf = nullptr; - if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len)); - } else if (OB_ISNULL(w_buf = static_cast(allocator_.alloc(word_len)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("fail to allocate memory for fulltext word", K(ret), K(word_len)); - } else { - MEMCPY(w_buf, word, word_len); - ObFTWord ft_word(word_len, w_buf, collation_type_); - if (OB_FAIL(words_.push_back(ft_word))) { - LOG_WARN("fail to push word into words array", K(ret), K(ft_word)); - } else { - ++word_count_; - } - } - if (OB_FAIL(ret)) { - if (OB_NOT_NULL(w_buf)) { - allocator_.free(w_buf); - w_buf = nullptr; - } - } - LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len)); - return ret; -} - -ObStopWordAddWord::ObStopWordAddWord( - const ObCollationType &type, - common::ObIAllocator &allocator, - common::ObIArray &word) - : collation_type_(type), - allocator_(allocator), - words_(word), - non_stopword_count_(0), - stopword_count_(0) -{ -} - -int ObStopWordAddWord::operator()( - lib::ObFTParserParam *param, - const char *word, - const int64_t word_len) + const int64_t word_len, + const int64_t char_cnt) { int ret = OB_SUCCESS; bool is_stopword = false; - ObFTWord ft_word(word_len, word, collation_type_); + ObFTWord src_word(word_len, word, collation_type_); + ObFTWord dst_word; if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len)); - } else if (OB_FAIL(OB_FT_PLUGIN_MGR.check_stopword(ft_word, is_stopword))) { - LOG_WARN("fail to check stopword", K(ret)); - } else if (is_stopword) { - // the word is stop word, just skip it. 
- ++stopword_count_; + } else if (is_min_max_word(char_cnt)) { + ++min_max_word_cnt_; + LOG_DEBUG("skip too small or large word", K(ret), K(src_word), K(char_cnt)); + } else if (OB_FAIL(casedown_word(src_word, dst_word))) { + LOG_WARN("fail to casedown word", K(ret), K(src_word)); + } else if (OB_FAIL(check_stopword(dst_word, is_stopword))) { + LOG_WARN("fail to check stopword", K(ret), K(dst_word)); + } else if (OB_UNLIKELY(is_stopword)) { + ++stopword_cnt_; + LOG_DEBUG("skip stopword", K(ret), K(dst_word)); + } else if (OB_FAIL(words_.push_back(dst_word))) { + LOG_WARN("fail to push word into words array", K(ret), K(dst_word)); } else { - char *w_buf = nullptr; - if (OB_ISNULL(w_buf = static_cast(allocator_.alloc(word_len)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("fail to allocate memory for fulltext word", K(ret), K(word_len)); - } else { - MEMCPY(w_buf, word, word_len); - ObFTWord non_stopword_ft_word(word_len, w_buf, collation_type_); - if (OB_FAIL(words_.push_back(non_stopword_ft_word))) { - LOG_WARN("fail to push word into words array", K(ret), K(non_stopword_ft_word)); - } else { - ++non_stopword_count_; - } - } - if (OB_FAIL(ret)) { - if (OB_NOT_NULL(w_buf)) { - allocator_.free(w_buf); - w_buf = nullptr; - } - } + ++non_stopword_cnt_; + LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len), K(char_cnt), K(src_word), K(dst_word)); + } + return ret; +} + +bool ObAddWord::is_min_max_word(const int64_t c_len) const +{ + return flag_.min_max_word() && (c_len < FT_MIN_WORD_LEN || c_len > FT_MAX_WORD_LEN); +} + +int ObAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(src.empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid src ft word", K(ret), K(src)); + } else if (flag_.casedown()) { + ObString dst_str; + if (OB_FAIL(ObCharset::tolower(collation_type_, src.get_word(), dst_str, allocator_))) { + LOG_WARN("fail to tolower", K(ret), K(src), K(collation_type_)); + } else { + ObFTWord 
tmp(dst_str.length(), dst_str.ptr(), collation_type_); + dst = tmp; + } + } else { + dst = src; + } + return ret; +} + +int ObAddWord::check_stopword(const ObFTWord &ft_word, bool &is_stopword) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(ft_word.empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arguments", K(ret), K(ft_word)); + } else if (flag_.stopword() && OB_FAIL(OB_FT_PLUGIN_MGR.check_stopword(ft_word, is_stopword))) { + LOG_WARN("fail to check stopword", K(ret)); } - LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len), K(is_stopword)); return ret; } diff --git a/src/storage/fts/ob_fts_stop_word.h b/src/storage/fts/ob_fts_stop_word.h index 44f5cba16..0135bda7e 100644 --- a/src/storage/fts/ob_fts_stop_word.h +++ b/src/storage/fts/ob_fts_stop_word.h @@ -63,49 +63,38 @@ static const char ob_stop_word_list[][FTS_STOP_WORD_MAX_LENGTH] = { "www" }; -class ObNoStopWordAddWord final : public lib::ObFTParserParam::ObIAddWord +class ObAddWord final : public lib::ObFTParserParam::ObIAddWord { public: - ObNoStopWordAddWord( + ObAddWord( const ObCollationType &type, + const ObAddWordFlag &flag, common::ObIAllocator &allocator, common::ObIArray &word); - virtual ~ObNoStopWordAddWord() = default; + virtual ~ObAddWord() = default; virtual int operator()( lib::ObFTParserParam *param, const char *word, - const int64_t word_len) override; - virtual int64_t get_add_word_count() const override { return word_count_; } - VIRTUAL_TO_STRING_KV(K_(collation_type), K_(word_count), K_(words)); + const int64_t word_len, + const int64_t char_cnt) override; + virtual int64_t get_add_word_count() const override { return non_stopword_cnt_; } + VIRTUAL_TO_STRING_KV(K_(collation_type), K_(min_max_word_cnt), K_(non_stopword_cnt), K_(stopword_cnt), + K_(words)); +public: + static const int64_t FT_MIN_WORD_LEN = 3; + static const int64_t FT_MAX_WORD_LEN = 84; private: - OB_INLINE common::ObIArray &get_words() { return words_; } + bool is_min_max_word(const int64_t c_len) 
const; + int casedown_word(const ObFTWord &src, ObFTWord &dst); + int check_stopword(const ObFTWord &word, bool &is_stopword); private: ObCollationType collation_type_; common::ObIAllocator &allocator_; common::ObIArray &words_; - int64_t word_count_; -}; - -class ObStopWordAddWord final : public lib::ObFTParserParam::ObIAddWord -{ -public: - ObStopWordAddWord( - const ObCollationType &type, - common::ObIAllocator &allocator, - common::ObIArray &word); - virtual ~ObStopWordAddWord() = default; - virtual int operator()( - lib::ObFTParserParam *param, - const char *word, - const int64_t word_len) override; - virtual int64_t get_add_word_count() const { return non_stopword_count_; } - VIRTUAL_TO_STRING_KV(K_(collation_type), K_(non_stopword_count), K_(stopword_count), K_(words)); -private: - ObCollationType collation_type_; - common::ObIAllocator &allocator_; - common::ObIArray &words_; - int64_t non_stopword_count_; - int64_t stopword_count_; + int64_t min_max_word_cnt_; + int64_t non_stopword_cnt_; + int64_t stopword_cnt_; + ObAddWordFlag flag_; }; } // end namespace storage diff --git a/src/storage/fts/ob_fts_struct.h b/src/storage/fts/ob_fts_struct.h index c912103f9..939410ca6 100644 --- a/src/storage/fts/ob_fts_struct.h +++ b/src/storage/fts/ob_fts_struct.h @@ -76,6 +76,36 @@ public: int64_t word_cnt_; }; +class ObAddWordFlag final +{ +private: + static const uint64_t AWF_NONE = 0; + static const uint64_t AWF_MIN_MAX_WORD = 1 << 0; // filter words that are less than a minimum or greater + // than a maximum word length. + static const uint64_t AWF_STOPWORD = 1 << 1; // filter by stop word table. + static const uint64_t AWF_CASEDOWN = 1 << 2; // convert characters from uppercase to lowercase. 
+public: + ObAddWordFlag() : flag_(AWF_NONE) {} + ~ObAddWordFlag() = default; +private: + void set_flag(const uint64_t flag) { flag_ |= flag; } + void clear_flag(const uint64_t flag) { flag_ &= ~flag; } + bool has_flag(const uint64_t flag) const { return (flag_ & flag) == flag; } +public: + void set_min_max_word() { set_flag(AWF_MIN_MAX_WORD); } + void set_stop_word() { set_flag(AWF_STOPWORD); } + void set_casedown() { set_flag(AWF_CASEDOWN); } + void clear() { flag_ = AWF_NONE; } + void clear_min_max_word() { clear_flag(AWF_MIN_MAX_WORD); } + void clear_stop_word() { clear_flag(AWF_STOPWORD); } + void clear_casedown() { clear_flag(AWF_CASEDOWN); } + bool min_max_word() const { return has_flag(AWF_MIN_MAX_WORD); } + bool stopword() const { return has_flag(AWF_STOPWORD); } + bool casedown() const { return has_flag(AWF_CASEDOWN); } +private: + uint64_t flag_; +}; + } // end namespace storage } // end namespace oceanbase diff --git a/src/storage/fts/ob_ngram_ft_parser.cpp b/src/storage/fts/ob_ngram_ft_parser.cpp index 135ddc466..1540feef1 100644 --- a/src/storage/fts/ob_ngram_ft_parser.cpp +++ b/src/storage/fts/ob_ngram_ft_parser.cpp @@ -56,8 +56,8 @@ namespace storage ++c_nums; } if (NGRAM_TOKEN_SIZE == c_nums) { - if (OB_FAIL(add_word(param, start, next - start))) { - LOG_WARN("fail to add word", K(ret), KP(param), KP(start), KP(next)); + if (OB_FAIL(add_word(param, start, next - start, c_nums))) { + LOG_WARN("fail to add word", K(ret), KP(param), KP(start), KP(next), K(c_nums)); } else { start += ob_mbcharlen_ptr(cs, start, end); c_nums = NGRAM_TOKEN_SIZE - 1; @@ -71,7 +71,8 @@ namespace storage /*static*/ int ObNgramFTParser::add_word( lib::ObFTParserParam *param, const char *word, - int64_t word_len) + const int64_t word_len, + const int64_t char_cnt) { int ret = OB_SUCCESS; if (OB_ISNULL(param) @@ -79,8 +80,8 @@ namespace storage || OB_UNLIKELY(0 >= word_len)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len)); - 
} else if (OB_FAIL(param->add_word(param, word, word_len))) { - LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, word))); + } else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) { + LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word))); } return ret; } diff --git a/src/storage/fts/ob_ngram_ft_parser.h b/src/storage/fts/ob_ngram_ft_parser.h index 64522b27b..5cbd931d0 100644 --- a/src/storage/fts/ob_ngram_ft_parser.h +++ b/src/storage/fts/ob_ngram_ft_parser.h @@ -37,7 +37,8 @@ private: static int add_word( lib::ObFTParserParam *param, const char *word, - int64_t word_len); + const int64_t word_len, + const int64_t char_cnt); private: DISABLE_COPY_ASSIGN(ObNgramFTParser); }; diff --git a/src/storage/fts/ob_whitespace_ft_parser.cpp b/src/storage/fts/ob_whitespace_ft_parser.cpp index b12c17bc9..ce21ab766 100644 --- a/src/storage/fts/ob_whitespace_ft_parser.cpp +++ b/src/storage/fts/ob_whitespace_ft_parser.cpp @@ -22,37 +22,51 @@ namespace oceanbase namespace storage { -/*static*/ int ObSpaceFTParser::segment( +#define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_') + +int ObSpaceFTParser::segment( lib::ObFTParserParam *param, const char *ft, const int64_t ft_len) { int ret = OB_SUCCESS; - ObDatum doc; - doc.set_string(ft, ft_len); - ObSpaceFTParser parser; - share::ObITokenStream *token_stream = nullptr; + const char *start = ft; + const char *next = start; + const char *end = start + ft_len; + int mbl = 0; if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len)); - } else if (OB_FAIL(parser.init(param))) { - LOG_WARN("fail to initialize space parser", K(ret), KPC(param)); - } else if (FALSE_IT(doc.set_string(ft, ft_len))) { - } else if (OB_FAIL(parser.segment(doc, token_stream))) { - LOG_WARN("fail to segment fulltext by 
parser", K(ret), KP(ft), K(ft_len)); - } else if (OB_ISNULL(token_stream)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("token stream is nullptr", K(ret), KP(token_stream)); } else { - ObDatum token; - int64_t token_freq = 0; - while (OB_SUCC(ret)) { - if (OB_FAIL(token_stream->get_next(token, token_freq))) { - if (OB_ITER_END != ret) { - LOG_WARN("fail to get next token", K(ret), KPC(token_stream)); + const ObCharsetInfo *cs = param->cs_; + while (OB_SUCC(ret) && next < end) { + while (next < end) { + int ctype; + mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end); + if (true_word_char(ctype, *next)) { + break; + } + next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); + } + if (next >= end) { + ret = OB_ITER_END; + } else { + int64_t c_nums = 0; + start = next; + while (next < end) { + int ctype; + mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end); + if (!true_word_char(ctype, *next)) { + break; + } + ++c_nums; + next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); + } + if (0 < c_nums && OB_FAIL(add_word(param, start, next - start, c_nums))) { + LOG_WARN("fail to add word", K(ret), KPC(param), KP(start), K(next)); + } else { + start = next; } - } else if (OB_FAIL(add_word(param, param->allocator_, token.ptr_, token.len_))) { - LOG_WARN("fail to add word", K(ret), K(token), KPC(param)); } } if (OB_ITER_END == ret) { @@ -62,86 +76,24 @@ namespace storage return ret; } -/*static*/ int ObSpaceFTParser::add_word( +int ObSpaceFTParser::add_word( lib::ObFTParserParam *param, - common::ObIAllocator *allocator, const char *word, - int64_t word_len) + const int64_t word_len, + const int64_t char_cnt) { int ret = OB_SUCCESS; - char *buf = nullptr; if (OB_ISNULL(param) - || OB_ISNULL(allocator) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KPC(param), KP(allocator), KP(word), K(word_len)); - } else if (word_len < FT_MIN_WORD_LEN || word_len > FT_MAX_WORD_LEN) { - LOG_DEBUG("skip too small or 
large word", K(ret), K(word_len)); - } else if (OB_ISNULL(buf = static_cast(allocator->alloc(word_len)))) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("fail to allocate word memory", K(ret), K(word_len)); - } else if (FALSE_IT(MEMCPY(buf, word, word_len))) { - } else if (OB_FAIL(param->add_word(param, buf, word_len))) { - LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, buf)), K(ObString(word_len, word))); - } else { - LOG_DEBUG("succeed to add word", K(ObString(word_len, word))); + LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len)); + } else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) { + LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word))); } return ret; } -int ObSpaceFTParser::init(lib::ObFTParserParam *param) -{ - int ret = OB_SUCCESS; - if (OB_UNLIKELY(is_inited_)) { - ret = OB_INIT_TWICE; - LOG_WARN("init twice", K(ret), K(is_inited_)); - } else if (OB_ISNULL(param) || OB_UNLIKELY(!param->is_valid())) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("param is nullptr", K(ret), KPC(param)); - } else if (OB_UNLIKELY(UINT32_MAX < param->ft_length_)) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_)); - } else { - analysis_ctx_.cs_ = param->cs_; - analysis_ctx_.filter_stopword_ = false; - analysis_ctx_.need_grouping_ = false; - if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) { - LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_)); - } else { - is_inited_ = true; - } - } - if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) { - reset(); - } - return ret; -} - -int ObSpaceFTParser::segment( - const common::ObDatum &doc, - share::ObITokenStream *&token_stream) -{ - int ret = OB_SUCCESS; - if (OB_ISNULL(doc.ptr_) || OB_UNLIKELY(0 >= doc.len_)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KP(doc.ptr_), K(doc.len_)); - } else 
if (OB_UNLIKELY(UINT32_MAX < doc.len_)) { - ret = OB_NOT_SUPPORTED; - LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(doc.len_)); - } else if (OB_FAIL(english_analyzer_.analyze(doc, token_stream))) { - LOG_WARN("fail to analyze document", K(ret), K(english_analyzer_), KP(doc.ptr_), K(doc.len_)); - } - return ret; -} - -void ObSpaceFTParser::reset() -{ - analysis_ctx_.reset(); - english_analyzer_.reset(); - is_inited_ = false; -} - ObWhiteSpaceFTParserDesc::ObWhiteSpaceFTParserDesc() : is_inited_(false) { diff --git a/src/storage/fts/ob_whitespace_ft_parser.h b/src/storage/fts/ob_whitespace_ft_parser.h index 11e0d2f59..9420c7ad9 100644 --- a/src/storage/fts/ob_whitespace_ft_parser.h +++ b/src/storage/fts/ob_whitespace_ft_parser.h @@ -10,8 +10,8 @@ * See the Mulan PubL v2 for more details. */ -#ifndef OB_DEFAULT_FT_PARSER_H_ -#define OB_DEFAULT_FT_PARSER_H_ +#ifndef OB_WHITESPACE_FT_PARSER_H_ +#define OB_WHITESPACE_FT_PARSER_H_ #include "lib/ob_plugin.h" #include "lib/utility/ob_macro_utils.h" @@ -26,40 +26,18 @@ namespace storage class ObSpaceFTParser final { public: - static const int64_t FT_MIN_WORD_LEN = 3; - static const int64_t FT_MAX_WORD_LEN = 84; -public: + ObSpaceFTParser() = default; + ~ObSpaceFTParser() = default; static int segment( lib::ObFTParserParam *param, const char *fulltext, const int64_t ft_len); - private: - ObSpaceFTParser() - : analysis_ctx_(), - english_analyzer_(), - is_inited_(false) - {} - ~ObSpaceFTParser() = default; - static int add_word( lib::ObFTParserParam *param, - common::ObIAllocator *allocator, const char *word, - int64_t word_len); - int init(lib::ObFTParserParam *param); - void reset(); - int segment( - const common::ObDatum &doc, - share::ObITokenStream *&token_stream); - TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), K_(is_inited)); - -private: - share::ObTextAnalysisCtx analysis_ctx_; - share::ObEnglishTextAnalyzer english_analyzer_; - bool is_inited_; - - 
DISALLOW_COPY_AND_ASSIGN(ObSpaceFTParser); + const int64_t word_len, + const int64_t char_cnt); }; class ObWhiteSpaceFTParserDesc final : public lib::ObIFTParserDesc @@ -76,8 +54,7 @@ private: }; static ObWhiteSpaceFTParserDesc whitespace_parser; - } // end namespace storage } // end namespace oceanbase -#endif // OB_DEFAULT_FT_PARSER_H_ +#endif // OB_WHITESPACE_FT_PARSER_H_ diff --git a/unittest/storage/test_fts_plugin.cpp b/unittest/storage/test_fts_plugin.cpp index 40ae8a08b..81f3b8d7a 100644 --- a/unittest/storage/test_fts_plugin.cpp +++ b/unittest/storage/test_fts_plugin.cpp @@ -81,16 +81,23 @@ public: static const char *TEST_FULLTEXT; static const int64_t TEST_WORD_COUNT = 5; static const int64_t TEST_WORD_COUNT_WITHOUT_STOPWORD = 4; + static const int64_t FT_MIN_WORD_LEN = 3; + static const int64_t FT_MAX_WORD_LEN = 84; public: - ObTestAddWord(); + ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator); virtual ~ObTestAddWord() = default; virtual int operator()( lib::ObFTParserParam *param, const char *word, - const int64_t word_len) override; + const int64_t word_len, + const int64_t char_cnt) override; virtual int64_t get_add_word_count() const override { return ith_word_; } VIRTUAL_TO_STRING_KV(K_(ith_word)); private: + bool is_min_max_word(const int64_t c_len) const; + int casedown_word(const ObFTWord &src, ObFTWord &dst); + ObCollationType collation_type_; + common::ObIAllocator &allocator_; const char *words_[TEST_WORD_COUNT]; const char *words_without_stopword_[TEST_WORD_COUNT_WITHOUT_STOPWORD]; int64_t ith_word_; @@ -98,26 +105,57 @@ private: const char *ObTestAddWord::TEST_FULLTEXT = "OceanBase fulltext search is No.1 in the world."; -ObTestAddWord::ObTestAddWord() - : words_{"oceanbase", "fulltext", "search", "the", "world"}, +ObTestAddWord::ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator) + : collation_type_(type), + allocator_(allocator), + words_{"oceanbase", "fulltext", "search", "the", "world"}, 
words_without_stopword_{"oceanbase", "fulltext", "search", "world"}, ith_word_(0) { } +bool ObTestAddWord::is_min_max_word(const int64_t c_len) const +{ + return c_len < FT_MIN_WORD_LEN || c_len > FT_MAX_WORD_LEN; +} + +int ObTestAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(src.empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid src ft word", K(ret), K(src)); + } else { + ObString dst_str; + if (OB_FAIL(ObCharset::tolower(collation_type_, src.get_word(), dst_str, allocator_))) { + LOG_WARN("fail to tolower", K(ret), K(src), K(collation_type_)); + } else { + ObFTWord tmp(dst_str.length(), dst_str.ptr(), collation_type_); + dst = tmp; + } + } + return ret; +} + int ObTestAddWord::operator()( lib::ObFTParserParam *param, const char *word, - const int64_t word_len) + const int64_t word_len, + const int64_t char_cnt) { int ret = OB_SUCCESS; - if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) { + ObFTWord src_word(word_len, word, collation_type_); + ObFTWord dst_word; + if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len)); - } else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], word, word_len))) { + LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len), K(char_cnt)); + } else if (is_min_max_word(char_cnt)) { + // skip min/max word + } else if (OB_FAIL(casedown_word(src_word, dst_word))) { + LOG_WARN("fail to casedown word", K(ret), K(src_word)); + } else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], dst_word.get_word().ptr(), dst_word.get_word().length()))) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]), - KCSTRING(word), K(word_len)); + LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]), K(dst_word)); } else { 
++ith_word_; } @@ -136,17 +174,17 @@ public: private: lib::ObPluginParam plugin_param_; lib::ObFTParserParam ft_parser_param_; - ObTestAddWord add_word_; ObWhiteSpaceFTParserDesc desc_; common::ObArenaAllocator allocator_; + ObTestAddWord add_word_; }; TestDefaultFTParser::TestDefaultFTParser() : plugin_param_(), ft_parser_param_(), - add_word_(), desc_(), - allocator_() + allocator_(), + add_word_(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_) { plugin_param_.desc_ = &desc_; } @@ -190,7 +228,8 @@ TEST_F(TestDefaultFTParser, test_space_ft_parser_segment) TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268) { common::ObArray words; - ObNoStopWordAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, allocator_, words); + ObAddWordFlag flag; + ObAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, flag, allocator_, words); const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 "; const int64_t ft_len = strlen(fulltext); @@ -291,7 +330,7 @@ void ObTestFTPluginHelper::TearDown() // ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc)); // ASSERT_TRUE(nullptr != desc); // -// ObTestAddWord test_add_word; +// ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_); // ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_vserion*/, desc, cs_, TEST_FULLTEXT, // strlen(TEST_FULLTEXT), allocator_, test_add_word)); //} @@ -326,7 +365,7 @@ void ObTestFTPluginHelper::TearDown() // ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc)); // ASSERT_TRUE(nullptr != desc); // -// ObTestAddWord test_add_word; +// ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_); // ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_vserion*/, desc, cs_, TEST_FULLTEXT, // strlen(TEST_FULLTEXT), allocator_, test_add_word)); // @@ -408,7 +447,7 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext) ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, 
ObTestAddWord::TEST_FULLTEXT, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); - ObTestAddWord test_add_word; + ObTestAddWord test_add_word(cs_type_, allocator_); for (int64_t i = 0; i < words.count(); ++i) { ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length())); } @@ -514,7 +553,7 @@ const char *ObTestNgramFTParseHelper::name_ = "ngram.1"; ObTestNgramFTParseHelper::ObTestNgramFTParseHelper() : plugin_name_(STRLEN(name_), name_), - ngram_words_{"Oc", "ce", "ea", "an", "nB", "Ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "No", "in", "th", "he", "wo", "or", "rl", "ld"}, + ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"}, cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN), allocator_() {