From 89e696f309421bcae0c2b8f786729bb63e0d55d0 Mon Sep 17 00:00:00 2001 From: Tyshawn Date: Mon, 17 Jun 2024 09:38:01 +0000 Subject: [PATCH] [FTS] Adjust plugin tokenizer interface for fulltext search --- deps/oblib/src/lib/ob_plugin.h | 57 ++--- src/sql/das/ob_das_domain_utils.cpp | 30 +-- src/sql/das/ob_das_domain_utils.h | 13 +- src/sql/das/ob_text_retrieval_op.cpp | 14 +- src/storage/fts/ob_beng_ft_parser.cpp | 129 +++++----- src/storage/fts/ob_beng_ft_parser.h | 38 +-- .../fts/ob_fts_buildin_parser_register.ipp | 44 ++-- src/storage/fts/ob_fts_plugin_helper.cpp | 80 +++++-- src/storage/fts/ob_fts_plugin_helper.h | 17 +- src/storage/fts/ob_fts_stop_word.cpp | 48 +++- src/storage/fts/ob_fts_stop_word.h | 19 +- src/storage/fts/ob_fts_struct.h | 10 +- src/storage/fts/ob_ngram_ft_parser.cpp | 152 ++++++++---- src/storage/fts/ob_ngram_ft_parser.h | 34 ++- src/storage/fts/ob_whitespace_ft_parser.cpp | 142 ++++++++--- src/storage/fts/ob_whitespace_ft_parser.h | 33 ++- unittest/storage/mock_ft_parser.cpp | 2 +- unittest/storage/mock_ft_parser.h | 4 +- unittest/storage/test_fts_plugin.cpp | 222 ++++++++++++------ 19 files changed, 676 insertions(+), 412 deletions(-) diff --git a/deps/oblib/src/lib/ob_plugin.h b/deps/oblib/src/lib/ob_plugin.h index 676b31b331..a64d5e98b7 100644 --- a/deps/oblib/src/lib/ob_plugin.h +++ b/deps/oblib/src/lib/ob_plugin.h @@ -114,7 +114,7 @@ enum class ObPluginType : uint64_t // define plugin license enum class ObPluginLicenseType : uint64_t { - OB_MULAN_V2_LICENSE = 1, // Mulan PubL v2 license + OB_Mulan_PubL_V2_LICENSE = 1, // Mulan PubL v2 license OB_MAX_PLUGIN_LICENSE_TYPE = 2, // max plugin license type }; @@ -186,7 +186,7 @@ public: && nullptr != author_ && nullptr != spec_ && PLUGIN_VERSION == version_ - && (ObPluginLicenseType::OB_MULAN_V2_LICENSE <= license_ + && (ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE <= license_ && license_ < ObPluginLicenseType::OB_MAX_PLUGIN_LICENSE_TYPE) && nullptr != desc_; } @@ -217,24 +217,9 @@ public: class ObFTParserParam final { -public: - class ObIAddWord - { - public: - ObIAddWord() = default; - virtual ~ObIAddWord() = default; - virtual int operator()( - ObFTParserParam *param, - const char *word, - const int64_t word_len, - const int64_t char_cnt) = 0; - virtual int64_t get_add_word_count() const = 0; - DECLARE_PURE_VIRTUAL_TO_STRING; - }; public: ObFTParserParam() : allocator_(nullptr), - add_word_(nullptr), cs_(nullptr), fulltext_(nullptr), ft_length_(0), @@ -245,36 +230,42 @@ public: inline bool is_valid() const { return nullptr != allocator_ - && nullptr != add_word_ && nullptr != cs_ && nullptr != fulltext_ && 0 < ft_length_ && 0 <= parser_version_; } - inline int add_word(ObFTParserParam *param, const char *word, const int64_t word_len, const int64_t char_cnt) - { - return (*add_word_)(param, word, word_len, char_cnt); - } inline void reset() { allocator_ = nullptr; - add_word_ = nullptr; cs_ = nullptr; fulltext_ = nullptr; ft_length_ = 0; parser_version_ = 0; } - TO_STRING_KV(KP_(allocator), KP_(add_word), KP_(cs), K_(fulltext), K_(ft_length), K_(parser_version)); + TO_STRING_KV(KP_(allocator), KP_(cs), K_(fulltext), K_(ft_length), K_(parser_version)); public: common::ObIAllocator *allocator_; - ObIAddWord *add_word_; const ObCharsetInfo *cs_; const char *fulltext_; int64_t ft_length_; int64_t parser_version_; }; +class ObITokenIterator +{ +public: + ObITokenIterator() = default; + virtual ~ObITokenIterator() = default; + virtual int get_next_token( + const char *&word, + int64_t &word_len, + int64_t &char_cnt, + int64_t &word_freq) = 0; + DECLARE_PURE_VIRTUAL_TO_STRING; +}; + // fulltext parser descriptor interface for domain index // - splitting a document into many tokenizations. class ObIFTParserDesc : public ObIPluginDesc @@ -286,12 +277,22 @@ public: /** * split fulltext into multiple word segments * - * @param[in] fulltext, the document to be tokenized. - * @param[out] words, the word segmentation after splitting. + * @param[in] param, the document to be tokenized and parameters related to word segmentation. + * @param[out] iter, the tokenized words' iterator. * * @return error code, such as, OB_SUCCESS, OB_INVALID_ARGUMENT, ... */ - virtual int segment(ObFTParserParam *param) const = 0; + virtual int segment(ObFTParserParam *param, ObITokenIterator *&iter) const = 0; + + /** + * Release resources held by the iterator and free token iterator. + */ + virtual void free_token_iter(ObFTParserParam *param, ObITokenIterator *&iter) const + { + if (OB_NOT_NULL(iter)) { + iter->~ObITokenIterator(); + } + } }; } // end namespace lib diff --git a/src/sql/das/ob_das_domain_utils.cpp b/src/sql/das/ob_das_domain_utils.cpp index d2123b1977..9309c6f390 100644 --- a/src/sql/das/ob_das_domain_utils.cpp +++ b/src/sql/das/ob_das_domain_utils.cpp @@ -207,30 +207,16 @@ int ObDASDomainUtils::generate_spatial_index_rows( ObFTWordMap &words_count) { int ret = OB_SUCCESS; - common::ObSEArray words; if (OB_ISNULL(helper) || OB_UNLIKELY(ObCollationType::CS_TYPE_INVALID == type || ObCollationType::CS_TYPE_EXTENDED_MARK < type) || OB_UNLIKELY(!words_count.created())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid arguments", K(ret), KPC(helper), K(type), K(words_count.created())); - } else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words))) { + } else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words_count))) { LOG_WARN("fail to segment", K(ret), KPC(helper), K(type), K(fulltext)); - } else { - for (int64_t i = 0; OB_SUCC(ret) && i < words.count(); ++i) { - const ObFTWord &ft_word = words.at(i); - int64_t word_count = 0; - if (OB_FAIL(words_count.get_refactored(ft_word, word_count)) && OB_HASH_NOT_EXIST != ret) { - LOG_WARN("fail to get ft word", K(ret), K(ft_word)); - } else { - word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count; - if (OB_FAIL(words_count.set_refactored(ft_word, word_count, 1/*overwrite*/))) { - LOG_WARN("fail to set ft word and count", K(ret), K(ft_word)); - } - } - } } - STORAGE_FTS_LOG(DEBUG, "segment and calc word count", K(ret), K(words), K(type)); + STORAGE_FTS_LOG(DEBUG, "segment and calc word count", K(ret), K(words_count.size()), K(type)); return ret; } @@ -484,6 +470,7 @@ void ObDomainDMLIterator::reset() row_projector_ = nullptr; das_ctdef_ = nullptr; main_ctdef_ = nullptr; + allocator_.reset(); } void ObDomainDMLIterator::set_ctdef( @@ -520,10 +507,12 @@ int ObDomainDMLIterator::get_next_domain_row(ObNewRow *&row) while (OB_SUCC(ret) && !got_row) { if (row_idx_ >= rows_.count()) { rows_.reuse(); + allocator_.reuse(); row_idx_ = 0; if (OB_UNLIKELY(!das_ctdef_->table_param_.get_data_table().is_domain_index())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("unexpected error, not domain index", K(ret), K(das_ctdef_->table_param_.get_data_table())); + } else if (FAILEDx(write_iter_.get_next_row(sr))) { if (OB_ITER_END != ret) { LOG_WARN("get next row from result iterator failed", K(ret)); @@ -562,6 +551,7 @@ int ObDomainDMLIterator::get_next_domain_rows(ObNewRow *&row, int64_t &row_count while (OB_SUCC(ret) && !got_row) { if (row_idx_ >= rows_.count()) { rows_.reuse(); + allocator_.reuse(); row_idx_ = 0; if (OB_UNLIKELY(!das_ctdef_->table_param_.get_data_table().is_domain_index())) { ret = OB_ERR_UNEXPECTED; @@ -757,7 +747,7 @@ int ObFTDMLIterator::get_ft_and_doc_id( const ObChunkDatumStore::StoredRow *store_row, ObString &doc_id, ObString &ft, - common::ObObjMeta &ft_meta) const + common::ObObjMeta &ft_meta) { int ret = OB_SUCCESS; const uint64_t doc_id_col_id = das_ctdef_->table_param_.get_data_table().get_doc_id_col_id(); @@ -793,7 +783,7 @@ int ObFTDMLIterator::get_ft_and_doc_id_for_update( const ObChunkDatumStore::StoredRow *store_row, ObString &doc_id, ObString &ft, - common::ObObjMeta &ft_meta) const + common::ObObjMeta &ft_meta) { int ret = OB_SUCCESS; const uint64_t rowkey_col_cnt = das_ctdef_->table_param_.get_data_table().get_rowkey_column_num(); @@ -863,7 +853,7 @@ int ObMultivalueDMLIterator::get_multivlaue_json_data( const ObChunkDatumStore::StoredRow *store_row, int64_t& multivalue_idx, int64_t& multivalue_arr_idx, - ObString &multivalue_data) const + ObString &multivalue_data) { int ret = OB_SUCCESS; multivalue_idx = OB_INVALID_ID; @@ -910,7 +900,7 @@ int ObMultivalueDMLIterator::get_multivlaue_json_data_for_update( const ObChunkDatumStore::StoredRow *store_row, int64_t& multivalue_idx, int64_t& multivalue_arr_idx, - ObString &multivalue_data) const + ObString &multivalue_data) { int ret = OB_SUCCESS; bool found = false; diff --git a/src/sql/das/ob_das_domain_utils.h b/src/sql/das/ob_das_domain_utils.h index 95d61b5073..63568f2bc0 100644 --- a/src/sql/das/ob_das_domain_utils.h +++ b/src/sql/das/ob_das_domain_utils.h @@ -13,6 +13,7 @@ #ifndef OCEANBASE_DAS_DOMAIN_UTILS_H #define OCEANBASE_DAS_DOMAIN_UTILS_H +#include "lib/allocator/page_arena.h" #include "lib/hash/ob_hashset.h" #include "sql/das/ob_das_dml_ctx_define.h" #include "storage/fts/ob_fts_plugin_helper.h" @@ -56,8 +57,6 @@ public: const IntFixedArray &row_projector, const ObDASWriteBuffer::DmlRow &dml_row, ObDomainIndexRow &domain_rows); -private: - typedef common::hash::ObHashMap ObFTWordMap; private: static int segment_and_calc_word_count( common::ObIAllocator &allocator, @@ -126,7 +125,7 @@ protected: ObDASWriteBuffer::Iterator &write_iter_; const ObDASDMLBaseCtDef *das_ctdef_; const ObDASDMLBaseCtDef *main_ctdef_; - common::ObIAllocator &allocator_; + common::ObArenaAllocator allocator_; bool is_update_; private: DISALLOW_COPY_AND_ASSIGN(ObDomainDMLIterator); @@ -178,13 +177,13 @@ private: const ObChunkDatumStore::StoredRow *store_row, int64_t& multivalue_idx, int64_t& multivalue_arr_idx, - ObString &multivalue_data) const; + ObString &multivalue_data); int get_multivlaue_json_data_for_update( const ObChunkDatumStore::StoredRow *store_row, int64_t& multivalue_idx, int64_t& multivalue_arr_idx, - ObString &multivalue_data) const; + ObString &multivalue_data); }; @@ -214,12 +213,12 @@ protected: const ObChunkDatumStore::StoredRow *store_row, ObString &doc_id, ObString &ft, - common::ObObjMeta &ft_meta) const; + common::ObObjMeta &ft_meta); int get_ft_and_doc_id_for_update( const ObChunkDatumStore::StoredRow *store_row, ObString &doc_id, ObString &ft, - common::ObObjMeta &ft_meta) const; + common::ObObjMeta &ft_meta); private: storage::ObFTParseHelper ft_parse_helper_; diff --git a/src/sql/das/ob_text_retrieval_op.cpp b/src/sql/das/ob_text_retrieval_op.cpp index f951591dd3..40ca0ad66b 100644 --- a/src/sql/das/ob_text_retrieval_op.cpp +++ b/src/sql/das/ob_text_retrieval_op.cpp @@ -312,21 +312,9 @@ int ObTextRetrievalMerge::init_query_tokens(const ObDASIRScanCtDef *ir_ctdef, Ob } else if (OB_FAIL(token_map.create(ft_word_bkt_cnt, common::ObMemAttr(MTL_ID(), "FTWordMap")))) { LOG_WARN("failed to create token map", K(ret)); } else if (OB_FAIL(tokenize_helper.segment( - cs_type, search_text_string.ptr(), search_text_string.length(), doc_length, tokens))) { + cs_type, search_text_string.ptr(), search_text_string.length(), doc_length, token_map))) { LOG_WARN("failed to segment"); } else { - for (int64_t i = 0; OB_SUCC(ret) && i < tokens.count(); ++i) { - const ObFTWord &token = tokens.at(i); - int64_t word_count = 0; - if (OB_FAIL(token_map.get_refactored(token, word_count)) && OB_HASH_NOT_EXIST != ret) { - LOG_WARN("fail to get ft word", K(ret), K(token)); - } else { - word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count; - if (OB_FAIL(token_map.set_refactored(token, word_count, 1/*overwrite*/))) { - LOG_WARN("fail to set ft word and count", K(ret), K(token)); - } - } - } for (hash::ObHashMap::const_iterator iter = token_map.begin(); OB_SUCC(ret) && iter != token_map.end(); ++iter) { diff --git a/src/storage/fts/ob_beng_ft_parser.cpp b/src/storage/fts/ob_beng_ft_parser.cpp index 446915b73a..6c84bbce62 100644 --- a/src/storage/fts/ob_beng_ft_parser.cpp +++ b/src/storage/fts/ob_beng_ft_parser.cpp @@ -22,70 +22,43 @@ namespace oceanbase namespace storage { -/*static*/ int ObBEngFTParser::segment( - lib::ObFTParserParam *param, - const char *ft, - const int64_t ft_len) -{ - int ret = OB_SUCCESS; - ObDatum doc; - doc.set_string(ft, ft_len); - ObBEngFTParser parser; - share::ObITokenStream *token_stream = nullptr; - if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len)); - } else if (OB_FAIL(parser.init(param))) { - LOG_WARN("fail to initialize basic english parser", K(ret), KPC(param)); - } else if (FALSE_IT(doc.set_string(ft, ft_len))) { - } else if (OB_FAIL(parser.segment(doc, token_stream))) { - LOG_WARN("fail to segment fulltext by parser", K(ret), KP(ft), K(ft_len)); - } else if (OB_ISNULL(token_stream)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("token stream is nullptr", K(ret), KP(token_stream)); - } else { - ObDatum token; - int64_t token_freq = 0; - while (OB_SUCC(ret)) { - if (OB_FAIL(token_stream->get_next(token, token_freq))) { - if (OB_ITER_END != ret) { - LOG_WARN("fail to get next token", K(ret), KPC(token_stream)); - } - } else if (OB_FAIL(add_word(param, param->allocator_, token.ptr_, token.len_))) { - LOG_WARN("fail to add word", K(ret), K(token), KPC(param)); - } - } - if (OB_ITER_END == ret) { - ret = OB_SUCCESS; - } - } - return ret; -} - -/*static*/ int ObBEngFTParser::add_word( - lib::ObFTParserParam *param, - common::ObIAllocator *allocator, - const char *word, - int64_t word_len) +int ObBEngFTParser::get_next_token( + const char *&word, + int64_t &word_len, + int64_t &char_len, + int64_t &word_freq) { int ret = OB_SUCCESS; + ObDatum token; + int64_t token_freq = 0; char *buf = nullptr; - if (OB_ISNULL(param) - || OB_ISNULL(allocator) - || OB_ISNULL(word) - || OB_UNLIKELY(0 >= word_len)) { + word = nullptr; + word_len = 0; + char_len = 0; + word_freq = 0; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("beng ft parser isn't initialized", K(ret), K(is_inited_)); + } else if (OB_ISNULL(token_stream_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("token stream is nullptr", K(ret), KP(token_stream_)); + } else if (OB_FAIL(token_stream_->get_next(token, token_freq))) { + if (OB_ITER_END != ret) { + LOG_WARN("fail to get next token", K(ret), KPC(token_stream_)); + } + } else if (OB_ISNULL(token.ptr_) || OB_UNLIKELY(0 >= token.len_ || 0 >= token_freq)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KPC(param), KP(allocator), KP(word), K(word_len)); - } else if (word_len < FT_MIN_WORD_LEN || word_len > FT_MAX_WORD_LEN) { - LOG_DEBUG("skip too small or large word", K(ret), K(word_len)); - } else if (OB_ISNULL(buf = static_cast(allocator->alloc(word_len)))) { + LOG_WARN("invalid arguments", K(ret), KP(token.ptr_), K(token.len_), K(token_freq)); + } else if (OB_ISNULL(buf = static_cast(allocator_.alloc(token.len_)))) { ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("fail to allocate word memory", K(ret), K(word_len)); - } else if (FALSE_IT(MEMCPY(buf, word, word_len))) { - } else if (OB_FAIL(param->add_word(param, buf, word_len, word_len))) { - LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, buf)), K(ObString(word_len, word))); + LOG_WARN("fail to allocate word memory", K(ret), K(token.len_)); } else { - LOG_DEBUG("succeed to add word", K(ObString(word_len, word))); + MEMCPY(buf, token.ptr_, token.len_); + word = buf; + word_len = token.len_; + char_len = token.len_; + word_freq = token_freq; + LOG_DEBUG("succeed to add word", K(ObString(word_len, word)), K(word_freq)); } return ret; } @@ -103,13 +76,20 @@ int ObBEngFTParser::init(lib::ObFTParserParam *param) ret = OB_NOT_SUPPORTED; LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_)); } else { + doc_.set_string(param->fulltext_, param->ft_length_); analysis_ctx_.cs_ = param->cs_; analysis_ctx_.filter_stopword_ = false; analysis_ctx_.need_grouping_ = false; if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) { LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_)); + } else if (OB_FAIL(segment(doc_, token_stream_))) { + LOG_WARN("fail to segment fulltext by parser", K(ret), KP(param->fulltext_), K(param->ft_length_)); + } else if (OB_ISNULL(token_stream_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("token stream is nullptr", K(ret), KP(token_stream_)); } else { is_inited_ = true; + LOG_DEBUG("succeed to init beng parser", K(ret), K(english_analyzer_), KPC(token_stream_), K(doc_)); } } if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) { @@ -139,6 +119,8 @@ void ObBEngFTParser::reset() { analysis_ctx_.reset(); english_analyzer_.reset(); + doc_.reset(); + token_stream_ = nullptr; is_inited_ = false; } @@ -159,20 +141,43 @@ int ObBasicEnglishFTParserDesc::deinit(lib::ObPluginParam *param) return OB_SUCCESS; } -int ObBasicEnglishFTParserDesc::segment(lib::ObFTParserParam *param) const +int ObBasicEnglishFTParserDesc::segment( + lib::ObFTParserParam *param, + lib::ObITokenIterator *&iter) const { int ret = OB_SUCCESS; + void *buf = nullptr; if (OB_UNLIKELY(!is_inited_)) { ret = OB_NOT_INIT; LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_)); } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), KPC(param)); - } else if (OB_FAIL(ObBEngFTParser::segment(param, param->fulltext_, param->ft_length_))) { - LOG_WARN("fail to segment words for fulltext by beng", K(ret), KPC(param), - K(param->fulltext_), K(param->ft_length_)); + } else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObBEngFTParser)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("fail to allocate basic english ft parser", K(ret)); + } else { + ObBEngFTParser *parser = new (buf) ObBEngFTParser(*(param->allocator_)); + if (OB_FAIL(parser->init(param))) { + LOG_WARN("fail to init basic english parser", K(ret), KPC(param)); + } else { + iter = parser; + } } return ret; + return ret; +} + +void ObBasicEnglishFTParserDesc::free_token_iter( + lib::ObFTParserParam *param, + lib::ObITokenIterator *&iter) const +{ + if (OB_NOT_NULL(iter)) { + abort_unless(nullptr != param); + abort_unless(nullptr != param->allocator_); + iter->~ObITokenIterator(); + param->allocator_->free(iter); + } } } // end namespace storage diff --git a/src/storage/fts/ob_beng_ft_parser.h b/src/storage/fts/ob_beng_ft_parser.h index 1f83d1d868..55869526d6 100644 --- a/src/storage/fts/ob_beng_ft_parser.h +++ b/src/storage/fts/ob_beng_ft_parser.h @@ -23,40 +23,41 @@ namespace oceanbase namespace storage { -class ObBEngFTParser final +class ObBEngFTParser final : public lib::ObITokenIterator { public: static const int64_t FT_MIN_WORD_LEN = 3; static const int64_t FT_MAX_WORD_LEN = 84; public: - static int segment( - lib::ObFTParserParam *param, - const char *fulltext, - const int64_t ft_len); - -private: - ObBEngFTParser() - : analysis_ctx_(), + explicit ObBEngFTParser(common::ObIAllocator &allocator) + : allocator_(allocator), + analysis_ctx_(), english_analyzer_(), + doc_(), + token_stream_(nullptr), is_inited_(false) {} - ~ObBEngFTParser() = default; + ~ObBEngFTParser() { reset(); } - static int add_word( - lib::ObFTParserParam *param, - common::ObIAllocator *allocator, - const char *word, - int64_t word_len); int init(lib::ObFTParserParam *param); void reset(); + virtual int get_next_token( + const char *&word, + int64_t &word_len, + int64_t &char_len, + int64_t &word_freq) override; + + VIRTUAL_TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), KP_(token_stream), K_(is_inited)); +private: int segment( const common::ObDatum &doc, share::ObITokenStream *&token_stream); - TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), K_(is_inited)); - private: + common::ObIAllocator &allocator_; share::ObTextAnalysisCtx analysis_ctx_; share::ObEnglishTextAnalyzer english_analyzer_; + common::ObDatum doc_; + share::ObITokenStream *token_stream_; bool is_inited_; DISALLOW_COPY_AND_ASSIGN(ObBEngFTParser); @@ -69,7 +70,8 @@ public: virtual ~ObBasicEnglishFTParserDesc() = default; virtual int init(lib::ObPluginParam *param) override; virtual int deinit(lib::ObPluginParam *param) override; - virtual int segment(lib::ObFTParserParam *param) const override; + virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; + virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; OB_INLINE void reset() { is_inited_ = false; } private: bool is_inited_; diff --git a/src/storage/fts/ob_fts_buildin_parser_register.ipp b/src/storage/fts/ob_fts_buildin_parser_register.ipp index 8f38ba4bc1..9e27bcb906 100644 --- a/src/storage/fts/ob_fts_buildin_parser_register.ipp +++ b/src/storage/fts/ob_fts_buildin_parser_register.ipp @@ -21,13 +21,13 @@ OB_DECLARE_PLUGIN(whitespace_parser) { - oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type - "space", // name - OB_PLUGIN_AUTHOR_OCEANBASE, // author - "This is a default whitespace parser plugin.", // brief specification - 0x00001, // version - oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license - &oceanbase::storage::whitespace_parser, // default space parser plugin instance + oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type + "space", // name + OB_PLUGIN_AUTHOR_OCEANBASE, // author + "This is a default whitespace parser plugin.", // brief specification + 0x00001, // version + oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license + &oceanbase::storage::whitespace_parser, // default space parser plugin instance }; OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInWhitespaceFTParser, whitespace_parser); @@ -36,28 +36,28 @@ OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInWhitespaceFTParser, whitespace_parser OB_DECLARE_PLUGIN(ngram_parser) { - oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type - "ngram", // name - OB_PLUGIN_AUTHOR_OCEANBASE, // author - "This is a ngram fulltext parser plugin.", // brief specification - 0x00001, // version - oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license - &oceanbase::storage::ngram_parser, // ngram parser plugin instance + oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type + "ngram", // name + OB_PLUGIN_AUTHOR_OCEANBASE, // author + "This is a ngram fulltext parser plugin.", // brief specification + 0x00001, // version + oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license + &oceanbase::storage::ngram_parser, // ngram parser plugin instance }; OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInNgramFTParser, ngram_parser); -///////////////////////////////////// Default fulltext parser ////////////////////////////////////////// +///////////////////////////////////// BEng fulltext parser ////////////////////////////////////////// OB_DECLARE_PLUGIN(beng_parser) { - oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type - "beng", // name - OB_PLUGIN_AUTHOR_OCEANBASE, // author - "This is a basic english parser plugin.", // brief specification - 0x00001, // version - oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license - &oceanbase::storage::beng_parser, // default space parser plugin instance + oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type + "beng", // name + OB_PLUGIN_AUTHOR_OCEANBASE, // author + "This is a basic english parser plugin.", // brief specification + 0x00001, // version + oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license + &oceanbase::storage::beng_parser, // basic english parser plugin instance }; OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInBEngFTParser, beng_parser); diff --git a/src/storage/fts/ob_fts_plugin_helper.cpp b/src/storage/fts/ob_fts_plugin_helper.cpp index e972758d1d..9a44e991f2 100644 --- a/src/storage/fts/ob_fts_plugin_helper.cpp +++ b/src/storage/fts/ob_fts_plugin_helper.cpp @@ -119,7 +119,7 @@ int ObFTParseHelper::segment( const char *ft, const int64_t ft_len, common::ObIAllocator &allocator, - lib::ObFTParserParam::ObIAddWord &add_word) + ObAddWord &add_word) { int ret = OB_SUCCESS; if (OB_UNLIKELY(parser_version < 0 || nullptr == parser_desc || nullptr == cs || nullptr == ft || 0 >= ft_len)) { @@ -127,14 +127,38 @@ int ObFTParseHelper::segment( LOG_WARN("invalid arguments", K(ret), K(parser_version), KP(parser_desc), KP(cs), K(ft), K(ft_len)); } else { lib::ObFTParserParam param; + lib::ObITokenIterator *iter = nullptr; param.allocator_ = &allocator; - param.add_word_ = &add_word; param.cs_ = cs; param.fulltext_ = ft; param.ft_length_ = ft_len; param.parser_version_ = parser_version; - if (OB_FAIL(parser_desc->segment(¶m))) { + if (OB_FAIL(parser_desc->segment(¶m, iter))) { LOG_WARN("fail to segment", K(ret), K(param)); + } else if (OB_ISNULL(iter)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected error, token iterator is nullptr", K(ret), KP(iter)); + } else { + const char *word = nullptr; + int64_t word_len = 0; + int64_t char_cnt = 0; + int64_t word_freq = 0; + while (OB_SUCC(ret)) { + if (OB_FAIL(iter->get_next_token(word, word_len, char_cnt, word_freq))) { + if (OB_ITER_END != ret) { + LOG_WARN("fail to get next token", K(ret), KPC(iter)); + } + } else if (OB_FAIL(add_word.process_word(word, word_len, char_cnt, word_freq))) { + LOG_WARN("fail to process one word", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq)); + } + } + if (OB_ITER_END == ret) { + ret = OB_SUCCESS; + } + } + if (OB_NOT_NULL(iter)) { + parser_desc->free_token_iter(¶m, iter); + iter = nullptr; } } return ret; @@ -176,11 +200,10 @@ int ObFTParseHelper::init( LOG_WARN("unexpected error, parse handler is nullptr", K(ret), KP(parse_handler)); } else if (OB_FAIL(get_fulltext_parser_desc(*parse_handler, parser_desc_))) { LOG_WARN("fail to get fulltext parser descriptor", K(ret), KPC(parse_handler)); + } else if (OB_FAIL(set_add_word_flag(parser_name_))) { + LOG_WARN("fail to set add word flag", K(ret), K(parser_name_)); } else { plugin_param_.desc_ = parser_desc_; - if (need_min_max_word(parser_name_)) { add_word_flag_.set_min_max_word(); } - if (need_castdn(parser_name_)) { add_word_flag_.set_casedown(); } - if (need_stopword_list(parser_name_)) { add_word_flag_.set_stop_word(); } allocator_ = allocator; is_inited_ = true; } @@ -204,7 +227,7 @@ int ObFTParseHelper::segment( const char *fulltext, const int64_t fulltext_len, int64_t &doc_length, - common::ObIArray &words) const + ObFTWordMap &words) const { int ret = OB_SUCCESS; const ObCharsetInfo *cs = nullptr; @@ -231,29 +254,34 @@ int ObFTParseHelper::segment( doc_length = add_word.get_add_word_count(); } } - LOG_DEBUG("ft parse segment", K(ret), K(type), K(ObString(fulltext_len, fulltext)), K(words)); + LOG_DEBUG("ft parse segment", K(ret), K(type), K(add_word_flag_), K(parser_name_), + K(ObString(fulltext_len, fulltext)), K(words.size())); return ret; } -bool ObFTParseHelper::need_stopword_list(const ObFTParser &parser) +int ObFTParseHelper::set_add_word_flag(const ObFTParser &parser) { - share::ObPluginName space("space"); - share::ObPluginName beng("beng"); - return parser.get_parser_name() == space || parser.get_parser_name() == beng; -} - -bool ObFTParseHelper::need_min_max_word(const ObFTParser &parser) -{ - share::ObPluginName space("space"); - share::ObPluginName beng("beng"); - return parser.get_parser_name() == space || parser.get_parser_name() == beng; -} - -bool ObFTParseHelper::need_castdn(const ObFTParser &parser) -{ - share::ObPluginName space("space"); - share::ObPluginName ngram("ngram"); - return parser.get_parser_name() == space || parser.get_parser_name() == ngram; + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!parser.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arguments", K(ret), K(parser)); + } else if (share::ObPluginName("space") == parser.get_parser_name()) { + add_word_flag_.set_min_max_word(); + add_word_flag_.set_stop_word(); + add_word_flag_.set_casedown(); + add_word_flag_.set_groupby_word(); + } else if (share::ObPluginName("beng") == parser.get_parser_name()) { + add_word_flag_.set_min_max_word(); + add_word_flag_.set_stop_word(); + add_word_flag_.set_groupby_word(); + } else if (share::ObPluginName("ngram") == parser.get_parser_name()) { + add_word_flag_.set_casedown(); + add_word_flag_.set_groupby_word(); + } else { + ret = OB_NOT_SUPPORTED; + LOG_WARN("unsupported parser for fulltext search", K(ret), K(parser)); + } + return ret; } } // end namespace storage diff --git a/src/storage/fts/ob_fts_plugin_helper.h b/src/storage/fts/ob_fts_plugin_helper.h index 6dcb07306d..74334692c0 100644 --- a/src/storage/fts/ob_fts_plugin_helper.h +++ b/src/storage/fts/ob_fts_plugin_helper.h @@ -25,6 +25,8 @@ namespace oceanbase namespace storage { +class ObAddWord; + class ObFTParser final { public: @@ -89,7 +91,7 @@ public: const char *fulltext, const int64_t fulltext_len, int64_t &doc_length, - common::ObIArray &words) const; + ObFTWordMap &words) const; const ObFTParser &get_parser_name() const { return parser_name_; } void reset(); @@ -105,17 +107,8 @@ private: const char *fulltext, const int64_t fulltext_len, common::ObIAllocator &allocator, - lib::ObFTParserParam::ObIAddWord &add_word); - static bool need_stopword_list(const ObFTParser &parser); - static bool need_castdn(const ObFTParser &parser); - static bool need_min_max_word(const ObFTParser &parser); - - int alloc_add_word( - const ObCollationType &type, - common::ObIArray &words, - lib::ObFTParserParam::ObIAddWord *&add_word) const; - void free_add_word(lib::ObFTParserParam::ObIAddWord *&add_word) const; - + ObAddWord &add_word); + int set_add_word_flag(const ObFTParser &parser); private: lib::ObPluginParam plugin_param_; common::ObIAllocator *allocator_; diff --git a/src/storage/fts/ob_fts_stop_word.cpp b/src/storage/fts/ob_fts_stop_word.cpp index ae03a79d76..dbe4a3ca66 100644 --- a/src/storage/fts/ob_fts_stop_word.cpp +++ b/src/storage/fts/ob_fts_stop_word.cpp @@ -24,10 +24,10 @@ ObAddWord::ObAddWord( const ObCollationType &type, const ObAddWordFlag &flag, common::ObIAllocator &allocator, - common::ObIArray &word) + ObFTWordMap &word_map) : collation_type_(type), allocator_(allocator), - words_(word), + word_map_(word_map), min_max_word_cnt_(0), non_stopword_cnt_(0), stopword_cnt_(0), @@ -35,19 +35,19 @@ ObAddWord::ObAddWord( { } -int ObAddWord::operator()( - lib::ObFTParserParam *param, +int ObAddWord::process_word( const char *word, const int64_t word_len, - const int64_t char_cnt) + const int64_t char_cnt, + const int64_t word_freq) { int ret = OB_SUCCESS; bool is_stopword = false; ObFTWord src_word(word_len, word, collation_type_); ObFTWord dst_word; - if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) { + if (OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt || 0 >= word_freq)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len)); + LOG_WARN("invalid arguments", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq)); } else if (is_min_max_word(char_cnt)) { ++min_max_word_cnt_; LOG_DEBUG("skip too small or large word", K(ret), K(src_word), K(char_cnt)); @@ -58,11 +58,11 @@ int ObAddWord::operator()( } else if (OB_UNLIKELY(is_stopword)) { ++stopword_cnt_; LOG_DEBUG("skip stopword", K(ret), K(dst_word)); - } else if (OB_FAIL(words_.push_back(dst_word))) { - LOG_WARN("fail to push word into words array", K(ret), K(dst_word)); + } else if (OB_FAIL(groupby_word(dst_word, word_freq))) { + LOG_WARN("fail to groupby word into word map", K(ret), K(dst_word), K(word_freq)); } else { - ++non_stopword_cnt_; - LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len), K(char_cnt), K(src_word), K(dst_word)); + non_stopword_cnt_ += word_freq; + LOG_DEBUG("add word", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq), K(src_word), K(dst_word)); } return ret; } @@ -104,5 +104,31 @@ int ObAddWord::check_stopword(const ObFTWord &ft_word, bool &is_stopword) return ret; } +int ObAddWord::groupby_word(const ObFTWord &word, const int64_t word_freq) +{ + int ret = OB_SUCCESS; + int64_t word_count = 0; + if (OB_UNLIKELY(word.empty() || word_freq <= 0)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arguments", K(ret), K(word), K(word_freq)); + } else if (!flag_.groupby_word()) { + if (OB_FAIL(word_map_.set_refactored(word, 1/*word count*/))) { + LOG_WARN("fail to set fulltext word and count", K(ret), K(word)); + } + } else if (OB_FAIL(word_map_.get_refactored(word, word_count)) && OB_HASH_NOT_EXIST != ret) { + LOG_WARN("fail to get fulltext word", K(ret), K(word)); + } else { + if (OB_HASH_NOT_EXIST == ret) { + word_count = 1; + } else { + word_count += word_freq; + } + if (OB_FAIL(word_map_.set_refactored(word, word_count, 1/*overwrite*/))) { + LOG_WARN("fail to set fulltext word and count", K(ret), K(word), K(word_count)); + } + } + return ret; +} + } // end namespace storage } // end namespace oceanbase diff --git a/src/storage/fts/ob_fts_stop_word.h b/src/storage/fts/ob_fts_stop_word.h index 0135bda7e4..ae53b602ee 100644 --- a/src/storage/fts/ob_fts_stop_word.h +++ b/src/storage/fts/ob_fts_stop_word.h @@ -63,23 +63,23 @@ static const char ob_stop_word_list[][FTS_STOP_WORD_MAX_LENGTH] = { "www" }; -class ObAddWord final : public lib::ObFTParserParam::ObIAddWord +class ObAddWord final { public: ObAddWord( const ObCollationType &type, const ObAddWordFlag &flag, common::ObIAllocator &allocator, - common::ObIArray &word); - virtual ~ObAddWord() = default; - virtual int operator()( - lib::ObFTParserParam *param, + ObFTWordMap &word_map); + ~ObAddWord() = default; + int process_word( const char *word, const int64_t word_len, - const int64_t char_cnt) override; - virtual int64_t get_add_word_count() const override { return non_stopword_cnt_; } + const int64_t char_cnt, + const int64_t word_freq); + virtual int64_t get_add_word_count() const { return non_stopword_cnt_; } VIRTUAL_TO_STRING_KV(K_(collation_type), K_(min_max_word_cnt), K_(non_stopword_cnt), K_(stopword_cnt), - K_(words)); + K(word_map_.size())); public: static const int64_t FT_MIN_WORD_LEN = 3; static const int64_t FT_MAX_WORD_LEN = 84; @@ -87,10 +87,11 @@ private: bool is_min_max_word(const int64_t c_len) const; int casedown_word(const ObFTWord &src, ObFTWord &dst); int check_stopword(const ObFTWord &word, bool &is_stopword); + int groupby_word(const ObFTWord &word, const int64_t word_cnt); private: ObCollationType collation_type_; common::ObIAllocator &allocator_; - common::ObIArray &words_; + ObFTWordMap &word_map_; int64_t min_max_word_cnt_; int64_t non_stopword_cnt_; int64_t stopword_cnt_; diff --git a/src/storage/fts/ob_fts_struct.h b/src/storage/fts/ob_fts_struct.h index 939410ca62..45479c1c20 100644 --- a/src/storage/fts/ob_fts_struct.h +++ b/src/storage/fts/ob_fts_struct.h @@ -14,6 +14,7 @@ #define OB_FTS_STRUCT_H_ #include "lib/charset/ob_charset.h" +#include "lib/hash/ob_hashmap.h" namespace oceanbase { @@ -34,7 +35,7 @@ public: hash_val = ObCharset::hash(type_, word_); return common::OB_SUCCESS; } - OB_INLINE uint64_t hash() const { return word_.hash(); } + OB_INLINE uint64_t hash() const { return ObCharset::hash(type_, word_); } OB_INLINE bool empty() const { return word_.empty(); } OB_INLINE bool operator ==(const ObFTWord &other) const @@ -76,6 +77,8 @@ public: int64_t word_cnt_; }; +typedef common::hash::ObHashMap ObFTWordMap; + class ObAddWordFlag final { private: @@ -84,6 +87,7 @@ private: // than a maximum word length. static const uint64_t AWF_STOPWORD = 1 << 1; // filter by sotp word table. static const uint64_t AWF_CASEDOWN = 1 << 2; // convert characters from uppercase to lowercase. + static const uint64_t AWF_GROUPBY_WORD = 1 << 3; // distinct and word aggregation public: ObAddWordFlag() : flag_(AWF_NONE) {} ~ObAddWordFlag() = default; @@ -95,13 +99,17 @@ public: void set_min_max_word() { set_flag(AWF_MIN_MAX_WORD); } void set_stop_word() { set_flag(AWF_STOPWORD); } void set_casedown() { set_flag(AWF_CASEDOWN); } + void set_groupby_word() { set_flag(AWF_GROUPBY_WORD); } void clear() { flag_ = AWF_NONE; } void clear_min_max_word() { clear_flag(AWF_MIN_MAX_WORD); } void clear_stop_word() { clear_flag(AWF_STOPWORD); } void clear_casedown() { clear_flag(AWF_CASEDOWN); } + void clear_groupby_word() { clear_flag(AWF_GROUPBY_WORD); } bool min_max_word() const { return has_flag(AWF_MIN_MAX_WORD); } bool stopword() const { return has_flag(AWF_STOPWORD); } bool casedown() const { return has_flag(AWF_CASEDOWN); } + bool groupby_word() const { return has_flag(AWF_GROUPBY_WORD); } + TO_STRING_KV(K_(flag)); private: uint64_t flag_; }; diff --git a/src/storage/fts/ob_ngram_ft_parser.cpp b/src/storage/fts/ob_ngram_ft_parser.cpp index 1540feef1c..e23b1dc683 100644 --- a/src/storage/fts/ob_ngram_ft_parser.cpp +++ b/src/storage/fts/ob_ngram_ft_parser.cpp @@ -24,24 +24,81 @@ namespace storage #define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_') -/*static*/ int ObNgramFTParser::segment( - lib::ObFTParserParam *param, - const char *fulltext, - const int64_t ft_len) + +ObNgramFTParser::ObNgramFTParser() + : cs_(nullptr), + start_(nullptr), + next_(nullptr), + end_(nullptr), + c_nums_(0), + is_inited_(false) +{} + +ObNgramFTParser::~ObNgramFTParser() +{ + reset(); +} + +void ObNgramFTParser::reset() +{ + cs_ = nullptr; + start_ = nullptr; + next_ = nullptr; + end_ = nullptr; + c_nums_ = 0; + is_inited_ = false; +} + +int ObNgramFTParser::init(lib::ObFTParserParam *param) { int ret = OB_SUCCESS; - int64_t c_nums = 0; - const char *start = fulltext; - const char *next = start; - const char *end = start + ft_len; - if (OB_ISNULL(param) || OB_ISNULL(fulltext) || OB_UNLIKELY(ft_len <= 0)) { + if (OB_UNLIKELY(is_inited_)) { + ret = OB_INIT_TWICE; + LOG_WARN("init twice", K(ret), KPC(param), KPC(this)); + } else if (OB_ISNULL(param) + || OB_ISNULL(param->cs_) + || OB_ISNULL(param->fulltext_) + || OB_UNLIKELY(0 >= param->ft_length_)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KP(param), KP(fulltext), K(ft_len)); + LOG_WARN("invalid arguments", K(ret), KPC(param)); } else { - const ObCharsetInfo *cs = param->cs_; - while (OB_SUCC(ret) && next < end) { + cs_ = param->cs_; + start_ = param->fulltext_; + next_ = start_; + end_ = start_ + param->ft_length_; + c_nums_ = 0; + is_inited_ = true; + } + if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) { + reset(); + } + return ret; +} + +int ObNgramFTParser::get_next_token( + const char *&word, + int64_t &word_len, + int64_t &char_len, + int64_t &word_freq) +{ + int ret = OB_SUCCESS; + word = nullptr; + word_len = 0; + char_len = 0; + word_freq = 0; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("ngram ft parser isn't initialized", K(ret), K(is_inited_)); + } else { + int64_t c_nums = c_nums_; + const char *start = start_; + const char *next = next_; + const char *end = end_; + const ObCharsetInfo *cs = cs_; + do { const int64_t c_len = ob_mbcharlen_ptr(cs, next, end); if (next + c_len > end || 0 == c_len) { // if char is invalid, just skip the rest of doc. + ret = OB_ITER_END; break; } else { int ctype; @@ -50,38 +107,31 @@ namespace storage start = next + 1; next = start; c_nums = 0; + if (next == end) { + ret = OB_ITER_END; + } continue; } next += c_len; ++c_nums; } if (NGRAM_TOKEN_SIZE == c_nums) { - if (OB_FAIL(add_word(param, start, next - start, c_nums))) { - LOG_WARN("fail to add word", K(ret), KP(param), KP(start), KP(next), K(c_nums)); - } else { - start += ob_mbcharlen_ptr(cs, start, end); - c_nums = NGRAM_TOKEN_SIZE - 1; - } + word = start; + word_len = next - start; + char_len = c_nums; + word_freq = 1; + start += ob_mbcharlen_ptr(cs, start, end); + c_nums = NGRAM_TOKEN_SIZE - 1; + break; } + } while (OB_SUCC(ret) && next < end); + if (OB_ITER_END == ret || OB_SUCCESS == ret) { + start_ = start; + next_ = next; + end_ = end; + c_nums_ = c_nums; } - } - return ret; -} - -/*static*/ int ObNgramFTParser::add_word( - lib::ObFTParserParam *param, - const char *word, - const int64_t word_len, - const int64_t char_cnt) -{ - int ret = OB_SUCCESS; - if (OB_ISNULL(param) - || OB_ISNULL(word) - || OB_UNLIKELY(0 >= word_len)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len)); - } else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) { - LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word))); + LOG_DEBUG("next word", K(ret), K(ObString(word_len, word)), KP(start_), KP(next_), KP(end_)); } return ret; } @@ -103,21 +153,43 @@ int ObNgramFTParserDesc::deinit(lib::ObPluginParam *param) return OB_SUCCESS; } -int ObNgramFTParserDesc::segment(lib::ObFTParserParam *param) const +int ObNgramFTParserDesc::segment( + lib::ObFTParserParam *param, + lib::ObITokenIterator *&iter) const { int ret = OB_SUCCESS; + void *buf = nullptr; if (OB_UNLIKELY(!is_inited_)) { ret = OB_NOT_INIT; LOG_WARN("ngram ft parser desc hasn't be initialized", K(ret), K(is_inited_)); } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), KPC(param)); - } else if (OB_FAIL(ObNgramFTParser::segment(param, param->fulltext_, param->ft_length_))) { - LOG_WARN("fail to segment words for fulltext by ngram", K(ret), KPC(param), - K(param->fulltext_), K(param->ft_length_)); + } else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObNgramFTParser)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("fail to allocate ngram ft parser", K(ret)); + } else { + ObNgramFTParser *parser = new (buf) ObNgramFTParser(); + if (OB_FAIL(parser->init(param))) { + LOG_WARN("fail to init ngram fulltext parser", K(ret), KPC(param)); + } else { + iter = parser; + } } return ret; } +void ObNgramFTParserDesc::free_token_iter( + lib::ObFTParserParam *param, + lib::ObITokenIterator *&iter) const +{ + if (OB_NOT_NULL(iter)) { + abort_unless(nullptr != param); + abort_unless(nullptr != param->allocator_); + iter->~ObITokenIterator(); + param->allocator_->free(iter); + } +} + } // end namespace storage } // end namespace oceanbase diff --git a/src/storage/fts/ob_ngram_ft_parser.h b/src/storage/fts/ob_ngram_ft_parser.h index 5cbd931d0e..fb32f1a093 100644 --- a/src/storage/fts/ob_ngram_ft_parser.h +++ b/src/storage/fts/ob_ngram_ft_parser.h @@ -22,23 +22,30 @@ namespace oceanbase namespace storage { -class ObNgramFTParser final +class ObNgramFTParser final : public lib::ObITokenIterator { public: static const int64_t NGRAM_TOKEN_SIZE = 2; // TODO: @jinzhu, please apply one system variable later, and keep the same as mysql. public: - ObNgramFTParser() = default; - ~ObNgramFTParser() = default; - static int segment( - lib::ObFTParserParam *param, - const char *fulltext, - const int64_t ft_len); + ObNgramFTParser(); + virtual ~ObNgramFTParser(); + + int init(lib::ObFTParserParam *param); + void reset(); + virtual int get_next_token( + const char *&word, + int64_t &word_len, + int64_t &char_len, + int64_t &word_freq) override; + + VIRTUAL_TO_STRING_KV(KP_(cs), KP_(start), KP_(next), KP_(end), K_(is_inited)); private: - static int add_word( - lib::ObFTParserParam *param, - const char *word, - const int64_t word_len, - const int64_t char_cnt); + const ObCharsetInfo *cs_; + const char *start_; + const char *next_; + const char *end_; + int64_t c_nums_; + bool is_inited_; private: DISABLE_COPY_ASSIGN(ObNgramFTParser); }; @@ -50,7 +57,8 @@ public: virtual ~ObNgramFTParserDesc() = default; virtual int init(lib::ObPluginParam *param) override; virtual int deinit(lib::ObPluginParam *param) override; - virtual int segment(lib::ObFTParserParam *param) const override; + virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; + virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; OB_INLINE void reset() { is_inited_ = false; } private: bool is_inited_; diff --git a/src/storage/fts/ob_whitespace_ft_parser.cpp b/src/storage/fts/ob_whitespace_ft_parser.cpp index ce21ab766a..403790204f 100644 --- a/src/storage/fts/ob_whitespace_ft_parser.cpp +++ b/src/storage/fts/ob_whitespace_ft_parser.cpp @@ -24,22 +24,74 @@ namespace storage #define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_') -int ObSpaceFTParser::segment( - lib::ObFTParserParam *param, - const char *ft, - const int64_t ft_len) +ObSpaceFTParser::ObSpaceFTParser() + : cs_(nullptr), + start_(nullptr), + next_(nullptr), + end_(nullptr), + is_inited_(false) +{} + +ObSpaceFTParser::~ObSpaceFTParser() +{ + reset(); +} + +void ObSpaceFTParser::reset() +{ + cs_ = nullptr; + start_ = nullptr; + next_ = nullptr; + end_ = nullptr; + is_inited_ = false; +} + +int ObSpaceFTParser::init(lib::ObFTParserParam *param) { int ret = OB_SUCCESS; - const char *start = ft; - const char *next = start; - const char *end = start + ft_len; - int mbl = 0; - if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) { + if (OB_UNLIKELY(is_inited_)) { + ret = OB_INIT_TWICE; + LOG_WARN("init twice", K(ret), KPC(param), KPC(this)); + } else if (OB_ISNULL(param) + || OB_ISNULL(param->cs_) + || OB_ISNULL(param->fulltext_) + || OB_UNLIKELY(0 >= param->ft_length_)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len)); + LOG_WARN("invalid arguments", K(ret), KPC(param)); } else { - const ObCharsetInfo *cs = param->cs_; - while (OB_SUCC(ret) && next < end) { + cs_ = param->cs_; + start_ = param->fulltext_; + next_ = start_; + end_ = start_ + param->ft_length_; + is_inited_ = true; + } + if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) { + reset(); + } + return ret; +} + +int ObSpaceFTParser::get_next_token( + const char *&word, + int64_t &word_len, + int64_t &char_len, + int64_t &word_freq) +{ + int ret = OB_SUCCESS; + int mbl = 0; + word = nullptr; + word_len = 0; + char_len = 0; + word_freq = 0; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("space ft parser isn't initialized", K(ret), K(is_inited_)); + } else { + const char *start = start_; + const char *next = next_; + const char *end = end_; + const ObCharsetInfo *cs = cs_; + do { while (next < end) { int ctype; mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end); @@ -62,34 +114,24 @@ int ObSpaceFTParser::segment( ++c_nums; next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); } - if (0 < c_nums && OB_FAIL(add_word(param, start, next - start, c_nums))) { - LOG_WARN("fail to add word", K(ret), KPC(param), KP(start), K(next)); + if (0 < c_nums) { + word = start; + word_len = next - start; + char_len = c_nums; + word_freq = 1; + start = next; + break; } else { start = next; } } + } while (OB_SUCC(ret) && next < end); + if (OB_ITER_END == ret || OB_SUCCESS == ret) { + start_ = start; + next_ = next; + end_ = end; } - if (OB_ITER_END == ret) { - ret = OB_SUCCESS; - } - } - return ret; -} - -int ObSpaceFTParser::add_word( - lib::ObFTParserParam *param, - const char *word, - const int64_t word_len, - const int64_t char_cnt) -{ - int ret = OB_SUCCESS; - if (OB_ISNULL(param) - || OB_ISNULL(word) - || OB_UNLIKELY(0 >= word_len)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len)); - } else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) { - LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word))); + LOG_DEBUG("next word", K(ObString(word_len, word)), KP(start_), KP(next_), KP(end_)); } return ret; } @@ -111,21 +153,43 @@ int ObWhiteSpaceFTParserDesc::deinit(lib::ObPluginParam *param) return OB_SUCCESS; } -int ObWhiteSpaceFTParserDesc::segment(lib::ObFTParserParam *param) const +int ObWhiteSpaceFTParserDesc::segment( + lib::ObFTParserParam *param, + lib::ObITokenIterator *&iter) const { int ret = OB_SUCCESS; + void *buf = nullptr; if (OB_UNLIKELY(!is_inited_)) { ret = OB_NOT_INIT; LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_)); } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), KPC(param)); - } else if (OB_FAIL(ObSpaceFTParser::segment(param, param->fulltext_, param->ft_length_))) { - LOG_WARN("fail to segment words for fulltext by spaces", K(ret), KPC(param), - K(param->fulltext_), K(param->ft_length_)); + } else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObSpaceFTParser)))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("fail to allocate space ft parser", K(ret)); + } else { + ObSpaceFTParser *parser = new (buf) ObSpaceFTParser(); + if (OB_FAIL(parser->init(param))) { + LOG_WARN("fail to init whitespace fulltext parser", K(ret), KPC(param)); + } else { + iter = parser; + } } return ret; } +void ObWhiteSpaceFTParserDesc::free_token_iter( + lib::ObFTParserParam *param, + lib::ObITokenIterator *&iter) const +{ + if (OB_NOT_NULL(iter)) { + abort_unless(nullptr != param); + abort_unless(nullptr != param->allocator_); + iter->~ObITokenIterator(); + param->allocator_->free(iter); + } +} + } // end namespace storage } // end namespace oceanbase diff --git a/src/storage/fts/ob_whitespace_ft_parser.h b/src/storage/fts/ob_whitespace_ft_parser.h index 9420c7ad92..3dc0ed6856 100644 --- a/src/storage/fts/ob_whitespace_ft_parser.h +++ b/src/storage/fts/ob_whitespace_ft_parser.h @@ -23,21 +23,27 @@ namespace oceanbase namespace storage { -class ObSpaceFTParser final +class ObSpaceFTParser final : public lib::ObITokenIterator { public: - ObSpaceFTParser() = default; - ~ObSpaceFTParser() = default; - static int segment( - lib::ObFTParserParam *param, - const char *fulltext, - const int64_t ft_len); + ObSpaceFTParser(); + virtual ~ObSpaceFTParser(); + + int init(lib::ObFTParserParam *param); + void reset(); + virtual int get_next_token( + const char *&word, + int64_t &word_len, + int64_t &char_len, + int64_t &word_freq) override; + + VIRTUAL_TO_STRING_KV(KP_(cs), KP_(start), KP_(next), KP_(end), K_(is_inited)); private: - static int add_word( - lib::ObFTParserParam *param, - const char *word, - const int64_t word_len, - const int64_t char_cnt); + const ObCharsetInfo *cs_; + const char *start_; + const char *next_; + const char *end_; + bool is_inited_; }; class ObWhiteSpaceFTParserDesc final : public lib::ObIFTParserDesc @@ -47,7 +53,8 @@ public: virtual ~ObWhiteSpaceFTParserDesc() = default; virtual int init(lib::ObPluginParam *param) override; virtual int deinit(lib::ObPluginParam *param) override; - virtual int segment(lib::ObFTParserParam *param) const override; + virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; + virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; OB_INLINE void reset() { is_inited_ = false; } private: bool is_inited_; diff --git a/unittest/storage/mock_ft_parser.cpp b/unittest/storage/mock_ft_parser.cpp index a24cfdde83..a62a4d04f0 100644 --- a/unittest/storage/mock_ft_parser.cpp +++ b/unittest/storage/mock_ft_parser.cpp @@ -19,6 +19,6 @@ OB_DECLARE_PLUGIN(mock_ft_parser) OB_PLUGIN_AUTHOR_OCEANBASE, "This is mock fulltext parser plugin.", 0x00001, - oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, + oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, &oceanbase::storage::mock_ft_parser, }; diff --git a/unittest/storage/mock_ft_parser.h b/unittest/storage/mock_ft_parser.h index 1c5d45cef4..e1d028dfcc 100644 --- a/unittest/storage/mock_ft_parser.h +++ b/unittest/storage/mock_ft_parser.h @@ -27,7 +27,7 @@ public: virtual ~ObMockFTParserDesc() = default; virtual int init(lib::ObPluginParam *param) override; virtual int deinit(lib::ObPluginParam *param) override; - virtual int segment(lib::ObFTParserParam *param) const override; + virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; }; int ObMockFTParserDesc::init(lib::ObPluginParam *param) @@ -42,7 +42,7 @@ int ObMockFTParserDesc::deinit(lib::ObPluginParam *param) return OB_SUCCESS; } -int ObMockFTParserDesc::segment(lib::ObFTParserParam *param) const +int ObMockFTParserDesc::segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const { UNUSED(param); return OB_SUCCESS; diff --git a/unittest/storage/test_fts_plugin.cpp b/unittest/storage/test_fts_plugin.cpp index 81f3b8d7af..139f7a03ff 100644 --- a/unittest/storage/test_fts_plugin.cpp +++ b/unittest/storage/test_fts_plugin.cpp @@ -49,33 +49,19 @@ int segment_and_calc_word_count( { int ret = OB_SUCCESS; int64_t doc_length = 0; - common::ObSEArray words; if (OB_ISNULL(helper) || OB_UNLIKELY(ObCollationType::CS_TYPE_INVALID == type || ObCollationType::CS_TYPE_EXTENDED_MARK < type) || OB_UNLIKELY(!words_count.created())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid arguments", K(ret), KPC(helper), K(type), K(words_count.created())); - } else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words))) { + } else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words_count))) { LOG_WARN("fail to segment", K(ret), KPC(helper), K(type), K(fulltext)); - } else { - for (int64_t i = 0; OB_SUCC(ret) && i < words.count(); ++i) { - const ObFTWord &ft_word = words.at(i); - int64_t word_count = 0; - if (OB_FAIL(words_count.get_refactored(ft_word, word_count)) && OB_HASH_NOT_EXIST != ret) { - LOG_WARN("fail to get ft word", K(ret), K(ft_word)); - } else { - word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count; - if (OB_FAIL(words_count.set_refactored(ft_word, word_count, 1/*overwrite*/))) { - LOG_WARN("fail to set ft word and count", K(ret), K(ft_word)); - } - } - } } return ret; } -class ObTestAddWord final : public lib::ObFTParserParam::ObIAddWord +class ObTestAddWord final { public: static const char *TEST_FULLTEXT; @@ -85,14 +71,16 @@ public: static const int64_t FT_MAX_WORD_LEN = 84; public: ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator); - virtual ~ObTestAddWord() = default; - virtual int operator()( - lib::ObFTParserParam *param, + ~ObTestAddWord() = default; + int check_words(lib::ObITokenIterator *iter); + int64_t get_add_word_count() const { return ith_word_; } + static int64_t get_word_cnt_without_stopword() { return TEST_WORD_COUNT_WITHOUT_STOPWORD; } + VIRTUAL_TO_STRING_KV(K_(ith_word)); +private: + int check_ith_word( const char *word, const int64_t word_len, - const int64_t char_cnt) override; - virtual int64_t get_add_word_count() const override { return ith_word_; } - VIRTUAL_TO_STRING_KV(K_(ith_word)); + const int64_t char_cnt); private: bool is_min_max_word(const int64_t c_len) const; int casedown_word(const ObFTWord &src, ObFTWord &dst); @@ -137,8 +125,32 @@ int ObTestAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst) return ret; } -int ObTestAddWord::operator()( - lib::ObFTParserParam *param, +int ObTestAddWord::check_words(lib::ObITokenIterator *iter) +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(iter)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arguments", K(ret), KP(iter)); + } else { + const char *word = nullptr; + int64_t word_len = 0; + int64_t char_len = 0; + int64_t word_freq = 0; + while (OB_SUCC(ret)) { + if (OB_FAIL(iter->get_next_token(word, word_len, char_len, word_freq))) { + LOG_WARN("fail to get next token", K(ret), KPC(iter)); + } else if (OB_FAIL(check_ith_word(word, word_len, char_len))) { + LOG_WARN("fail to check ith word", K(ret), KP(word), K(word_len), K(char_len)); + } + } + if (OB_ITER_END == ret) { + ret = OB_SUCCESS; + } + } + return ret; +} + +int ObTestAddWord::check_ith_word( const char *word, const int64_t word_len, const int64_t char_cnt) @@ -146,9 +158,9 @@ int ObTestAddWord::operator()( int ret = OB_SUCCESS; ObFTWord src_word(word_len, word, collation_type_); ObFTWord dst_word; - if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) { + if (OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len), K(char_cnt)); + LOG_WARN("invalid arguments", K(ret), KP(word), K(word_len), K(char_cnt)); } else if (is_min_max_word(char_cnt)) { // skip min/max word } else if (OB_FAIL(casedown_word(src_word, dst_word))) { @@ -194,7 +206,6 @@ void TestDefaultFTParser::SetUp() ASSERT_EQ(OB_SUCCESS, desc_.init(&plugin_param_)); ft_parser_param_.allocator_ = &allocator_; - ft_parser_param_.add_word_ = &add_word_; ft_parser_param_.cs_ = common::ObCharset::get_charset(ObCollationType::CS_TYPE_UTF8MB4_BIN); ft_parser_param_.parser_version_ = 0x00001; ASSERT_TRUE(nullptr != ft_parser_param_.cs_); @@ -209,54 +220,74 @@ void TestDefaultFTParser::TearDown() TEST_F(TestDefaultFTParser, test_space_ft_parser_segment) { + ObSpaceFTParser parser; const char *fulltext = ObTestAddWord::TEST_FULLTEXT; const int64_t ft_len = strlen(fulltext); - ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(nullptr, nullptr, 0)); - ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, nullptr, 0)); - ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, 0)); - ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, -1)); + ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(nullptr)); + + ft_parser_param_.fulltext_ = nullptr; + ft_parser_param_.ft_length_ = 0; + ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_)); + + ft_parser_param_.fulltext_ = fulltext; + ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_)); + + ft_parser_param_.ft_length_ = -1; + ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_)); ft_parser_param_.fulltext_ = fulltext; ft_parser_param_.ft_length_ = ft_len; LOG_INFO("before space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); - ASSERT_EQ(OB_SUCCESS, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, ft_len)); + ASSERT_EQ(OB_SUCCESS, parser.init(&ft_parser_param_)); + ASSERT_EQ(OB_SUCCESS, add_word_.check_words(&parser)); LOG_INFO("after space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); } TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268) { - common::ObArray words; - ObAddWordFlag flag; - ObAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, flag, allocator_, words); + ObSpaceFTParser parser; const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 "; const int64_t ft_len = strlen(fulltext); ft_parser_param_.fulltext_ = fulltext; ft_parser_param_.ft_length_ = ft_len; - ft_parser_param_.add_word_ = &add_word; ft_parser_param_.cs_ = common::ObCharset::get_charset(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI); LOG_INFO("before space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); - ASSERT_EQ(OB_SUCCESS, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, ft_len)); - LOG_INFO("after space segment", KCSTRING(fulltext), K(words), K(ft_len), K(ft_parser_param_)); + ASSERT_EQ(OB_SUCCESS, parser.init(&ft_parser_param_)); + const char *word = nullptr; + int64_t word_len = 0; + int64_t char_len = 0; + int64_t word_freq = 0; + int ret = OB_SUCCESS; + while (OB_SUCC(ret)) { + if (OB_FAIL(parser.get_next_token(word, word_len, char_len, word_freq))) { + LOG_WARN("fail to get next token", K(ret), K(parser)); + } else { + LOG_INFO("succeed to get next token", K(ret), K(ObString(word_len, word)), K(char_len)); + } + } + LOG_INFO("after space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); } TEST_F(TestDefaultFTParser, test_default_ft_parser_desc) { - ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(&ft_parser_param_)); + ObITokenIterator *iter = nullptr; + ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(&ft_parser_param_, iter)); ft_parser_param_.fulltext_ = ObTestAddWord::TEST_FULLTEXT; ft_parser_param_.ft_length_ = strlen(ft_parser_param_.fulltext_); - ASSERT_EQ(OB_SUCCESS, desc_.segment(&ft_parser_param_)); + ASSERT_EQ(OB_SUCCESS, desc_.segment(&ft_parser_param_, iter)); + ASSERT_EQ(OB_SUCCESS, add_word_.check_words(iter)); ASSERT_EQ(OB_SUCCESS, desc_.deinit(&plugin_param_)); - ASSERT_EQ(OB_NOT_INIT, desc_.segment(&ft_parser_param_)); + ASSERT_EQ(OB_NOT_INIT, desc_.segment(&ft_parser_param_, iter)); ASSERT_EQ(OB_SUCCESS, desc_.init(&plugin_param_)); - ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(nullptr)); + ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(nullptr, iter)); } class ObTestFTPluginHelper : public ::testing::Test @@ -442,29 +473,35 @@ void ObTestFTParseHelper::TearDownTestCase() TEST_F(ObTestFTParseHelper, test_parse_fulltext) { - common::ObSEArray words; + ObFTWordMap ft_word_map; + ASSERT_EQ(OB_SUCCESS, ft_word_map.create(10, "TestParse")); int64_t doc_length = 0; ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, - std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); + std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); ObTestAddWord test_add_word(cs_type_, allocator_); - for (int64_t i = 0; i < words.count(); ++i) { - ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length())); + ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size()); + for (int64_t i = 0; i < ft_word_map.size(); ++i) { + int64_t word_cnt = 0; + ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_); + ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt)); + ASSERT_TRUE(word_cnt >= 1); } - ObFTWordMap ft_word_map; - ASSERT_EQ(OB_SUCCESS, ft_word_map.create(words.count(), "TestParse")); + ft_word_map.clear(); ASSERT_EQ(OB_SUCCESS, segment_and_calc_word_count(allocator_, &parse_helper_, cs_type_, ObTestAddWord::TEST_FULLTEXT, ft_word_map)); - ASSERT_EQ(words.count(), ft_word_map.size()); + ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size()); - ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); - ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, words)); - ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, words)); + ft_word_map.clear(); + ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); + ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, ft_word_map)); + ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, ft_word_map)); parse_helper_.reset(); + ft_word_map.clear(); ASSERT_EQ(OB_NOT_INIT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, - std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); + std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.init(nullptr, plugin_name_)); ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.init(&allocator_, ObString())); @@ -472,9 +509,9 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext) ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(CS_TYPE_INVALID, ObTestAddWord::TEST_FULLTEXT, - std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); + std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(CS_TYPE_EXTENDED_MARK, ObTestAddWord::TEST_FULLTEXT, - std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); + std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); ASSERT_EQ(OB_INIT_TWICE, parse_helper_.init(&allocator_, plugin_name_)); @@ -484,57 +521,80 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext) parse_helper_.reset(); ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, - std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); - for (int64_t i = 0; i < words.count(); ++i) { - ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length())); + std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); + ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size()); + for (int64_t i = 0; i < ft_word_map.size(); ++i) { + int64_t word_cnt = 0; + ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_); + ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt)); + ASSERT_TRUE(word_cnt >= 1); + } + parse_helper_.reset(); + ft_word_map.clear(); + ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, "beng.1")); + ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, + std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); + ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size()); + for (int64_t i = 0; i < ft_word_map.size(); ++i) { + int64_t word_cnt = 0; + ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_); + ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt)); + ASSERT_TRUE(word_cnt >= 1); } } TEST_F(ObTestFTParseHelper, test_min_and_max_word_len) { - common::ObSEArray words; + ObFTWordMap words; + ASSERT_EQ(OB_SUCCESS, words.create(10, "TestParse")); int64_t doc_length = 0; // word len = 2; const char *word_len_2 = "ab"; ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_2, std::strlen(word_len_2), doc_length, words)); - ASSERT_EQ(0, words.count()); + ASSERT_EQ(0, words.size()); // word len = 3; const char *word_len_3 = "abc"; + words.clear(); ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_3, std::strlen(word_len_3), doc_length, words)); - ASSERT_EQ(1, words.count()); + ASSERT_EQ(1, words.size()); // word len = 4; const char *word_len_4 = "abcd"; + words.clear(); ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_4, std::strlen(word_len_4), doc_length, words)); - ASSERT_EQ(1, words.count()); + ASSERT_EQ(1, words.size()); // word len = 76; const char *word_len_76 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; + words.clear(); ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_76, std::strlen(word_len_76), doc_length, words)); - ASSERT_EQ(1, words.count()); + ASSERT_EQ(1, words.size()); // word len = 84; const char *word_len_84 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz123456"; + words.clear(); ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_84, std::strlen(word_len_84), doc_length, words)); - ASSERT_EQ(1, words.count()); + ASSERT_EQ(1, words.size()); // word len = 85; const char *word_len_85 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567"; + words.clear(); ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_85, std::strlen(word_len_85), doc_length, words)); - ASSERT_EQ(0, words.count()); + ASSERT_EQ(0, words.size()); } class ObTestNgramFTParseHelper : public ::testing::Test { public: static const char *name_; - static const int64_t TEST_WORD_COUNT = 29; + static const int64_t TEST_WORD_COUNT = 27; typedef common::hash::ObHashMap ObFTWordMap; public: ObTestNgramFTParseHelper(); virtual ~ObTestNgramFTParseHelper() = default; + static int64_t get_word_count() { return TEST_WORD_COUNT; } static void SetUpTestCase(); static void TearDownTestCase(); @@ -553,7 +613,7 @@ const char *ObTestNgramFTParseHelper::name_ = "ngram.1"; ObTestNgramFTParseHelper::ObTestNgramFTParseHelper() : plugin_name_(STRLEN(name_), name_), - ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"}, + ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"}, cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN), allocator_() { @@ -583,26 +643,33 @@ void ObTestNgramFTParseHelper::TearDownTestCase() TEST_F(ObTestNgramFTParseHelper, test_parse_fulltext) { + ObFTWordMap words; + ASSERT_EQ(OB_SUCCESS, words.create(10, "TestParse")); int64_t doc_length = 0; - common::ObSEArray words; ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); - for (int64_t i = 0; i < words.count(); ++i) { - ASSERT_TRUE(0 == strncmp(ngram_words_[i], words[i].word_.ptr(), words[i].word_.length())); + ASSERT_EQ(get_word_count(), words.size()); + for (int64_t i = 0; i < words.size(); ++i) { + int64_t word_cnt = 0; + ObFTWord word(strlen(ngram_words_[i]), ngram_words_[i], cs_type_); + ASSERT_EQ(OB_SUCCESS, words.get_refactored(word, word_cnt)); + ASSERT_TRUE(word_cnt >= 1); } ObFTWordMap ft_word_map; - ASSERT_EQ(OB_SUCCESS, ft_word_map.create(words.count(), "TestParse")); + ASSERT_EQ(OB_SUCCESS, ft_word_map.create(10, "TestParse")); ASSERT_EQ(OB_SUCCESS, segment_and_calc_word_count(allocator_, &parse_helper_, cs_type_, ObTestAddWord::TEST_FULLTEXT, ft_word_map)); - ASSERT_EQ(words.count(), ft_word_map.size() + 2); + ASSERT_EQ(words.size(), ft_word_map.size()); + words.clear(); ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, words)); ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, words)); parse_helper_.reset(); + words.clear(); ASSERT_EQ(OB_NOT_INIT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); @@ -620,14 +687,19 @@ TEST_F(ObTestNgramFTParseHelper, test_parse_fulltext) ASSERT_EQ(OB_INIT_TWICE, parse_helper_.init(&allocator_, plugin_name_)); parse_helper_.reset(); + words.clear(); ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); parse_helper_.reset(); ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); - for (int64_t i = 0; i < words.count(); ++i) { - ASSERT_TRUE(0 == strncmp(ngram_words_[i], words[i].word_.ptr(), words[i].word_.length())); + ASSERT_EQ(get_word_count(), words.size()); + for (int64_t i = 0; i < words.size(); ++i) { + int64_t word_cnt = 0; + ObFTWord word(strlen(ngram_words_[i]), ngram_words_[i], cs_type_); + ASSERT_EQ(OB_SUCCESS, words.get_refactored(word, word_cnt)); + ASSERT_TRUE(word_cnt >= 1); } } @@ -638,7 +710,7 @@ int main(int argc, char **argv) { system("rm -rf test_fts_plugin.log"); OB_LOGGER.set_file_name("test_fts_plugin.log", true); - OB_LOGGER.set_log_level("INFO"); + OB_LOGGER.set_log_level("DEBUG"); oceanbase::storage::ObTestFTPluginHelper::file_name = argv[0]; testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS();