[FTS] Adjust plugin tokenizer interface for fulltext search
This commit is contained in:
		
							
								
								
									
										57
									
								
								deps/oblib/src/lib/ob_plugin.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										57
									
								
								deps/oblib/src/lib/ob_plugin.h
									
									
									
									
										vendored
									
									
								
							| @ -114,7 +114,7 @@ enum class ObPluginType : uint64_t | |||||||
| // define plugin license | // define plugin license | ||||||
| enum class ObPluginLicenseType : uint64_t | enum class ObPluginLicenseType : uint64_t | ||||||
| { | { | ||||||
|   OB_MULAN_V2_LICENSE = 1,        // Mulan PubL v2 license |   OB_Mulan_PubL_V2_LICENSE = 1,   // Mulan PubL v2 license | ||||||
|   OB_MAX_PLUGIN_LICENSE_TYPE = 2, // max plugin license type |   OB_MAX_PLUGIN_LICENSE_TYPE = 2, // max plugin license type | ||||||
| }; | }; | ||||||
|  |  | ||||||
| @ -186,7 +186,7 @@ public: | |||||||
|         && nullptr != author_ |         && nullptr != author_ | ||||||
|         && nullptr != spec_ |         && nullptr != spec_ | ||||||
|         && PLUGIN_VERSION == version_ |         && PLUGIN_VERSION == version_ | ||||||
|         && (ObPluginLicenseType::OB_MULAN_V2_LICENSE <= license_ |         && (ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE <= license_ | ||||||
|             && license_ < ObPluginLicenseType::OB_MAX_PLUGIN_LICENSE_TYPE) |             && license_ < ObPluginLicenseType::OB_MAX_PLUGIN_LICENSE_TYPE) | ||||||
|         && nullptr != desc_; |         && nullptr != desc_; | ||||||
|   } |   } | ||||||
| @ -217,24 +217,9 @@ public: | |||||||
|  |  | ||||||
| class ObFTParserParam final | class ObFTParserParam final | ||||||
| { | { | ||||||
| public: |  | ||||||
|   class ObIAddWord |  | ||||||
|   { |  | ||||||
|   public: |  | ||||||
|     ObIAddWord() = default; |  | ||||||
|     virtual ~ObIAddWord() = default; |  | ||||||
|     virtual int operator()( |  | ||||||
|         ObFTParserParam *param, |  | ||||||
|         const char *word, |  | ||||||
|         const int64_t word_len, |  | ||||||
|         const int64_t char_cnt) = 0; |  | ||||||
|     virtual int64_t get_add_word_count() const = 0; |  | ||||||
|     DECLARE_PURE_VIRTUAL_TO_STRING; |  | ||||||
|   }; |  | ||||||
| public: | public: | ||||||
|   ObFTParserParam() |   ObFTParserParam() | ||||||
|     : allocator_(nullptr), |     : allocator_(nullptr), | ||||||
|       add_word_(nullptr), |  | ||||||
|       cs_(nullptr), |       cs_(nullptr), | ||||||
|       fulltext_(nullptr), |       fulltext_(nullptr), | ||||||
|       ft_length_(0), |       ft_length_(0), | ||||||
| @ -245,36 +230,42 @@ public: | |||||||
|   inline bool is_valid() const |   inline bool is_valid() const | ||||||
|   { |   { | ||||||
|     return nullptr != allocator_ |     return nullptr != allocator_ | ||||||
|         && nullptr != add_word_ |  | ||||||
|         && nullptr != cs_ |         && nullptr != cs_ | ||||||
|         && nullptr != fulltext_ |         && nullptr != fulltext_ | ||||||
|         && 0 < ft_length_ |         && 0 < ft_length_ | ||||||
|         && 0 <= parser_version_; |         && 0 <= parser_version_; | ||||||
|   } |   } | ||||||
|   inline int add_word(ObFTParserParam *param, const char *word, const int64_t word_len, const int64_t char_cnt) |  | ||||||
|   { |  | ||||||
|     return (*add_word_)(param, word, word_len, char_cnt); |  | ||||||
|   } |  | ||||||
|   inline void reset() |   inline void reset() | ||||||
|   { |   { | ||||||
|     allocator_ = nullptr; |     allocator_ = nullptr; | ||||||
|     add_word_ = nullptr; |  | ||||||
|     cs_ = nullptr; |     cs_ = nullptr; | ||||||
|     fulltext_ = nullptr; |     fulltext_ = nullptr; | ||||||
|     ft_length_ = 0; |     ft_length_ = 0; | ||||||
|     parser_version_ = 0; |     parser_version_ = 0; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   TO_STRING_KV(KP_(allocator), KP_(add_word), KP_(cs), K_(fulltext), K_(ft_length), K_(parser_version)); |   TO_STRING_KV(KP_(allocator), KP_(cs), K_(fulltext), K_(ft_length), K_(parser_version)); | ||||||
| public: | public: | ||||||
|   common::ObIAllocator *allocator_; |   common::ObIAllocator *allocator_; | ||||||
|   ObIAddWord *add_word_; |  | ||||||
|   const ObCharsetInfo *cs_; |   const ObCharsetInfo *cs_; | ||||||
|   const char *fulltext_; |   const char *fulltext_; | ||||||
|   int64_t ft_length_; |   int64_t ft_length_; | ||||||
|   int64_t parser_version_; |   int64_t parser_version_; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | class ObITokenIterator | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |   ObITokenIterator() = default; | ||||||
|  |   virtual ~ObITokenIterator() = default; | ||||||
|  |   virtual int get_next_token( | ||||||
|  |       const char *&word, | ||||||
|  |       int64_t &word_len, | ||||||
|  |       int64_t &char_cnt, | ||||||
|  |       int64_t &word_freq) = 0; | ||||||
|  |   DECLARE_PURE_VIRTUAL_TO_STRING; | ||||||
|  | }; | ||||||
|  |  | ||||||
| // fulltext parser descriptor interface for domain index | // fulltext parser descriptor interface for domain index | ||||||
| //  - splitting a document into multiple tokens. | //  - splitting a document into multiple tokens. | ||||||
| class ObIFTParserDesc : public ObIPluginDesc | class ObIFTParserDesc : public ObIPluginDesc | ||||||
| @ -286,12 +277,22 @@ public: | |||||||
|   /** |   /** | ||||||
|    * split fulltext into multiple word segments |    * split fulltext into multiple word segments | ||||||
|    * |    * | ||||||
|    * @param[in]  fulltext, the document to be tokenized. |    * @param[in]  param, the document to be tokenized and parameters related to word segmentation. | ||||||
|    * @param[out] words, the word segmentation after splitting. |    * @param[out] iter, an iterator over the tokenized words. | ||||||
|    * |    * | ||||||
|    * @return error code, such as, OB_SUCCESS, OB_INVALID_ARGUMENT, ... |    * @return error code, such as, OB_SUCCESS, OB_INVALID_ARGUMENT, ... | ||||||
|    */ |    */ | ||||||
|   virtual int segment(ObFTParserParam *param) const = 0; |   virtual int segment(ObFTParserParam *param, ObITokenIterator *&iter) const = 0; | ||||||
|  |  | ||||||
|  |   /** | ||||||
|  |    * Release the resources held by the token iterator and free the iterator itself. | ||||||
|  |    */ | ||||||
|  |   virtual void free_token_iter(ObFTParserParam *param, ObITokenIterator *&iter) const | ||||||
|  |   { | ||||||
|  |     if (OB_NOT_NULL(iter)) { | ||||||
|  |       iter->~ObITokenIterator(); | ||||||
|  |     } | ||||||
|  |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| } // end namespace lib | } // end namespace lib | ||||||
|  | |||||||
| @ -207,30 +207,16 @@ int ObDASDomainUtils::generate_spatial_index_rows( | |||||||
|     ObFTWordMap &words_count) |     ObFTWordMap &words_count) | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   common::ObSEArray<ObFTWord, 256> words; |  | ||||||
|   if (OB_ISNULL(helper) |   if (OB_ISNULL(helper) | ||||||
|       || OB_UNLIKELY(ObCollationType::CS_TYPE_INVALID == type |       || OB_UNLIKELY(ObCollationType::CS_TYPE_INVALID == type | ||||||
|                   || ObCollationType::CS_TYPE_EXTENDED_MARK < type) |                   || ObCollationType::CS_TYPE_EXTENDED_MARK < type) | ||||||
|       || OB_UNLIKELY(!words_count.created())) { |       || OB_UNLIKELY(!words_count.created())) { | ||||||
|     ret = OB_INVALID_ARGUMENT; |     ret = OB_INVALID_ARGUMENT; | ||||||
|     LOG_WARN("invalid arguments", K(ret), KPC(helper), K(type), K(words_count.created())); |     LOG_WARN("invalid arguments", K(ret), KPC(helper), K(type), K(words_count.created())); | ||||||
|   } else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words))) { |   } else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words_count))) { | ||||||
|     LOG_WARN("fail to segment", K(ret), KPC(helper), K(type), K(fulltext)); |     LOG_WARN("fail to segment", K(ret), KPC(helper), K(type), K(fulltext)); | ||||||
|   } else { |  | ||||||
|     for (int64_t i = 0; OB_SUCC(ret) && i < words.count(); ++i) { |  | ||||||
|       const ObFTWord &ft_word = words.at(i); |  | ||||||
|       int64_t word_count = 0; |  | ||||||
|       if (OB_FAIL(words_count.get_refactored(ft_word, word_count)) && OB_HASH_NOT_EXIST != ret) { |  | ||||||
|         LOG_WARN("fail to get ft word", K(ret), K(ft_word)); |  | ||||||
|       } else { |  | ||||||
|         word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count; |  | ||||||
|         if (OB_FAIL(words_count.set_refactored(ft_word, word_count, 1/*overwrite*/))) { |  | ||||||
|           LOG_WARN("fail to set ft word and count", K(ret), K(ft_word)); |  | ||||||
|         } |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |   } | ||||||
|   STORAGE_FTS_LOG(DEBUG, "segment and calc word count", K(ret), K(words), K(type)); |   STORAGE_FTS_LOG(DEBUG, "segment and calc word count", K(ret), K(words_count.size()), K(type)); | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
| @ -484,6 +470,7 @@ void ObDomainDMLIterator::reset() | |||||||
|   row_projector_ = nullptr; |   row_projector_ = nullptr; | ||||||
|   das_ctdef_ = nullptr; |   das_ctdef_ = nullptr; | ||||||
|   main_ctdef_ = nullptr; |   main_ctdef_ = nullptr; | ||||||
|  |   allocator_.reset(); | ||||||
| } | } | ||||||
|  |  | ||||||
| void ObDomainDMLIterator::set_ctdef( | void ObDomainDMLIterator::set_ctdef( | ||||||
| @ -520,10 +507,12 @@ int ObDomainDMLIterator::get_next_domain_row(ObNewRow *&row) | |||||||
|   while (OB_SUCC(ret) && !got_row) { |   while (OB_SUCC(ret) && !got_row) { | ||||||
|     if (row_idx_ >= rows_.count()) { |     if (row_idx_ >= rows_.count()) { | ||||||
|       rows_.reuse(); |       rows_.reuse(); | ||||||
|  |       allocator_.reuse(); | ||||||
|       row_idx_ = 0; |       row_idx_ = 0; | ||||||
|       if (OB_UNLIKELY(!das_ctdef_->table_param_.get_data_table().is_domain_index())) { |       if (OB_UNLIKELY(!das_ctdef_->table_param_.get_data_table().is_domain_index())) { | ||||||
|         ret = OB_ERR_UNEXPECTED; |         ret = OB_ERR_UNEXPECTED; | ||||||
|         LOG_WARN("unexpected error, not domain index", K(ret), K(das_ctdef_->table_param_.get_data_table())); |         LOG_WARN("unexpected error, not domain index", K(ret), K(das_ctdef_->table_param_.get_data_table())); | ||||||
|  |  | ||||||
|       } else if (FAILEDx(write_iter_.get_next_row(sr))) { |       } else if (FAILEDx(write_iter_.get_next_row(sr))) { | ||||||
|         if (OB_ITER_END != ret) { |         if (OB_ITER_END != ret) { | ||||||
|           LOG_WARN("get next row from result iterator failed", K(ret)); |           LOG_WARN("get next row from result iterator failed", K(ret)); | ||||||
| @ -562,6 +551,7 @@ int ObDomainDMLIterator::get_next_domain_rows(ObNewRow *&row, int64_t &row_count | |||||||
|     while (OB_SUCC(ret) && !got_row) { |     while (OB_SUCC(ret) && !got_row) { | ||||||
|       if (row_idx_ >= rows_.count()) { |       if (row_idx_ >= rows_.count()) { | ||||||
|         rows_.reuse(); |         rows_.reuse(); | ||||||
|  |         allocator_.reuse(); | ||||||
|         row_idx_ = 0; |         row_idx_ = 0; | ||||||
|         if (OB_UNLIKELY(!das_ctdef_->table_param_.get_data_table().is_domain_index())) { |         if (OB_UNLIKELY(!das_ctdef_->table_param_.get_data_table().is_domain_index())) { | ||||||
|           ret = OB_ERR_UNEXPECTED; |           ret = OB_ERR_UNEXPECTED; | ||||||
| @ -757,7 +747,7 @@ int ObFTDMLIterator::get_ft_and_doc_id( | |||||||
|     const ObChunkDatumStore::StoredRow *store_row, |     const ObChunkDatumStore::StoredRow *store_row, | ||||||
|     ObString &doc_id, |     ObString &doc_id, | ||||||
|     ObString &ft, |     ObString &ft, | ||||||
|     common::ObObjMeta &ft_meta) const |     common::ObObjMeta &ft_meta) | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   const uint64_t doc_id_col_id = das_ctdef_->table_param_.get_data_table().get_doc_id_col_id(); |   const uint64_t doc_id_col_id = das_ctdef_->table_param_.get_data_table().get_doc_id_col_id(); | ||||||
| @ -793,7 +783,7 @@ int ObFTDMLIterator::get_ft_and_doc_id_for_update( | |||||||
|     const ObChunkDatumStore::StoredRow *store_row, |     const ObChunkDatumStore::StoredRow *store_row, | ||||||
|     ObString &doc_id, |     ObString &doc_id, | ||||||
|     ObString &ft, |     ObString &ft, | ||||||
|     common::ObObjMeta &ft_meta) const |     common::ObObjMeta &ft_meta) | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   const uint64_t rowkey_col_cnt = das_ctdef_->table_param_.get_data_table().get_rowkey_column_num(); |   const uint64_t rowkey_col_cnt = das_ctdef_->table_param_.get_data_table().get_rowkey_column_num(); | ||||||
| @ -863,7 +853,7 @@ int ObMultivalueDMLIterator::get_multivlaue_json_data( | |||||||
|     const ObChunkDatumStore::StoredRow *store_row, |     const ObChunkDatumStore::StoredRow *store_row, | ||||||
|     int64_t& multivalue_idx, |     int64_t& multivalue_idx, | ||||||
|     int64_t& multivalue_arr_idx, |     int64_t& multivalue_arr_idx, | ||||||
|     ObString &multivalue_data) const |     ObString &multivalue_data) | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   multivalue_idx = OB_INVALID_ID; |   multivalue_idx = OB_INVALID_ID; | ||||||
| @ -910,7 +900,7 @@ int ObMultivalueDMLIterator::get_multivlaue_json_data_for_update( | |||||||
|     const ObChunkDatumStore::StoredRow *store_row, |     const ObChunkDatumStore::StoredRow *store_row, | ||||||
|     int64_t& multivalue_idx, |     int64_t& multivalue_idx, | ||||||
|     int64_t& multivalue_arr_idx, |     int64_t& multivalue_arr_idx, | ||||||
|     ObString &multivalue_data) const |     ObString &multivalue_data) | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   bool found = false; |   bool found = false; | ||||||
|  | |||||||
| @ -13,6 +13,7 @@ | |||||||
| #ifndef OCEANBASE_DAS_DOMAIN_UTILS_H | #ifndef OCEANBASE_DAS_DOMAIN_UTILS_H | ||||||
| #define OCEANBASE_DAS_DOMAIN_UTILS_H | #define OCEANBASE_DAS_DOMAIN_UTILS_H | ||||||
|  |  | ||||||
|  | #include "lib/allocator/page_arena.h" | ||||||
| #include "lib/hash/ob_hashset.h" | #include "lib/hash/ob_hashset.h" | ||||||
| #include "sql/das/ob_das_dml_ctx_define.h" | #include "sql/das/ob_das_dml_ctx_define.h" | ||||||
| #include "storage/fts/ob_fts_plugin_helper.h" | #include "storage/fts/ob_fts_plugin_helper.h" | ||||||
| @ -56,8 +57,6 @@ public: | |||||||
|       const IntFixedArray &row_projector, |       const IntFixedArray &row_projector, | ||||||
|       const ObDASWriteBuffer::DmlRow &dml_row, |       const ObDASWriteBuffer::DmlRow &dml_row, | ||||||
|       ObDomainIndexRow &domain_rows); |       ObDomainIndexRow &domain_rows); | ||||||
| private: |  | ||||||
|   typedef common::hash::ObHashMap<ObFTWord, int64_t> ObFTWordMap; |  | ||||||
| private: | private: | ||||||
|   static int segment_and_calc_word_count( |   static int segment_and_calc_word_count( | ||||||
|       common::ObIAllocator &allocator, |       common::ObIAllocator &allocator, | ||||||
| @ -126,7 +125,7 @@ protected: | |||||||
|   ObDASWriteBuffer::Iterator &write_iter_; |   ObDASWriteBuffer::Iterator &write_iter_; | ||||||
|   const ObDASDMLBaseCtDef *das_ctdef_; |   const ObDASDMLBaseCtDef *das_ctdef_; | ||||||
|   const ObDASDMLBaseCtDef *main_ctdef_; |   const ObDASDMLBaseCtDef *main_ctdef_; | ||||||
|   common::ObIAllocator &allocator_; |   common::ObArenaAllocator allocator_; | ||||||
|   bool is_update_; |   bool is_update_; | ||||||
| private: | private: | ||||||
|   DISALLOW_COPY_AND_ASSIGN(ObDomainDMLIterator); |   DISALLOW_COPY_AND_ASSIGN(ObDomainDMLIterator); | ||||||
| @ -178,13 +177,13 @@ private: | |||||||
|     const ObChunkDatumStore::StoredRow *store_row, |     const ObChunkDatumStore::StoredRow *store_row, | ||||||
|     int64_t& multivalue_idx, |     int64_t& multivalue_idx, | ||||||
|     int64_t& multivalue_arr_idx, |     int64_t& multivalue_arr_idx, | ||||||
|     ObString &multivalue_data) const; |     ObString &multivalue_data); | ||||||
|  |  | ||||||
|   int get_multivlaue_json_data_for_update( |   int get_multivlaue_json_data_for_update( | ||||||
|     const ObChunkDatumStore::StoredRow *store_row, |     const ObChunkDatumStore::StoredRow *store_row, | ||||||
|     int64_t& multivalue_idx, |     int64_t& multivalue_idx, | ||||||
|     int64_t& multivalue_arr_idx, |     int64_t& multivalue_arr_idx, | ||||||
|     ObString &multivalue_data) const; |     ObString &multivalue_data); | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| @ -214,12 +213,12 @@ protected: | |||||||
|       const ObChunkDatumStore::StoredRow *store_row, |       const ObChunkDatumStore::StoredRow *store_row, | ||||||
|       ObString &doc_id, |       ObString &doc_id, | ||||||
|       ObString &ft, |       ObString &ft, | ||||||
|       common::ObObjMeta &ft_meta) const; |       common::ObObjMeta &ft_meta); | ||||||
|   int get_ft_and_doc_id_for_update( |   int get_ft_and_doc_id_for_update( | ||||||
|       const ObChunkDatumStore::StoredRow *store_row, |       const ObChunkDatumStore::StoredRow *store_row, | ||||||
|       ObString &doc_id, |       ObString &doc_id, | ||||||
|       ObString &ft, |       ObString &ft, | ||||||
|       common::ObObjMeta &ft_meta) const; |       common::ObObjMeta &ft_meta); | ||||||
|  |  | ||||||
| private: | private: | ||||||
|   storage::ObFTParseHelper ft_parse_helper_; |   storage::ObFTParseHelper ft_parse_helper_; | ||||||
|  | |||||||
| @ -312,21 +312,9 @@ int ObTextRetrievalMerge::init_query_tokens(const ObDASIRScanCtDef *ir_ctdef, Ob | |||||||
|     } else if (OB_FAIL(token_map.create(ft_word_bkt_cnt, common::ObMemAttr(MTL_ID(), "FTWordMap")))) { |     } else if (OB_FAIL(token_map.create(ft_word_bkt_cnt, common::ObMemAttr(MTL_ID(), "FTWordMap")))) { | ||||||
|       LOG_WARN("failed to create token map", K(ret)); |       LOG_WARN("failed to create token map", K(ret)); | ||||||
|     } else if (OB_FAIL(tokenize_helper.segment( |     } else if (OB_FAIL(tokenize_helper.segment( | ||||||
|         cs_type, search_text_string.ptr(), search_text_string.length(), doc_length, tokens))) { |         cs_type, search_text_string.ptr(), search_text_string.length(), doc_length, token_map))) { | ||||||
|       LOG_WARN("failed to segment"); |       LOG_WARN("failed to segment"); | ||||||
|     } else { |     } else { | ||||||
|       for (int64_t i = 0; OB_SUCC(ret) && i < tokens.count(); ++i) { |  | ||||||
|         const ObFTWord &token = tokens.at(i); |  | ||||||
|         int64_t word_count = 0; |  | ||||||
|         if (OB_FAIL(token_map.get_refactored(token, word_count)) && OB_HASH_NOT_EXIST != ret) { |  | ||||||
|           LOG_WARN("fail to get ft word", K(ret), K(token)); |  | ||||||
|         } else { |  | ||||||
|           word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count; |  | ||||||
|           if (OB_FAIL(token_map.set_refactored(token, word_count, 1/*overwrite*/))) { |  | ||||||
|             LOG_WARN("fail to set ft word and count", K(ret), K(token)); |  | ||||||
|           } |  | ||||||
|         } |  | ||||||
|       } |  | ||||||
|       for (hash::ObHashMap<ObFTWord, int64_t>::const_iterator iter = token_map.begin(); |       for (hash::ObHashMap<ObFTWord, int64_t>::const_iterator iter = token_map.begin(); | ||||||
|           OB_SUCC(ret) && iter != token_map.end(); |           OB_SUCC(ret) && iter != token_map.end(); | ||||||
|           ++iter) { |           ++iter) { | ||||||
|  | |||||||
| @ -22,70 +22,43 @@ namespace oceanbase | |||||||
| namespace storage | namespace storage | ||||||
| { | { | ||||||
|  |  | ||||||
| /*static*/ int ObBEngFTParser::segment( | int ObBEngFTParser::get_next_token( | ||||||
|     lib::ObFTParserParam *param, |     const char *&word, | ||||||
|     const char *ft, |     int64_t &word_len, | ||||||
|     const int64_t ft_len) |     int64_t &char_len, | ||||||
| { |     int64_t &word_freq) | ||||||
|   int ret = OB_SUCCESS; |  | ||||||
|   ObDatum doc; |  | ||||||
|   doc.set_string(ft, ft_len); |  | ||||||
|   ObBEngFTParser parser; |  | ||||||
|   share::ObITokenStream *token_stream = nullptr; |  | ||||||
|   if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) { |  | ||||||
|     ret = OB_INVALID_ARGUMENT; |  | ||||||
|     LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len)); |  | ||||||
|   } else if (OB_FAIL(parser.init(param))) { |  | ||||||
|     LOG_WARN("fail to initialize basic english parser", K(ret), KPC(param)); |  | ||||||
|   } else if (FALSE_IT(doc.set_string(ft, ft_len))) { |  | ||||||
|   } else if (OB_FAIL(parser.segment(doc, token_stream))) { |  | ||||||
|     LOG_WARN("fail to segment fulltext by parser", K(ret), KP(ft), K(ft_len)); |  | ||||||
|   } else if (OB_ISNULL(token_stream)) { |  | ||||||
|     ret = OB_ERR_UNEXPECTED; |  | ||||||
|     LOG_WARN("token stream is nullptr", K(ret), KP(token_stream)); |  | ||||||
|   } else { |  | ||||||
|     ObDatum token; |  | ||||||
|     int64_t token_freq = 0; |  | ||||||
|     while (OB_SUCC(ret)) { |  | ||||||
|       if (OB_FAIL(token_stream->get_next(token, token_freq))) { |  | ||||||
|         if (OB_ITER_END != ret) { |  | ||||||
|           LOG_WARN("fail to get next token", K(ret), KPC(token_stream)); |  | ||||||
|         } |  | ||||||
|       } else if (OB_FAIL(add_word(param, param->allocator_, token.ptr_, token.len_))) { |  | ||||||
|         LOG_WARN("fail to add word", K(ret), K(token), KPC(param)); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     if (OB_ITER_END == ret) { |  | ||||||
|       ret = OB_SUCCESS; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   return ret; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /*static*/ int ObBEngFTParser::add_word( |  | ||||||
|     lib::ObFTParserParam *param, |  | ||||||
|     common::ObIAllocator *allocator, |  | ||||||
|     const char *word, |  | ||||||
|     int64_t word_len) |  | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|  |   ObDatum token; | ||||||
|  |   int64_t token_freq = 0; | ||||||
|   char *buf = nullptr; |   char *buf = nullptr; | ||||||
|   if (OB_ISNULL(param) |   word = nullptr; | ||||||
|       || OB_ISNULL(allocator) |   word_len = 0; | ||||||
|       || OB_ISNULL(word) |   char_len = 0; | ||||||
|       || OB_UNLIKELY(0 >= word_len)) { |   word_freq = 0; | ||||||
|  |   if (OB_UNLIKELY(!is_inited_)) { | ||||||
|  |     ret = OB_NOT_INIT; | ||||||
|  |     LOG_WARN("beng ft parser isn't initialized", K(ret), K(is_inited_)); | ||||||
|  |   } else if (OB_ISNULL(token_stream_)) { | ||||||
|  |     ret = OB_ERR_UNEXPECTED; | ||||||
|  |     LOG_WARN("token stream is nullptr", K(ret), KP(token_stream_)); | ||||||
|  |   } else if (OB_FAIL(token_stream_->get_next(token, token_freq))) { | ||||||
|  |     if (OB_ITER_END != ret) { | ||||||
|  |       LOG_WARN("fail to get next token", K(ret), KPC(token_stream_)); | ||||||
|  |     } | ||||||
|  |   } else if (OB_ISNULL(token.ptr_) || OB_UNLIKELY(0 >= token.len_ || 0 >= token_freq)) { | ||||||
|     ret = OB_INVALID_ARGUMENT; |     ret = OB_INVALID_ARGUMENT; | ||||||
|     LOG_WARN("invalid arguments", K(ret), KPC(param), KP(allocator), KP(word), K(word_len)); |     LOG_WARN("invalid arguments", K(ret), KP(token.ptr_), K(token.len_), K(token_freq)); | ||||||
|   } else if (word_len < FT_MIN_WORD_LEN || word_len > FT_MAX_WORD_LEN) { |   } else if (OB_ISNULL(buf = static_cast<char *>(allocator_.alloc(token.len_)))) { | ||||||
|     LOG_DEBUG("skip too small or large word", K(ret), K(word_len)); |  | ||||||
|   } else if (OB_ISNULL(buf = static_cast<char *>(allocator->alloc(word_len)))) { |  | ||||||
|     ret = OB_ALLOCATE_MEMORY_FAILED; |     ret = OB_ALLOCATE_MEMORY_FAILED; | ||||||
|     LOG_WARN("fail to allocate word memory", K(ret), K(word_len)); |     LOG_WARN("fail to allocate word memory", K(ret), K(token.len_)); | ||||||
|   } else if (FALSE_IT(MEMCPY(buf, word, word_len))) { |  | ||||||
|   } else if (OB_FAIL(param->add_word(param, buf, word_len, word_len))) { |  | ||||||
|     LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, buf)), K(ObString(word_len, word))); |  | ||||||
|   } else { |   } else { | ||||||
|     LOG_DEBUG("succeed to add word", K(ObString(word_len, word))); |     MEMCPY(buf, token.ptr_, token.len_); | ||||||
|  |     word = buf; | ||||||
|  |     word_len = token.len_; | ||||||
|  |     char_len = token.len_; | ||||||
|  |     word_freq = token_freq; | ||||||
|  |     LOG_DEBUG("succeed to add word", K(ObString(word_len, word)), K(word_freq)); | ||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
| @ -103,13 +76,20 @@ int ObBEngFTParser::init(lib::ObFTParserParam *param) | |||||||
|     ret = OB_NOT_SUPPORTED; |     ret = OB_NOT_SUPPORTED; | ||||||
|     LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_)); |     LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_)); | ||||||
|   } else { |   } else { | ||||||
|  |     doc_.set_string(param->fulltext_, param->ft_length_); | ||||||
|     analysis_ctx_.cs_ = param->cs_; |     analysis_ctx_.cs_ = param->cs_; | ||||||
|     analysis_ctx_.filter_stopword_ = false; |     analysis_ctx_.filter_stopword_ = false; | ||||||
|     analysis_ctx_.need_grouping_ = false; |     analysis_ctx_.need_grouping_ = false; | ||||||
|     if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) { |     if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) { | ||||||
|       LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_)); |       LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_)); | ||||||
|  |     } else if (OB_FAIL(segment(doc_, token_stream_))) { | ||||||
|  |       LOG_WARN("fail to segment fulltext by parser", K(ret), KP(param->fulltext_), K(param->ft_length_)); | ||||||
|  |     } else if (OB_ISNULL(token_stream_)) { | ||||||
|  |       ret = OB_ERR_UNEXPECTED; | ||||||
|  |       LOG_WARN("token stream is nullptr", K(ret), KP(token_stream_)); | ||||||
|     } else { |     } else { | ||||||
|       is_inited_ = true; |       is_inited_ = true; | ||||||
|  |       LOG_DEBUG("succeed to init beng parser", K(ret), K(english_analyzer_), KPC(token_stream_), K(doc_)); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) { |   if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) { | ||||||
| @ -139,6 +119,8 @@ void ObBEngFTParser::reset() | |||||||
| { | { | ||||||
|   analysis_ctx_.reset(); |   analysis_ctx_.reset(); | ||||||
|   english_analyzer_.reset(); |   english_analyzer_.reset(); | ||||||
|  |   doc_.reset(); | ||||||
|  |   token_stream_ = nullptr; | ||||||
|   is_inited_ = false; |   is_inited_ = false; | ||||||
| } | } | ||||||
|  |  | ||||||
| @ -159,20 +141,43 @@ int ObBasicEnglishFTParserDesc::deinit(lib::ObPluginParam *param) | |||||||
|   return OB_SUCCESS; |   return OB_SUCCESS; | ||||||
| } | } | ||||||
|  |  | ||||||
| int ObBasicEnglishFTParserDesc::segment(lib::ObFTParserParam *param) const | int ObBasicEnglishFTParserDesc::segment( | ||||||
|  |     lib::ObFTParserParam *param, | ||||||
|  |     lib::ObITokenIterator *&iter) const | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|  |   void *buf = nullptr; | ||||||
|   if (OB_UNLIKELY(!is_inited_)) { |   if (OB_UNLIKELY(!is_inited_)) { | ||||||
|     ret = OB_NOT_INIT; |     ret = OB_NOT_INIT; | ||||||
|     LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_)); |     LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_)); | ||||||
|   } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) { |   } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) { | ||||||
|     ret = OB_INVALID_ARGUMENT; |     ret = OB_INVALID_ARGUMENT; | ||||||
|     LOG_WARN("invalid argument", K(ret), KPC(param)); |     LOG_WARN("invalid argument", K(ret), KPC(param)); | ||||||
|   } else if (OB_FAIL(ObBEngFTParser::segment(param, param->fulltext_, param->ft_length_))) { |   } else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObBEngFTParser)))) { | ||||||
|     LOG_WARN("fail to segment words for fulltext by beng", K(ret), KPC(param), |     ret = OB_ALLOCATE_MEMORY_FAILED; | ||||||
|         K(param->fulltext_), K(param->ft_length_)); |     LOG_WARN("fail to allocate basic english ft parser", K(ret)); | ||||||
|  |   } else { | ||||||
|  |     ObBEngFTParser *parser = new (buf) ObBEngFTParser(*(param->allocator_)); | ||||||
|  |     if (OB_FAIL(parser->init(param))) { | ||||||
|  |       LOG_WARN("fail to init basic english parser", K(ret), KPC(param)); | ||||||
|  |     } else { | ||||||
|  |       iter = parser; | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
|  |   return ret; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void ObBasicEnglishFTParserDesc::free_token_iter( | ||||||
|  |     lib::ObFTParserParam *param, | ||||||
|  |     lib::ObITokenIterator *&iter) const | ||||||
|  | { | ||||||
|  |   if (OB_NOT_NULL(iter)) { | ||||||
|  |     abort_unless(nullptr != param); | ||||||
|  |     abort_unless(nullptr != param->allocator_); | ||||||
|  |     iter->~ObITokenIterator(); | ||||||
|  |     param->allocator_->free(iter); | ||||||
|  |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| } // end namespace storage | } // end namespace storage | ||||||
|  | |||||||
| @ -23,40 +23,41 @@ namespace oceanbase | |||||||
| namespace storage | namespace storage | ||||||
| { | { | ||||||
|  |  | ||||||
| class ObBEngFTParser final | class ObBEngFTParser final : public lib::ObITokenIterator | ||||||
| { | { | ||||||
| public: | public: | ||||||
|   static const int64_t FT_MIN_WORD_LEN = 3; |   static const int64_t FT_MIN_WORD_LEN = 3; | ||||||
|   static const int64_t FT_MAX_WORD_LEN = 84; |   static const int64_t FT_MAX_WORD_LEN = 84; | ||||||
| public: | public: | ||||||
|   static int segment( |   explicit ObBEngFTParser(common::ObIAllocator &allocator) | ||||||
|       lib::ObFTParserParam *param, |     : allocator_(allocator), | ||||||
|       const char *fulltext, |       analysis_ctx_(), | ||||||
|       const int64_t ft_len); |  | ||||||
|  |  | ||||||
| private: |  | ||||||
|   ObBEngFTParser() |  | ||||||
|     : analysis_ctx_(), |  | ||||||
|       english_analyzer_(), |       english_analyzer_(), | ||||||
|  |       doc_(), | ||||||
|  |       token_stream_(nullptr), | ||||||
|       is_inited_(false) |       is_inited_(false) | ||||||
|   {} |   {} | ||||||
|   ~ObBEngFTParser() = default; |   ~ObBEngFTParser() { reset(); } | ||||||
|  |  | ||||||
|   static int add_word( |  | ||||||
|       lib::ObFTParserParam *param, |  | ||||||
|       common::ObIAllocator *allocator, |  | ||||||
|       const char *word, |  | ||||||
|       int64_t word_len); |  | ||||||
|   int init(lib::ObFTParserParam *param); |   int init(lib::ObFTParserParam *param); | ||||||
|   void reset(); |   void reset(); | ||||||
|  |   virtual int get_next_token( | ||||||
|  |       const char *&word, | ||||||
|  |       int64_t &word_len, | ||||||
|  |       int64_t &char_len, | ||||||
|  |       int64_t &word_freq) override; | ||||||
|  |  | ||||||
|  |   VIRTUAL_TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), KP_(token_stream), K_(is_inited)); | ||||||
|  | private: | ||||||
|   int segment( |   int segment( | ||||||
|       const common::ObDatum &doc, |       const common::ObDatum &doc, | ||||||
|       share::ObITokenStream *&token_stream); |       share::ObITokenStream *&token_stream); | ||||||
|   TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), K_(is_inited)); |  | ||||||
|  |  | ||||||
| private: | private: | ||||||
|  |   common::ObIAllocator &allocator_; | ||||||
|   share::ObTextAnalysisCtx analysis_ctx_; |   share::ObTextAnalysisCtx analysis_ctx_; | ||||||
|   share::ObEnglishTextAnalyzer english_analyzer_; |   share::ObEnglishTextAnalyzer english_analyzer_; | ||||||
|  |   common::ObDatum doc_; | ||||||
|  |   share::ObITokenStream *token_stream_; | ||||||
|   bool is_inited_; |   bool is_inited_; | ||||||
|  |  | ||||||
|   DISALLOW_COPY_AND_ASSIGN(ObBEngFTParser); |   DISALLOW_COPY_AND_ASSIGN(ObBEngFTParser); | ||||||
| @ -69,7 +70,8 @@ public: | |||||||
|   virtual ~ObBasicEnglishFTParserDesc() = default; |   virtual ~ObBasicEnglishFTParserDesc() = default; | ||||||
|   virtual int init(lib::ObPluginParam *param) override; |   virtual int init(lib::ObPluginParam *param) override; | ||||||
|   virtual int deinit(lib::ObPluginParam *param) override; |   virtual int deinit(lib::ObPluginParam *param) override; | ||||||
|   virtual int segment(lib::ObFTParserParam *param) const override; |   virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; | ||||||
|  |   virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; | ||||||
|   OB_INLINE void reset() { is_inited_ = false; } |   OB_INLINE void reset() { is_inited_ = false; } | ||||||
| private: | private: | ||||||
|   bool is_inited_; |   bool is_inited_; | ||||||
|  | |||||||
| @ -21,13 +21,13 @@ | |||||||
|  |  | ||||||
| OB_DECLARE_PLUGIN(whitespace_parser) | OB_DECLARE_PLUGIN(whitespace_parser) | ||||||
| { | { | ||||||
|   oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN,        // fulltext parser type |   oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN,             // fulltext parser type | ||||||
|   "space",                                                  // name |   "space",                                                       // name | ||||||
|   OB_PLUGIN_AUTHOR_OCEANBASE,                               // author |   OB_PLUGIN_AUTHOR_OCEANBASE,                                    // author | ||||||
|   "This is a default whitespace parser plugin.",            // brief specification |   "This is a default whitespace parser plugin.",                 // brief specification | ||||||
|   0x00001,                                                  // version |   0x00001,                                                       // version | ||||||
|   oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license |   oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license | ||||||
|   &oceanbase::storage::whitespace_parser,                   // default space parser plugin instance |   &oceanbase::storage::whitespace_parser,                        // default space parser plugin instance | ||||||
| }; | }; | ||||||
|  |  | ||||||
| OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInWhitespaceFTParser, whitespace_parser); | OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInWhitespaceFTParser, whitespace_parser); | ||||||
| @ -36,28 +36,28 @@ OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInWhitespaceFTParser, whitespace_parser | |||||||
|  |  | ||||||
| OB_DECLARE_PLUGIN(ngram_parser) | OB_DECLARE_PLUGIN(ngram_parser) | ||||||
| { | { | ||||||
|   oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN,        // fulltext parser type |   oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN,             // fulltext parser type | ||||||
|   "ngram",                                                  // name |   "ngram",                                                       // name | ||||||
|   OB_PLUGIN_AUTHOR_OCEANBASE,                               // author |   OB_PLUGIN_AUTHOR_OCEANBASE,                                    // author | ||||||
|   "This is a ngram fulltext parser plugin.",                // brief specification |   "This is a ngram fulltext parser plugin.",                     // brief specification | ||||||
|   0x00001,                                                  // version |   0x00001,                                                       // version | ||||||
|   oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license |   oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license | ||||||
|   &oceanbase::storage::ngram_parser,                        // ngram parser plugin instance |   &oceanbase::storage::ngram_parser,                             // ngram parser plugin instance | ||||||
| }; | }; | ||||||
|  |  | ||||||
| OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInNgramFTParser, ngram_parser); | OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInNgramFTParser, ngram_parser); | ||||||
|  |  | ||||||
| ///////////////////////////////////// Default fulltext parser ////////////////////////////////////////// | ///////////////////////////////////// BEng fulltext parser ////////////////////////////////////////// | ||||||
|  |  | ||||||
| OB_DECLARE_PLUGIN(beng_parser) | OB_DECLARE_PLUGIN(beng_parser) | ||||||
| { | { | ||||||
|   oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN,        // fulltext parser type |   oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN,             // fulltext parser type | ||||||
|   "beng",                                                   // name |   "beng",                                                        // name | ||||||
|   OB_PLUGIN_AUTHOR_OCEANBASE,                               // author |   OB_PLUGIN_AUTHOR_OCEANBASE,                                    // author | ||||||
|   "This is a basic english parser plugin.",                 // brief specification |   "This is a basic english parser plugin.",                      // brief specification | ||||||
|   0x00001,                                                  // version |   0x00001,                                                       // version | ||||||
|   oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license |   oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license | ||||||
|   &oceanbase::storage::beng_parser,                         // default space parser plugin instance |   &oceanbase::storage::beng_parser,                              // basic english parser plugin instance | ||||||
| }; | }; | ||||||
|  |  | ||||||
| OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInBEngFTParser, beng_parser); | OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInBEngFTParser, beng_parser); | ||||||
|  | |||||||
| @ -119,7 +119,7 @@ int ObFTParseHelper::segment( | |||||||
|     const char *ft, |     const char *ft, | ||||||
|     const int64_t ft_len, |     const int64_t ft_len, | ||||||
|     common::ObIAllocator &allocator, |     common::ObIAllocator &allocator, | ||||||
|     lib::ObFTParserParam::ObIAddWord &add_word) |     ObAddWord &add_word) | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   if (OB_UNLIKELY(parser_version < 0 || nullptr == parser_desc || nullptr == cs || nullptr == ft || 0 >= ft_len)) { |   if (OB_UNLIKELY(parser_version < 0 || nullptr == parser_desc || nullptr == cs || nullptr == ft || 0 >= ft_len)) { | ||||||
| @ -127,14 +127,38 @@ int ObFTParseHelper::segment( | |||||||
|     LOG_WARN("invalid arguments", K(ret), K(parser_version), KP(parser_desc), KP(cs), K(ft), K(ft_len)); |     LOG_WARN("invalid arguments", K(ret), K(parser_version), KP(parser_desc), KP(cs), K(ft), K(ft_len)); | ||||||
|   } else { |   } else { | ||||||
|     lib::ObFTParserParam param; |     lib::ObFTParserParam param; | ||||||
|  |     lib::ObITokenIterator *iter = nullptr; | ||||||
|     param.allocator_ = &allocator; |     param.allocator_ = &allocator; | ||||||
|     param.add_word_ = &add_word; |  | ||||||
|     param.cs_ = cs; |     param.cs_ = cs; | ||||||
|     param.fulltext_ = ft; |     param.fulltext_ = ft; | ||||||
|     param.ft_length_ = ft_len; |     param.ft_length_ = ft_len; | ||||||
|     param.parser_version_ = parser_version; |     param.parser_version_ = parser_version; | ||||||
|     if (OB_FAIL(parser_desc->segment(¶m))) { |     if (OB_FAIL(parser_desc->segment(¶m, iter))) { | ||||||
|       LOG_WARN("fail to segment", K(ret), K(param)); |       LOG_WARN("fail to segment", K(ret), K(param)); | ||||||
|  |     } else if (OB_ISNULL(iter)) { | ||||||
|  |       ret = OB_ERR_UNEXPECTED; | ||||||
|  |       LOG_WARN("unexpected error, token iterator is nullptr", K(ret), KP(iter)); | ||||||
|  |     } else { | ||||||
|  |       const char *word = nullptr; | ||||||
|  |       int64_t word_len = 0; | ||||||
|  |       int64_t char_cnt = 0; | ||||||
|  |       int64_t word_freq = 0; | ||||||
|  |       while (OB_SUCC(ret)) { | ||||||
|  |         if (OB_FAIL(iter->get_next_token(word, word_len, char_cnt, word_freq))) { | ||||||
|  |           if (OB_ITER_END != ret) { | ||||||
|  |             LOG_WARN("fail to get next token", K(ret), KPC(iter)); | ||||||
|  |           } | ||||||
|  |         } else if (OB_FAIL(add_word.process_word(word, word_len, char_cnt, word_freq))) { | ||||||
|  |           LOG_WARN("fail to process one word", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq)); | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |       if (OB_ITER_END == ret) { | ||||||
|  |         ret = OB_SUCCESS; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if (OB_NOT_NULL(iter)) { | ||||||
|  |       parser_desc->free_token_iter(¶m, iter); | ||||||
|  |       iter = nullptr; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
| @ -176,11 +200,10 @@ int ObFTParseHelper::init( | |||||||
|     LOG_WARN("unexpected error, parse handler is nullptr", K(ret), KP(parse_handler)); |     LOG_WARN("unexpected error, parse handler is nullptr", K(ret), KP(parse_handler)); | ||||||
|   } else if (OB_FAIL(get_fulltext_parser_desc(*parse_handler, parser_desc_))) { |   } else if (OB_FAIL(get_fulltext_parser_desc(*parse_handler, parser_desc_))) { | ||||||
|     LOG_WARN("fail to get fulltext parser descriptor", K(ret), KPC(parse_handler)); |     LOG_WARN("fail to get fulltext parser descriptor", K(ret), KPC(parse_handler)); | ||||||
|  |   } else if (OB_FAIL(set_add_word_flag(parser_name_))) { | ||||||
|  |     LOG_WARN("fail to set add word flag", K(ret), K(parser_name_)); | ||||||
|   } else { |   } else { | ||||||
|     plugin_param_.desc_ = parser_desc_; |     plugin_param_.desc_ = parser_desc_; | ||||||
|     if (need_min_max_word(parser_name_))  { add_word_flag_.set_min_max_word(); } |  | ||||||
|     if (need_castdn(parser_name_))        { add_word_flag_.set_casedown();     } |  | ||||||
|     if (need_stopword_list(parser_name_)) { add_word_flag_.set_stop_word();    } |  | ||||||
|     allocator_ = allocator; |     allocator_ = allocator; | ||||||
|     is_inited_ = true; |     is_inited_ = true; | ||||||
|   } |   } | ||||||
| @ -204,7 +227,7 @@ int ObFTParseHelper::segment( | |||||||
|     const char *fulltext, |     const char *fulltext, | ||||||
|     const int64_t fulltext_len, |     const int64_t fulltext_len, | ||||||
|     int64_t &doc_length, |     int64_t &doc_length, | ||||||
|     common::ObIArray<ObFTWord> &words) const |     ObFTWordMap &words) const | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   const ObCharsetInfo *cs = nullptr; |   const ObCharsetInfo *cs = nullptr; | ||||||
| @ -231,29 +254,34 @@ int ObFTParseHelper::segment( | |||||||
|       doc_length = add_word.get_add_word_count(); |       doc_length = add_word.get_add_word_count(); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   LOG_DEBUG("ft parse segment", K(ret), K(type), K(ObString(fulltext_len, fulltext)), K(words)); |   LOG_DEBUG("ft parse segment", K(ret), K(type), K(add_word_flag_), K(parser_name_), | ||||||
|  |       K(ObString(fulltext_len, fulltext)), K(words.size())); | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
| bool ObFTParseHelper::need_stopword_list(const ObFTParser &parser) | int ObFTParseHelper::set_add_word_flag(const ObFTParser &parser) | ||||||
| { | { | ||||||
|   share::ObPluginName space("space"); |   int ret = OB_SUCCESS; | ||||||
|   share::ObPluginName beng("beng"); |   if (OB_UNLIKELY(!parser.is_valid())) { | ||||||
|   return parser.get_parser_name() == space || parser.get_parser_name() == beng; |     ret = OB_INVALID_ARGUMENT; | ||||||
| } |     LOG_WARN("invalid arguments", K(ret), K(parser)); | ||||||
|  |   } else if (share::ObPluginName("space") == parser.get_parser_name()) { | ||||||
| bool ObFTParseHelper::need_min_max_word(const ObFTParser &parser) |     add_word_flag_.set_min_max_word(); | ||||||
| { |     add_word_flag_.set_stop_word(); | ||||||
|   share::ObPluginName space("space"); |     add_word_flag_.set_casedown(); | ||||||
|   share::ObPluginName beng("beng"); |     add_word_flag_.set_groupby_word(); | ||||||
|   return parser.get_parser_name() == space || parser.get_parser_name() == beng; |   } else if (share::ObPluginName("beng") == parser.get_parser_name()) { | ||||||
| } |     add_word_flag_.set_min_max_word(); | ||||||
|  |     add_word_flag_.set_stop_word(); | ||||||
| bool ObFTParseHelper::need_castdn(const ObFTParser &parser) |     add_word_flag_.set_groupby_word(); | ||||||
| { |   } else if (share::ObPluginName("ngram") == parser.get_parser_name()) { | ||||||
|   share::ObPluginName space("space"); |     add_word_flag_.set_casedown(); | ||||||
|   share::ObPluginName ngram("ngram"); |     add_word_flag_.set_groupby_word(); | ||||||
|   return parser.get_parser_name() == space || parser.get_parser_name() == ngram; |   } else { | ||||||
|  |     ret = OB_NOT_SUPPORTED; | ||||||
|  |     LOG_WARN("unsupported parser for fulltext search", K(ret), K(parser)); | ||||||
|  |   } | ||||||
|  |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
| } // end namespace storage | } // end namespace storage | ||||||
|  | |||||||
| @ -25,6 +25,8 @@ namespace oceanbase | |||||||
| namespace storage | namespace storage | ||||||
| { | { | ||||||
|  |  | ||||||
|  | class ObAddWord; | ||||||
|  |  | ||||||
| class ObFTParser final | class ObFTParser final | ||||||
| { | { | ||||||
| public: | public: | ||||||
| @ -89,7 +91,7 @@ public: | |||||||
|       const char *fulltext, |       const char *fulltext, | ||||||
|       const int64_t fulltext_len, |       const int64_t fulltext_len, | ||||||
|       int64_t &doc_length, |       int64_t &doc_length, | ||||||
|       common::ObIArray<ObFTWord> &words) const; |       ObFTWordMap &words) const; | ||||||
|   const ObFTParser &get_parser_name() const { return parser_name_; } |   const ObFTParser &get_parser_name() const { return parser_name_; } | ||||||
|   void reset(); |   void reset(); | ||||||
|  |  | ||||||
| @ -105,17 +107,8 @@ private: | |||||||
|       const char *fulltext, |       const char *fulltext, | ||||||
|       const int64_t fulltext_len, |       const int64_t fulltext_len, | ||||||
|       common::ObIAllocator &allocator, |       common::ObIAllocator &allocator, | ||||||
|       lib::ObFTParserParam::ObIAddWord &add_word); |       ObAddWord &add_word); | ||||||
|   static bool need_stopword_list(const ObFTParser &parser); |   int set_add_word_flag(const ObFTParser &parser); | ||||||
|   static bool need_castdn(const ObFTParser &parser); |  | ||||||
|   static bool need_min_max_word(const ObFTParser &parser); |  | ||||||
|  |  | ||||||
|   int alloc_add_word( |  | ||||||
|       const ObCollationType &type, |  | ||||||
|       common::ObIArray<ObFTWord> &words, |  | ||||||
|       lib::ObFTParserParam::ObIAddWord *&add_word) const; |  | ||||||
|   void free_add_word(lib::ObFTParserParam::ObIAddWord *&add_word) const; |  | ||||||
|  |  | ||||||
| private: | private: | ||||||
|   lib::ObPluginParam plugin_param_; |   lib::ObPluginParam plugin_param_; | ||||||
|   common::ObIAllocator *allocator_; |   common::ObIAllocator *allocator_; | ||||||
|  | |||||||
| @ -24,10 +24,10 @@ ObAddWord::ObAddWord( | |||||||
|     const ObCollationType &type, |     const ObCollationType &type, | ||||||
|     const ObAddWordFlag &flag, |     const ObAddWordFlag &flag, | ||||||
|     common::ObIAllocator &allocator, |     common::ObIAllocator &allocator, | ||||||
|     common::ObIArray<ObFTWord> &word) |     ObFTWordMap &word_map) | ||||||
|   : collation_type_(type), |   : collation_type_(type), | ||||||
|     allocator_(allocator), |     allocator_(allocator), | ||||||
|     words_(word), |     word_map_(word_map), | ||||||
|     min_max_word_cnt_(0), |     min_max_word_cnt_(0), | ||||||
|     non_stopword_cnt_(0), |     non_stopword_cnt_(0), | ||||||
|     stopword_cnt_(0), |     stopword_cnt_(0), | ||||||
| @ -35,19 +35,19 @@ ObAddWord::ObAddWord( | |||||||
| { | { | ||||||
| } | } | ||||||
|  |  | ||||||
| int ObAddWord::operator()( | int ObAddWord::process_word( | ||||||
|     lib::ObFTParserParam *param, |  | ||||||
|     const char *word, |     const char *word, | ||||||
|     const int64_t word_len, |     const int64_t word_len, | ||||||
|     const int64_t char_cnt) |     const int64_t char_cnt, | ||||||
|  |     const int64_t word_freq) | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   bool is_stopword = false; |   bool is_stopword = false; | ||||||
|   ObFTWord src_word(word_len, word, collation_type_); |   ObFTWord src_word(word_len, word, collation_type_); | ||||||
|   ObFTWord dst_word; |   ObFTWord dst_word; | ||||||
|   if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) { |   if (OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt || 0 >= word_freq)) { | ||||||
|     ret = OB_INVALID_ARGUMENT; |     ret = OB_INVALID_ARGUMENT; | ||||||
|     LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len)); |     LOG_WARN("invalid arguments", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq)); | ||||||
|   } else if (is_min_max_word(char_cnt)) { |   } else if (is_min_max_word(char_cnt)) { | ||||||
|     ++min_max_word_cnt_; |     ++min_max_word_cnt_; | ||||||
|     LOG_DEBUG("skip too small or large word", K(ret), K(src_word), K(char_cnt)); |     LOG_DEBUG("skip too small or large word", K(ret), K(src_word), K(char_cnt)); | ||||||
| @ -58,11 +58,11 @@ int ObAddWord::operator()( | |||||||
|   } else if (OB_UNLIKELY(is_stopword)) { |   } else if (OB_UNLIKELY(is_stopword)) { | ||||||
|     ++stopword_cnt_; |     ++stopword_cnt_; | ||||||
|     LOG_DEBUG("skip stopword", K(ret), K(dst_word)); |     LOG_DEBUG("skip stopword", K(ret), K(dst_word)); | ||||||
|   } else if (OB_FAIL(words_.push_back(dst_word))) { |   } else if (OB_FAIL(groupby_word(dst_word, word_freq))) { | ||||||
|     LOG_WARN("fail to push word into words array", K(ret), K(dst_word)); |     LOG_WARN("fail to groupby word into word map", K(ret), K(dst_word), K(word_freq)); | ||||||
|   } else { |   } else { | ||||||
|     ++non_stopword_cnt_; |     non_stopword_cnt_ += word_freq; | ||||||
|     LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len), K(char_cnt), K(src_word), K(dst_word)); |     LOG_DEBUG("add word", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq), K(src_word), K(dst_word)); | ||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
| @ -104,5 +104,31 @@ int ObAddWord::check_stopword(const ObFTWord &ft_word, bool &is_stopword) | |||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | int ObAddWord::groupby_word(const ObFTWord &word, const int64_t word_freq) | ||||||
|  | { | ||||||
|  |   int ret = OB_SUCCESS; | ||||||
|  |   int64_t word_count = 0; | ||||||
|  |   if (OB_UNLIKELY(word.empty() || word_freq <= 0)) { | ||||||
|  |     ret = OB_INVALID_ARGUMENT; | ||||||
|  |     LOG_WARN("invalid arguments", K(ret), K(word), K(word_freq)); | ||||||
|  |   } else if (!flag_.groupby_word()) { | ||||||
|  |     if (OB_FAIL(word_map_.set_refactored(word, 1/*word count*/))) { | ||||||
|  |       LOG_WARN("fail to set fulltext word and count", K(ret), K(word)); | ||||||
|  |     } | ||||||
|  |   } else if (OB_FAIL(word_map_.get_refactored(word, word_count)) && OB_HASH_NOT_EXIST != ret) { | ||||||
|  |     LOG_WARN("fail to get fulltext word", K(ret), K(word)); | ||||||
|  |   } else { | ||||||
|  |     if (OB_HASH_NOT_EXIST == ret) { | ||||||
|  |       word_count = 1; | ||||||
|  |     } else { | ||||||
|  |       word_count += word_freq; | ||||||
|  |     } | ||||||
|  |     if (OB_FAIL(word_map_.set_refactored(word, word_count, 1/*overwrite*/))) { | ||||||
|  |       LOG_WARN("fail to set fulltext word and count", K(ret), K(word), K(word_count)); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   return ret; | ||||||
|  | } | ||||||
|  |  | ||||||
| } // end namespace storage | } // end namespace storage | ||||||
| } // end namespace oceanbase | } // end namespace oceanbase | ||||||
|  | |||||||
| @ -63,23 +63,23 @@ static const char ob_stop_word_list[][FTS_STOP_WORD_MAX_LENGTH] = { | |||||||
|   "www" |   "www" | ||||||
| }; | }; | ||||||
|  |  | ||||||
| class ObAddWord final : public lib::ObFTParserParam::ObIAddWord | class ObAddWord final | ||||||
| { | { | ||||||
| public: | public: | ||||||
|   ObAddWord( |   ObAddWord( | ||||||
|       const ObCollationType &type, |       const ObCollationType &type, | ||||||
|       const ObAddWordFlag &flag, |       const ObAddWordFlag &flag, | ||||||
|       common::ObIAllocator &allocator, |       common::ObIAllocator &allocator, | ||||||
|       common::ObIArray<ObFTWord> &word); |       ObFTWordMap &word_map); | ||||||
|   virtual ~ObAddWord() = default; |   ~ObAddWord() = default; | ||||||
|   virtual int operator()( |   int process_word( | ||||||
|       lib::ObFTParserParam *param, |  | ||||||
|       const char *word, |       const char *word, | ||||||
|       const int64_t word_len, |       const int64_t word_len, | ||||||
|       const int64_t char_cnt) override; |       const int64_t char_cnt, | ||||||
|   virtual int64_t get_add_word_count() const override { return non_stopword_cnt_; } |       const int64_t word_freq); | ||||||
|  |   virtual int64_t get_add_word_count() const { return non_stopword_cnt_; } | ||||||
|   VIRTUAL_TO_STRING_KV(K_(collation_type), K_(min_max_word_cnt), K_(non_stopword_cnt), K_(stopword_cnt), |   VIRTUAL_TO_STRING_KV(K_(collation_type), K_(min_max_word_cnt), K_(non_stopword_cnt), K_(stopword_cnt), | ||||||
|       K_(words)); |       K(word_map_.size())); | ||||||
| public: | public: | ||||||
|   static const int64_t FT_MIN_WORD_LEN = 3; |   static const int64_t FT_MIN_WORD_LEN = 3; | ||||||
|   static const int64_t FT_MAX_WORD_LEN = 84; |   static const int64_t FT_MAX_WORD_LEN = 84; | ||||||
| @ -87,10 +87,11 @@ private: | |||||||
|   bool is_min_max_word(const int64_t c_len) const; |   bool is_min_max_word(const int64_t c_len) const; | ||||||
|   int casedown_word(const ObFTWord &src, ObFTWord &dst); |   int casedown_word(const ObFTWord &src, ObFTWord &dst); | ||||||
|   int check_stopword(const ObFTWord &word, bool &is_stopword); |   int check_stopword(const ObFTWord &word, bool &is_stopword); | ||||||
|  |   int groupby_word(const ObFTWord &word, const int64_t word_cnt); | ||||||
| private: | private: | ||||||
|   ObCollationType collation_type_; |   ObCollationType collation_type_; | ||||||
|   common::ObIAllocator &allocator_; |   common::ObIAllocator &allocator_; | ||||||
|   common::ObIArray<ObFTWord> &words_; |   ObFTWordMap &word_map_; | ||||||
|   int64_t min_max_word_cnt_; |   int64_t min_max_word_cnt_; | ||||||
|   int64_t non_stopword_cnt_; |   int64_t non_stopword_cnt_; | ||||||
|   int64_t stopword_cnt_; |   int64_t stopword_cnt_; | ||||||
|  | |||||||
| @ -14,6 +14,7 @@ | |||||||
| #define OB_FTS_STRUCT_H_ | #define OB_FTS_STRUCT_H_ | ||||||
|  |  | ||||||
| #include "lib/charset/ob_charset.h" | #include "lib/charset/ob_charset.h" | ||||||
|  | #include "lib/hash/ob_hashmap.h" | ||||||
|  |  | ||||||
| namespace oceanbase | namespace oceanbase | ||||||
| { | { | ||||||
| @ -34,7 +35,7 @@ public: | |||||||
|     hash_val = ObCharset::hash(type_, word_); |     hash_val = ObCharset::hash(type_, word_); | ||||||
|     return common::OB_SUCCESS; |     return common::OB_SUCCESS; | ||||||
|   } |   } | ||||||
|   OB_INLINE uint64_t hash() const { return word_.hash(); } |   OB_INLINE uint64_t hash() const { return ObCharset::hash(type_, word_); } | ||||||
|   OB_INLINE bool empty() const { return word_.empty(); } |   OB_INLINE bool empty() const { return word_.empty(); } | ||||||
|  |  | ||||||
|   OB_INLINE bool operator ==(const ObFTWord &other) const |   OB_INLINE bool operator ==(const ObFTWord &other) const | ||||||
| @ -76,6 +77,8 @@ public: | |||||||
|   int64_t word_cnt_; |   int64_t word_cnt_; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | typedef common::hash::ObHashMap<ObFTWord, int64_t> ObFTWordMap; | ||||||
|  |  | ||||||
| class ObAddWordFlag final | class ObAddWordFlag final | ||||||
| { | { | ||||||
| private: | private: | ||||||
| @ -84,6 +87,7 @@ private: | |||||||
|                                                    // than a maximum word length. |                                                    // than a maximum word length. | ||||||
|   static const uint64_t AWF_STOPWORD     = 1 << 1; // filter by stop word table. |   static const uint64_t AWF_STOPWORD     = 1 << 1; // filter by stop word table. | ||||||
|   static const uint64_t AWF_CASEDOWN     = 1 << 2; // convert characters from uppercase to lowercase. |   static const uint64_t AWF_CASEDOWN     = 1 << 2; // convert characters from uppercase to lowercase. | ||||||
|  |   static const uint64_t AWF_GROUPBY_WORD = 1 << 3; // distinct and word aggregation | ||||||
| public: | public: | ||||||
|   ObAddWordFlag() : flag_(AWF_NONE) {} |   ObAddWordFlag() : flag_(AWF_NONE) {} | ||||||
|   ~ObAddWordFlag() = default; |   ~ObAddWordFlag() = default; | ||||||
| @ -95,13 +99,17 @@ public: | |||||||
|   void set_min_max_word() { set_flag(AWF_MIN_MAX_WORD); } |   void set_min_max_word() { set_flag(AWF_MIN_MAX_WORD); } | ||||||
|   void set_stop_word() { set_flag(AWF_STOPWORD); } |   void set_stop_word() { set_flag(AWF_STOPWORD); } | ||||||
|   void set_casedown() { set_flag(AWF_CASEDOWN); } |   void set_casedown() { set_flag(AWF_CASEDOWN); } | ||||||
|  |   void set_groupby_word() { set_flag(AWF_GROUPBY_WORD); } | ||||||
|   void clear() { flag_ = AWF_NONE; } |   void clear() { flag_ = AWF_NONE; } | ||||||
|   void clear_min_max_word() { clear_flag(AWF_MIN_MAX_WORD); } |   void clear_min_max_word() { clear_flag(AWF_MIN_MAX_WORD); } | ||||||
|   void clear_stop_word() { clear_flag(AWF_STOPWORD); } |   void clear_stop_word() { clear_flag(AWF_STOPWORD); } | ||||||
|   void clear_casedown() { clear_flag(AWF_CASEDOWN); } |   void clear_casedown() { clear_flag(AWF_CASEDOWN); } | ||||||
|  |   void clear_groupby_word() { clear_flag(AWF_GROUPBY_WORD); } | ||||||
|   bool min_max_word() const { return has_flag(AWF_MIN_MAX_WORD); } |   bool min_max_word() const { return has_flag(AWF_MIN_MAX_WORD); } | ||||||
|   bool stopword() const { return has_flag(AWF_STOPWORD); } |   bool stopword() const { return has_flag(AWF_STOPWORD); } | ||||||
|   bool casedown() const { return has_flag(AWF_CASEDOWN); } |   bool casedown() const { return has_flag(AWF_CASEDOWN); } | ||||||
|  |   bool groupby_word() const { return has_flag(AWF_GROUPBY_WORD); } | ||||||
|  |   TO_STRING_KV(K_(flag)); | ||||||
| private: | private: | ||||||
|   uint64_t flag_; |   uint64_t flag_; | ||||||
| }; | }; | ||||||
|  | |||||||
| @ -24,24 +24,81 @@ namespace storage | |||||||
|  |  | ||||||
| #define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_') | #define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_') | ||||||
|  |  | ||||||
| /*static*/ int ObNgramFTParser::segment( |  | ||||||
|   lib::ObFTParserParam *param, | ObNgramFTParser::ObNgramFTParser() | ||||||
|   const char *fulltext, |   : cs_(nullptr), | ||||||
|   const int64_t ft_len) |     start_(nullptr), | ||||||
|  |     next_(nullptr), | ||||||
|  |     end_(nullptr), | ||||||
|  |     c_nums_(0), | ||||||
|  |     is_inited_(false) | ||||||
|  | {} | ||||||
|  |  | ||||||
|  | ObNgramFTParser::~ObNgramFTParser() | ||||||
|  | { | ||||||
|  |   reset(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void ObNgramFTParser::reset() | ||||||
|  | { | ||||||
|  |   cs_ = nullptr; | ||||||
|  |   start_ = nullptr; | ||||||
|  |   next_ = nullptr; | ||||||
|  |   end_ = nullptr; | ||||||
|  |   c_nums_ = 0; | ||||||
|  |   is_inited_ = false; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | int ObNgramFTParser::init(lib::ObFTParserParam *param) | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   int64_t c_nums = 0; |   if (OB_UNLIKELY(is_inited_)) { | ||||||
|   const char *start = fulltext; |     ret = OB_INIT_TWICE; | ||||||
|   const char *next = start; |     LOG_WARN("init twice", K(ret), KPC(param), KPC(this)); | ||||||
|   const char *end = start + ft_len; |   } else if (OB_ISNULL(param) | ||||||
|   if (OB_ISNULL(param) || OB_ISNULL(fulltext) || OB_UNLIKELY(ft_len <= 0)) { |       || OB_ISNULL(param->cs_) | ||||||
|  |       || OB_ISNULL(param->fulltext_) | ||||||
|  |       || OB_UNLIKELY(0 >= param->ft_length_)) { | ||||||
|     ret = OB_INVALID_ARGUMENT; |     ret = OB_INVALID_ARGUMENT; | ||||||
|     LOG_WARN("invalid arguments", K(ret), KP(param), KP(fulltext), K(ft_len)); |     LOG_WARN("invalid arguments", K(ret), KPC(param)); | ||||||
|   } else { |   } else { | ||||||
|     const ObCharsetInfo *cs = param->cs_; |     cs_ = param->cs_; | ||||||
|     while (OB_SUCC(ret) && next < end) { |     start_ = param->fulltext_; | ||||||
|  |     next_ = start_; | ||||||
|  |     end_ = start_ + param->ft_length_; | ||||||
|  |     c_nums_ = 0; | ||||||
|  |     is_inited_ = true; | ||||||
|  |   } | ||||||
|  |   if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) { | ||||||
|  |     reset(); | ||||||
|  |   } | ||||||
|  |   return ret; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | int ObNgramFTParser::get_next_token( | ||||||
|  |     const char *&word, | ||||||
|  |     int64_t &word_len, | ||||||
|  |     int64_t &char_len, | ||||||
|  |     int64_t &word_freq) | ||||||
|  | { | ||||||
|  |   int ret = OB_SUCCESS; | ||||||
|  |   word = nullptr; | ||||||
|  |   word_len = 0; | ||||||
|  |   char_len = 0; | ||||||
|  |   word_freq = 0; | ||||||
|  |   if (OB_UNLIKELY(!is_inited_)) { | ||||||
|  |     ret = OB_NOT_INIT; | ||||||
|  |     LOG_WARN("ngram ft parser isn't initialized", K(ret), K(is_inited_)); | ||||||
|  |   } else { | ||||||
|  |     int64_t c_nums = c_nums_; | ||||||
|  |     const char *start = start_; | ||||||
|  |     const char *next = next_; | ||||||
|  |     const char *end = end_; | ||||||
|  |     const ObCharsetInfo *cs = cs_; | ||||||
|  |     do { | ||||||
|       const int64_t c_len = ob_mbcharlen_ptr(cs, next, end); |       const int64_t c_len = ob_mbcharlen_ptr(cs, next, end); | ||||||
|       if (next + c_len > end || 0 == c_len) { // if char is invalid, just skip the rest of doc. |       if (next + c_len > end || 0 == c_len) { // if char is invalid, just skip the rest of doc. | ||||||
|  |         ret = OB_ITER_END; | ||||||
|         break; |         break; | ||||||
|       } else { |       } else { | ||||||
|         int ctype; |         int ctype; | ||||||
| @ -50,38 +107,31 @@ namespace storage | |||||||
|           start = next + 1; |           start = next + 1; | ||||||
|           next = start; |           next = start; | ||||||
|           c_nums = 0; |           c_nums = 0; | ||||||
|  |           if (next == end) { | ||||||
|  |             ret = OB_ITER_END; | ||||||
|  |           } | ||||||
|           continue; |           continue; | ||||||
|         } |         } | ||||||
|         next += c_len; |         next += c_len; | ||||||
|         ++c_nums; |         ++c_nums; | ||||||
|       } |       } | ||||||
|       if (NGRAM_TOKEN_SIZE == c_nums) { |       if (NGRAM_TOKEN_SIZE == c_nums) { | ||||||
|         if (OB_FAIL(add_word(param, start, next - start, c_nums))) { |         word = start; | ||||||
|           LOG_WARN("fail to add word", K(ret), KP(param), KP(start), KP(next), K(c_nums)); |         word_len = next - start; | ||||||
|         } else { |         char_len = c_nums; | ||||||
|           start += ob_mbcharlen_ptr(cs, start, end); |         word_freq = 1; | ||||||
|           c_nums = NGRAM_TOKEN_SIZE - 1; |         start += ob_mbcharlen_ptr(cs, start, end); | ||||||
|         } |         c_nums = NGRAM_TOKEN_SIZE - 1; | ||||||
|  |         break; | ||||||
|       } |       } | ||||||
|  |     } while (OB_SUCC(ret) && next < end); | ||||||
|  |     if (OB_ITER_END == ret || OB_SUCCESS == ret) { | ||||||
|  |       start_ = start; | ||||||
|  |       next_ = next; | ||||||
|  |       end_ = end; | ||||||
|  |       c_nums_ = c_nums; | ||||||
|     } |     } | ||||||
|   } |     LOG_DEBUG("next word", K(ret), K(ObString(word_len, word)), KP(start_), KP(next_), KP(end_)); | ||||||
|   return ret; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /*static*/ int ObNgramFTParser::add_word( |  | ||||||
|     lib::ObFTParserParam *param, |  | ||||||
|     const char *word, |  | ||||||
|     const int64_t word_len, |  | ||||||
|     const int64_t char_cnt) |  | ||||||
| { |  | ||||||
|   int ret = OB_SUCCESS; |  | ||||||
|   if (OB_ISNULL(param) |  | ||||||
|       || OB_ISNULL(word) |  | ||||||
|       || OB_UNLIKELY(0 >= word_len)) { |  | ||||||
|     ret = OB_INVALID_ARGUMENT; |  | ||||||
|     LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len)); |  | ||||||
|   } else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) { |  | ||||||
|     LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word))); |  | ||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
| @ -103,21 +153,43 @@ int ObNgramFTParserDesc::deinit(lib::ObPluginParam *param) | |||||||
|   return OB_SUCCESS; |   return OB_SUCCESS; | ||||||
| } | } | ||||||
|  |  | ||||||
| int ObNgramFTParserDesc::segment(lib::ObFTParserParam *param) const | int ObNgramFTParserDesc::segment( | ||||||
|  |     lib::ObFTParserParam *param, | ||||||
|  |     lib::ObITokenIterator *&iter) const | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|  |   void *buf = nullptr; | ||||||
|   if (OB_UNLIKELY(!is_inited_)) { |   if (OB_UNLIKELY(!is_inited_)) { | ||||||
|     ret = OB_NOT_INIT; |     ret = OB_NOT_INIT; | ||||||
|     LOG_WARN("ngram ft parser desc hasn't be initialized", K(ret), K(is_inited_)); |     LOG_WARN("ngram ft parser desc hasn't be initialized", K(ret), K(is_inited_)); | ||||||
|   } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) { |   } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) { | ||||||
|     ret = OB_INVALID_ARGUMENT; |     ret = OB_INVALID_ARGUMENT; | ||||||
|     LOG_WARN("invalid argument", K(ret), KPC(param)); |     LOG_WARN("invalid argument", K(ret), KPC(param)); | ||||||
|   } else if (OB_FAIL(ObNgramFTParser::segment(param, param->fulltext_, param->ft_length_))) { |   } else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObNgramFTParser)))) { | ||||||
|     LOG_WARN("fail to segment words for fulltext by ngram", K(ret), KPC(param), |     ret = OB_ALLOCATE_MEMORY_FAILED; | ||||||
|         K(param->fulltext_), K(param->ft_length_)); |     LOG_WARN("fail to allocate ngram ft parser", K(ret)); | ||||||
|  |   } else { | ||||||
|  |     ObNgramFTParser *parser = new (buf) ObNgramFTParser(); | ||||||
|  |     if (OB_FAIL(parser->init(param))) { | ||||||
|  |       LOG_WARN("fail to init ngram fulltext parser", K(ret), KPC(param)); | ||||||
|  |     } else { | ||||||
|  |       iter = parser; | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | void ObNgramFTParserDesc::free_token_iter( | ||||||
|  |     lib::ObFTParserParam *param, | ||||||
|  |     lib::ObITokenIterator *&iter) const | ||||||
|  | { | ||||||
|  |   if (OB_NOT_NULL(iter)) { | ||||||
|  |     abort_unless(nullptr != param); | ||||||
|  |     abort_unless(nullptr != param->allocator_); | ||||||
|  |     iter->~ObITokenIterator(); | ||||||
|  |     param->allocator_->free(iter); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| } // end namespace storage | } // end namespace storage | ||||||
| } // end namespace oceanbase | } // end namespace oceanbase | ||||||
|  | |||||||
| @ -22,23 +22,30 @@ namespace oceanbase | |||||||
| namespace storage | namespace storage | ||||||
| { | { | ||||||
|  |  | ||||||
| class ObNgramFTParser final | class ObNgramFTParser final : public lib::ObITokenIterator | ||||||
| { | { | ||||||
| public: | public: | ||||||
|   static const int64_t NGRAM_TOKEN_SIZE = 2; // TODO: @jinzhu, please apply one system variable later, and keep the same as mysql. |   static const int64_t NGRAM_TOKEN_SIZE = 2; // TODO: @jinzhu, please apply one system variable later, and keep the same as mysql. | ||||||
| public: | public: | ||||||
|   ObNgramFTParser() = default; |   ObNgramFTParser(); | ||||||
|   ~ObNgramFTParser() = default; |   virtual ~ObNgramFTParser(); | ||||||
|   static int segment( |  | ||||||
|       lib::ObFTParserParam *param, |   int init(lib::ObFTParserParam *param); | ||||||
|       const char *fulltext, |   void reset(); | ||||||
|       const int64_t ft_len); |   virtual int get_next_token( | ||||||
|  |       const char *&word, | ||||||
|  |       int64_t &word_len, | ||||||
|  |       int64_t &char_len, | ||||||
|  |       int64_t &word_freq) override; | ||||||
|  |  | ||||||
|  |   VIRTUAL_TO_STRING_KV(KP_(cs), KP_(start), KP_(next), KP_(end), K_(is_inited)); | ||||||
| private: | private: | ||||||
|   static int add_word( |   const ObCharsetInfo *cs_; | ||||||
|     lib::ObFTParserParam *param, |   const char *start_; | ||||||
|     const char *word, |   const char *next_; | ||||||
|     const int64_t word_len, |   const char *end_; | ||||||
|     const int64_t char_cnt); |   int64_t c_nums_; | ||||||
|  |   bool is_inited_; | ||||||
| private: | private: | ||||||
|   DISABLE_COPY_ASSIGN(ObNgramFTParser); |   DISABLE_COPY_ASSIGN(ObNgramFTParser); | ||||||
| }; | }; | ||||||
| @ -50,7 +57,8 @@ public: | |||||||
|   virtual ~ObNgramFTParserDesc() = default; |   virtual ~ObNgramFTParserDesc() = default; | ||||||
|   virtual int init(lib::ObPluginParam *param) override; |   virtual int init(lib::ObPluginParam *param) override; | ||||||
|   virtual int deinit(lib::ObPluginParam *param) override; |   virtual int deinit(lib::ObPluginParam *param) override; | ||||||
|   virtual int segment(lib::ObFTParserParam *param) const override; |   virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; | ||||||
|  |   virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; | ||||||
|   OB_INLINE void reset() { is_inited_ = false; } |   OB_INLINE void reset() { is_inited_ = false; } | ||||||
| private: | private: | ||||||
|   bool is_inited_; |   bool is_inited_; | ||||||
|  | |||||||
| @ -24,22 +24,74 @@ namespace storage | |||||||
|  |  | ||||||
| #define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_') | #define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_') | ||||||
|  |  | ||||||
| int ObSpaceFTParser::segment( | ObSpaceFTParser::ObSpaceFTParser() | ||||||
|     lib::ObFTParserParam *param, |   : cs_(nullptr), | ||||||
|     const char *ft, |     start_(nullptr), | ||||||
|     const int64_t ft_len) |     next_(nullptr), | ||||||
|  |     end_(nullptr), | ||||||
|  |     is_inited_(false) | ||||||
|  | {} | ||||||
|  |  | ||||||
|  | ObSpaceFTParser::~ObSpaceFTParser() | ||||||
|  | { | ||||||
|  |   reset(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void ObSpaceFTParser::reset() | ||||||
|  | { | ||||||
|  |   cs_ = nullptr; | ||||||
|  |   start_ = nullptr; | ||||||
|  |   next_ = nullptr; | ||||||
|  |   end_ = nullptr; | ||||||
|  |   is_inited_ = false; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | int ObSpaceFTParser::init(lib::ObFTParserParam *param) | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   const char *start = ft; |   if (OB_UNLIKELY(is_inited_)) { | ||||||
|   const char *next = start; |     ret = OB_INIT_TWICE; | ||||||
|   const char *end = start + ft_len; |     LOG_WARN("init twice", K(ret), KPC(param), KPC(this)); | ||||||
|   int mbl = 0; |   } else if (OB_ISNULL(param) | ||||||
|   if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) { |       || OB_ISNULL(param->cs_) | ||||||
|  |       || OB_ISNULL(param->fulltext_) | ||||||
|  |       || OB_UNLIKELY(0 >= param->ft_length_)) { | ||||||
|     ret = OB_INVALID_ARGUMENT; |     ret = OB_INVALID_ARGUMENT; | ||||||
|     LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len)); |     LOG_WARN("invalid arguments", K(ret), KPC(param)); | ||||||
|   } else { |   } else { | ||||||
|     const ObCharsetInfo *cs = param->cs_; |     cs_ = param->cs_; | ||||||
|     while (OB_SUCC(ret) && next < end) { |     start_ = param->fulltext_; | ||||||
|  |     next_ = start_; | ||||||
|  |     end_ = start_ + param->ft_length_; | ||||||
|  |     is_inited_ = true; | ||||||
|  |   } | ||||||
|  |   if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) { | ||||||
|  |     reset(); | ||||||
|  |   } | ||||||
|  |   return ret; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | int ObSpaceFTParser::get_next_token( | ||||||
|  |     const char *&word, | ||||||
|  |     int64_t &word_len, | ||||||
|  |     int64_t &char_len, | ||||||
|  |     int64_t &word_freq) | ||||||
|  | { | ||||||
|  |   int ret = OB_SUCCESS; | ||||||
|  |   int mbl = 0; | ||||||
|  |   word = nullptr; | ||||||
|  |   word_len = 0; | ||||||
|  |   char_len = 0; | ||||||
|  |   word_freq = 0; | ||||||
|  |   if (OB_UNLIKELY(!is_inited_)) { | ||||||
|  |     ret = OB_NOT_INIT; | ||||||
|  |     LOG_WARN("space ft parser isn't initialized", K(ret), K(is_inited_)); | ||||||
|  |   } else { | ||||||
|  |     const char *start = start_; | ||||||
|  |     const char *next = next_; | ||||||
|  |     const char *end = end_; | ||||||
|  |     const ObCharsetInfo *cs = cs_; | ||||||
|  |     do { | ||||||
|       while (next < end) { |       while (next < end) { | ||||||
|         int ctype; |         int ctype; | ||||||
|         mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end); |         mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end); | ||||||
| @ -62,34 +114,24 @@ int ObSpaceFTParser::segment( | |||||||
|           ++c_nums; |           ++c_nums; | ||||||
|           next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); |           next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); | ||||||
|         } |         } | ||||||
|         if (0 < c_nums && OB_FAIL(add_word(param, start, next - start, c_nums))) { |         if (0 < c_nums) { | ||||||
|           LOG_WARN("fail to add word", K(ret), KPC(param), KP(start), K(next)); |           word = start; | ||||||
|  |           word_len = next - start; | ||||||
|  |           char_len = c_nums; | ||||||
|  |           word_freq = 1; | ||||||
|  |           start = next; | ||||||
|  |           break; | ||||||
|         } else { |         } else { | ||||||
|           start = next; |           start = next; | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
|  |     } while (OB_SUCC(ret) && next < end); | ||||||
|  |     if (OB_ITER_END == ret || OB_SUCCESS == ret) { | ||||||
|  |       start_ = start; | ||||||
|  |       next_ = next; | ||||||
|  |       end_ = end; | ||||||
|     } |     } | ||||||
|     if (OB_ITER_END == ret) { |     LOG_DEBUG("next word", K(ObString(word_len, word)), KP(start_), KP(next_), KP(end_)); | ||||||
|       ret = OB_SUCCESS; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   return ret; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| int ObSpaceFTParser::add_word( |  | ||||||
|     lib::ObFTParserParam *param, |  | ||||||
|     const char *word, |  | ||||||
|     const int64_t word_len, |  | ||||||
|     const int64_t char_cnt) |  | ||||||
| { |  | ||||||
|   int ret = OB_SUCCESS; |  | ||||||
|   if (OB_ISNULL(param) |  | ||||||
|       || OB_ISNULL(word) |  | ||||||
|       || OB_UNLIKELY(0 >= word_len)) { |  | ||||||
|     ret = OB_INVALID_ARGUMENT; |  | ||||||
|     LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len)); |  | ||||||
|   } else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) { |  | ||||||
|     LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt),  K(ObString(word_len, word))); |  | ||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
| @ -111,21 +153,43 @@ int ObWhiteSpaceFTParserDesc::deinit(lib::ObPluginParam *param) | |||||||
|   return OB_SUCCESS; |   return OB_SUCCESS; | ||||||
| } | } | ||||||
|  |  | ||||||
| int ObWhiteSpaceFTParserDesc::segment(lib::ObFTParserParam *param) const | int ObWhiteSpaceFTParserDesc::segment( | ||||||
|  |     lib::ObFTParserParam *param, | ||||||
|  |     lib::ObITokenIterator *&iter) const | ||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|  |   void *buf = nullptr; | ||||||
|   if (OB_UNLIKELY(!is_inited_)) { |   if (OB_UNLIKELY(!is_inited_)) { | ||||||
|     ret = OB_NOT_INIT; |     ret = OB_NOT_INIT; | ||||||
|     LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_)); |     LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_)); | ||||||
|   } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) { |   } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) { | ||||||
|     ret = OB_INVALID_ARGUMENT; |     ret = OB_INVALID_ARGUMENT; | ||||||
|     LOG_WARN("invalid argument", K(ret), KPC(param)); |     LOG_WARN("invalid argument", K(ret), KPC(param)); | ||||||
|   } else if (OB_FAIL(ObSpaceFTParser::segment(param, param->fulltext_, param->ft_length_))) { |   } else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObSpaceFTParser)))) { | ||||||
|     LOG_WARN("fail to segment words for fulltext by spaces", K(ret), KPC(param), |     ret = OB_ALLOCATE_MEMORY_FAILED; | ||||||
|         K(param->fulltext_), K(param->ft_length_)); |     LOG_WARN("fail to allocate space ft parser", K(ret)); | ||||||
|  |   } else { | ||||||
|  |     ObSpaceFTParser *parser = new (buf) ObSpaceFTParser(); | ||||||
|  |     if (OB_FAIL(parser->init(param))) { | ||||||
|  |       LOG_WARN("fail to init whitespace fulltext parser", K(ret), KPC(param)); | ||||||
|  |     } else { | ||||||
|  |       iter = parser; | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | void ObWhiteSpaceFTParserDesc::free_token_iter( | ||||||
|  |     lib::ObFTParserParam *param, | ||||||
|  |     lib::ObITokenIterator *&iter) const | ||||||
|  | { | ||||||
|  |   if (OB_NOT_NULL(iter)) { | ||||||
|  |     abort_unless(nullptr != param); | ||||||
|  |     abort_unless(nullptr != param->allocator_); | ||||||
|  |     iter->~ObITokenIterator(); | ||||||
|  |     param->allocator_->free(iter); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| } // end namespace storage | } // end namespace storage | ||||||
| } // end namespace oceanbase | } // end namespace oceanbase | ||||||
|  | |||||||
| @ -23,21 +23,27 @@ namespace oceanbase | |||||||
| namespace storage | namespace storage | ||||||
| { | { | ||||||
|  |  | ||||||
| class ObSpaceFTParser final | class ObSpaceFTParser final : public lib::ObITokenIterator | ||||||
| { | { | ||||||
| public: | public: | ||||||
|   ObSpaceFTParser() = default; |   ObSpaceFTParser(); | ||||||
|   ~ObSpaceFTParser() = default; |   virtual ~ObSpaceFTParser(); | ||||||
|   static int segment( |  | ||||||
|       lib::ObFTParserParam *param, |   int init(lib::ObFTParserParam *param); | ||||||
|       const char *fulltext, |   void reset(); | ||||||
|       const int64_t ft_len); |   virtual int get_next_token( | ||||||
|  |       const char *&word, | ||||||
|  |       int64_t &word_len, | ||||||
|  |       int64_t &char_len, | ||||||
|  |       int64_t &word_freq) override; | ||||||
|  |  | ||||||
|  |   VIRTUAL_TO_STRING_KV(KP_(cs), KP_(start), KP_(next), KP_(end), K_(is_inited)); | ||||||
| private: | private: | ||||||
|   static int add_word( |   const ObCharsetInfo *cs_; | ||||||
|       lib::ObFTParserParam *param, |   const char *start_; | ||||||
|       const char *word, |   const char *next_; | ||||||
|       const int64_t word_len, |   const char *end_; | ||||||
|       const int64_t char_cnt); |   bool is_inited_; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| class ObWhiteSpaceFTParserDesc final : public lib::ObIFTParserDesc | class ObWhiteSpaceFTParserDesc final : public lib::ObIFTParserDesc | ||||||
| @ -47,7 +53,8 @@ public: | |||||||
|   virtual ~ObWhiteSpaceFTParserDesc() = default; |   virtual ~ObWhiteSpaceFTParserDesc() = default; | ||||||
|   virtual int init(lib::ObPluginParam *param) override; |   virtual int init(lib::ObPluginParam *param) override; | ||||||
|   virtual int deinit(lib::ObPluginParam *param) override; |   virtual int deinit(lib::ObPluginParam *param) override; | ||||||
|   virtual int segment(lib::ObFTParserParam *param) const override; |   virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; | ||||||
|  |   virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; | ||||||
|   OB_INLINE void reset() { is_inited_ = false; } |   OB_INLINE void reset() { is_inited_ = false; } | ||||||
| private: | private: | ||||||
|   bool is_inited_; |   bool is_inited_; | ||||||
|  | |||||||
| @ -19,6 +19,6 @@ OB_DECLARE_PLUGIN(mock_ft_parser) | |||||||
|   OB_PLUGIN_AUTHOR_OCEANBASE, |   OB_PLUGIN_AUTHOR_OCEANBASE, | ||||||
|   "This is mock fulltext parser plugin.", |   "This is mock fulltext parser plugin.", | ||||||
|   0x00001, |   0x00001, | ||||||
|   oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, |   oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, | ||||||
|   &oceanbase::storage::mock_ft_parser, |   &oceanbase::storage::mock_ft_parser, | ||||||
| }; | }; | ||||||
|  | |||||||
| @ -27,7 +27,7 @@ public: | |||||||
|   virtual ~ObMockFTParserDesc() = default; |   virtual ~ObMockFTParserDesc() = default; | ||||||
|   virtual int init(lib::ObPluginParam *param) override; |   virtual int init(lib::ObPluginParam *param) override; | ||||||
|   virtual int deinit(lib::ObPluginParam *param) override; |   virtual int deinit(lib::ObPluginParam *param) override; | ||||||
|   virtual int segment(lib::ObFTParserParam *param) const override; |   virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| int ObMockFTParserDesc::init(lib::ObPluginParam *param) | int ObMockFTParserDesc::init(lib::ObPluginParam *param) | ||||||
| @ -42,7 +42,7 @@ int ObMockFTParserDesc::deinit(lib::ObPluginParam *param) | |||||||
|   return OB_SUCCESS; |   return OB_SUCCESS; | ||||||
| } | } | ||||||
|  |  | ||||||
| int ObMockFTParserDesc::segment(lib::ObFTParserParam *param) const | int ObMockFTParserDesc::segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const | ||||||
| { | { | ||||||
|   UNUSED(param); |   UNUSED(param); | ||||||
|   return OB_SUCCESS; |   return OB_SUCCESS; | ||||||
|  | |||||||
| @ -49,33 +49,19 @@ int segment_and_calc_word_count( | |||||||
| { | { | ||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   int64_t doc_length = 0; |   int64_t doc_length = 0; | ||||||
|   common::ObSEArray<ObFTWord, 256> words; |  | ||||||
|   if (OB_ISNULL(helper) |   if (OB_ISNULL(helper) | ||||||
|       || OB_UNLIKELY(ObCollationType::CS_TYPE_INVALID == type |       || OB_UNLIKELY(ObCollationType::CS_TYPE_INVALID == type | ||||||
|                   || ObCollationType::CS_TYPE_EXTENDED_MARK < type) |                   || ObCollationType::CS_TYPE_EXTENDED_MARK < type) | ||||||
|       || OB_UNLIKELY(!words_count.created())) { |       || OB_UNLIKELY(!words_count.created())) { | ||||||
|     ret = OB_INVALID_ARGUMENT; |     ret = OB_INVALID_ARGUMENT; | ||||||
|     LOG_WARN("invalid arguments", K(ret), KPC(helper), K(type), K(words_count.created())); |     LOG_WARN("invalid arguments", K(ret), KPC(helper), K(type), K(words_count.created())); | ||||||
|   } else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words))) { |   } else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words_count))) { | ||||||
|     LOG_WARN("fail to segment", K(ret), KPC(helper), K(type), K(fulltext)); |     LOG_WARN("fail to segment", K(ret), KPC(helper), K(type), K(fulltext)); | ||||||
|   } else { |  | ||||||
|     for (int64_t i = 0; OB_SUCC(ret) && i < words.count(); ++i) { |  | ||||||
|       const ObFTWord &ft_word = words.at(i); |  | ||||||
|       int64_t word_count = 0; |  | ||||||
|       if (OB_FAIL(words_count.get_refactored(ft_word, word_count)) && OB_HASH_NOT_EXIST != ret) { |  | ||||||
|         LOG_WARN("fail to get ft word", K(ret), K(ft_word)); |  | ||||||
|       } else { |  | ||||||
|         word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count; |  | ||||||
|         if (OB_FAIL(words_count.set_refactored(ft_word, word_count, 1/*overwrite*/))) { |  | ||||||
|           LOG_WARN("fail to set ft word and count", K(ret), K(ft_word)); |  | ||||||
|         } |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
| class ObTestAddWord final : public lib::ObFTParserParam::ObIAddWord | class ObTestAddWord final | ||||||
| { | { | ||||||
| public: | public: | ||||||
|   static const char *TEST_FULLTEXT; |   static const char *TEST_FULLTEXT; | ||||||
| @ -85,14 +71,16 @@ public: | |||||||
|   static const int64_t FT_MAX_WORD_LEN = 84; |   static const int64_t FT_MAX_WORD_LEN = 84; | ||||||
| public: | public: | ||||||
|   ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator); |   ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator); | ||||||
|   virtual ~ObTestAddWord() = default; |   ~ObTestAddWord() = default; | ||||||
|   virtual int operator()( |   int check_words(lib::ObITokenIterator *iter); | ||||||
|       lib::ObFTParserParam *param, |   int64_t get_add_word_count() const { return ith_word_; } | ||||||
|  |   static int64_t get_word_cnt_without_stopword() { return TEST_WORD_COUNT_WITHOUT_STOPWORD; } | ||||||
|  |   VIRTUAL_TO_STRING_KV(K_(ith_word)); | ||||||
|  | private: | ||||||
|  |   int check_ith_word( | ||||||
|       const char *word, |       const char *word, | ||||||
|       const int64_t word_len, |       const int64_t word_len, | ||||||
|       const int64_t char_cnt) override; |       const int64_t char_cnt); | ||||||
|   virtual int64_t get_add_word_count() const override { return ith_word_; } |  | ||||||
|   VIRTUAL_TO_STRING_KV(K_(ith_word)); |  | ||||||
| private: | private: | ||||||
|   bool is_min_max_word(const int64_t c_len) const; |   bool is_min_max_word(const int64_t c_len) const; | ||||||
|   int casedown_word(const ObFTWord &src, ObFTWord &dst); |   int casedown_word(const ObFTWord &src, ObFTWord &dst); | ||||||
| @ -137,8 +125,32 @@ int ObTestAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst) | |||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
| int ObTestAddWord::operator()( | int ObTestAddWord::check_words(lib::ObITokenIterator *iter) | ||||||
|       lib::ObFTParserParam *param, | { | ||||||
|  |   int ret = OB_SUCCESS; | ||||||
|  |   if (OB_ISNULL(iter)) { | ||||||
|  |     ret = OB_INVALID_ARGUMENT; | ||||||
|  |     LOG_WARN("invalid arguments", K(ret), KP(iter)); | ||||||
|  |   } else { | ||||||
|  |     const char *word = nullptr; | ||||||
|  |     int64_t word_len = 0; | ||||||
|  |     int64_t char_len = 0; | ||||||
|  |     int64_t word_freq = 0; | ||||||
|  |     while (OB_SUCC(ret)) { | ||||||
|  |       if (OB_FAIL(iter->get_next_token(word, word_len, char_len, word_freq))) { | ||||||
|  |         LOG_WARN("fail to get next token", K(ret), KPC(iter)); | ||||||
|  |       } else if (OB_FAIL(check_ith_word(word, word_len, char_len))) { | ||||||
|  |         LOG_WARN("fail to check ith word", K(ret), KP(word), K(word_len), K(char_len)); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     if (OB_ITER_END == ret) { | ||||||
|  |       ret = OB_SUCCESS; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   return ret; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | int ObTestAddWord::check_ith_word( | ||||||
|       const char *word, |       const char *word, | ||||||
|       const int64_t word_len, |       const int64_t word_len, | ||||||
|       const int64_t char_cnt) |       const int64_t char_cnt) | ||||||
| @ -146,9 +158,9 @@ int ObTestAddWord::operator()( | |||||||
|   int ret = OB_SUCCESS; |   int ret = OB_SUCCESS; | ||||||
|   ObFTWord src_word(word_len, word, collation_type_); |   ObFTWord src_word(word_len, word, collation_type_); | ||||||
|   ObFTWord dst_word; |   ObFTWord dst_word; | ||||||
|   if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) { |   if (OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) { | ||||||
|     ret = OB_INVALID_ARGUMENT; |     ret = OB_INVALID_ARGUMENT; | ||||||
|     LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len), K(char_cnt)); |     LOG_WARN("invalid arguments", K(ret), KP(word), K(word_len), K(char_cnt)); | ||||||
|   } else if (is_min_max_word(char_cnt)) { |   } else if (is_min_max_word(char_cnt)) { | ||||||
|     // skip min/max word |     // skip min/max word | ||||||
|   } else if (OB_FAIL(casedown_word(src_word, dst_word))) { |   } else if (OB_FAIL(casedown_word(src_word, dst_word))) { | ||||||
| @ -194,7 +206,6 @@ void TestDefaultFTParser::SetUp() | |||||||
|   ASSERT_EQ(OB_SUCCESS, desc_.init(&plugin_param_)); |   ASSERT_EQ(OB_SUCCESS, desc_.init(&plugin_param_)); | ||||||
|  |  | ||||||
|   ft_parser_param_.allocator_ = &allocator_; |   ft_parser_param_.allocator_ = &allocator_; | ||||||
|   ft_parser_param_.add_word_ = &add_word_; |  | ||||||
|   ft_parser_param_.cs_ = common::ObCharset::get_charset(ObCollationType::CS_TYPE_UTF8MB4_BIN); |   ft_parser_param_.cs_ = common::ObCharset::get_charset(ObCollationType::CS_TYPE_UTF8MB4_BIN); | ||||||
|   ft_parser_param_.parser_version_ = 0x00001; |   ft_parser_param_.parser_version_ = 0x00001; | ||||||
|   ASSERT_TRUE(nullptr != ft_parser_param_.cs_); |   ASSERT_TRUE(nullptr != ft_parser_param_.cs_); | ||||||
| @ -209,54 +220,74 @@ void TestDefaultFTParser::TearDown() | |||||||
|  |  | ||||||
| TEST_F(TestDefaultFTParser, test_space_ft_parser_segment) | TEST_F(TestDefaultFTParser, test_space_ft_parser_segment) | ||||||
| { | { | ||||||
|  |   ObSpaceFTParser parser; | ||||||
|   const char *fulltext = ObTestAddWord::TEST_FULLTEXT; |   const char *fulltext = ObTestAddWord::TEST_FULLTEXT; | ||||||
|   const int64_t ft_len = strlen(fulltext); |   const int64_t ft_len = strlen(fulltext); | ||||||
|  |  | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(nullptr, nullptr, 0)); |   ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(nullptr)); | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, nullptr, 0)); |  | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, 0)); |   ft_parser_param_.fulltext_ = nullptr; | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, -1)); |   ft_parser_param_.ft_length_ = 0; | ||||||
|  |   ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_)); | ||||||
|  |  | ||||||
|  |   ft_parser_param_.fulltext_ = fulltext; | ||||||
|  |   ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_)); | ||||||
|  |  | ||||||
|  |   ft_parser_param_.ft_length_ = -1; | ||||||
|  |   ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_)); | ||||||
|  |  | ||||||
|   ft_parser_param_.fulltext_ = fulltext; |   ft_parser_param_.fulltext_ = fulltext; | ||||||
|   ft_parser_param_.ft_length_ = ft_len; |   ft_parser_param_.ft_length_ = ft_len; | ||||||
|  |  | ||||||
|   LOG_INFO("before space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); |   LOG_INFO("before space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); | ||||||
|   ASSERT_EQ(OB_SUCCESS, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, ft_len)); |   ASSERT_EQ(OB_SUCCESS, parser.init(&ft_parser_param_)); | ||||||
|  |   ASSERT_EQ(OB_SUCCESS, add_word_.check_words(&parser)); | ||||||
|   LOG_INFO("after space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); |   LOG_INFO("after space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); | ||||||
| } | } | ||||||
|  |  | ||||||
| TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268) | TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268) | ||||||
| { | { | ||||||
|   common::ObArray<ObFTWord> words; |   ObSpaceFTParser parser; | ||||||
|   ObAddWordFlag flag; |  | ||||||
|   ObAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, flag, allocator_, words); |  | ||||||
|   const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 "; |   const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 "; | ||||||
|   const int64_t ft_len = strlen(fulltext); |   const int64_t ft_len = strlen(fulltext); | ||||||
|  |  | ||||||
|   ft_parser_param_.fulltext_ = fulltext; |   ft_parser_param_.fulltext_ = fulltext; | ||||||
|   ft_parser_param_.ft_length_ = ft_len; |   ft_parser_param_.ft_length_ = ft_len; | ||||||
|   ft_parser_param_.add_word_ = &add_word; |  | ||||||
|   ft_parser_param_.cs_ = common::ObCharset::get_charset(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI); |   ft_parser_param_.cs_ = common::ObCharset::get_charset(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI); | ||||||
|  |  | ||||||
|   LOG_INFO("before space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); |   LOG_INFO("before space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); | ||||||
|   ASSERT_EQ(OB_SUCCESS, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, ft_len)); |   ASSERT_EQ(OB_SUCCESS, parser.init(&ft_parser_param_)); | ||||||
|   LOG_INFO("after space segment", KCSTRING(fulltext), K(words), K(ft_len), K(ft_parser_param_)); |   const char *word = nullptr; | ||||||
|  |   int64_t word_len = 0; | ||||||
|  |   int64_t char_len = 0; | ||||||
|  |   int64_t word_freq = 0; | ||||||
|  |   int ret = OB_SUCCESS; | ||||||
|  |   while (OB_SUCC(ret)) { | ||||||
|  |     if (OB_FAIL(parser.get_next_token(word, word_len, char_len, word_freq))) { | ||||||
|  |       LOG_WARN("fail to get next token", K(ret), K(parser)); | ||||||
|  |     } else { | ||||||
|  |       LOG_INFO("succeed to get next token", K(ret), K(ObString(word_len, word)), K(char_len)); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   LOG_INFO("after space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_)); | ||||||
| } | } | ||||||
|  |  | ||||||
| TEST_F(TestDefaultFTParser, test_default_ft_parser_desc) | TEST_F(TestDefaultFTParser, test_default_ft_parser_desc) | ||||||
| { | { | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(&ft_parser_param_)); |   ObITokenIterator *iter = nullptr; | ||||||
|  |   ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(&ft_parser_param_, iter)); | ||||||
|  |  | ||||||
|   ft_parser_param_.fulltext_ = ObTestAddWord::TEST_FULLTEXT; |   ft_parser_param_.fulltext_ = ObTestAddWord::TEST_FULLTEXT; | ||||||
|   ft_parser_param_.ft_length_ = strlen(ft_parser_param_.fulltext_); |   ft_parser_param_.ft_length_ = strlen(ft_parser_param_.fulltext_); | ||||||
|  |  | ||||||
|   ASSERT_EQ(OB_SUCCESS, desc_.segment(&ft_parser_param_)); |   ASSERT_EQ(OB_SUCCESS, desc_.segment(&ft_parser_param_, iter)); | ||||||
|  |   ASSERT_EQ(OB_SUCCESS, add_word_.check_words(iter)); | ||||||
|  |  | ||||||
|   ASSERT_EQ(OB_SUCCESS, desc_.deinit(&plugin_param_)); |   ASSERT_EQ(OB_SUCCESS, desc_.deinit(&plugin_param_)); | ||||||
|   ASSERT_EQ(OB_NOT_INIT, desc_.segment(&ft_parser_param_)); |   ASSERT_EQ(OB_NOT_INIT, desc_.segment(&ft_parser_param_, iter)); | ||||||
|  |  | ||||||
|   ASSERT_EQ(OB_SUCCESS, desc_.init(&plugin_param_)); |   ASSERT_EQ(OB_SUCCESS, desc_.init(&plugin_param_)); | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(nullptr)); |   ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(nullptr, iter)); | ||||||
| } | } | ||||||
|  |  | ||||||
| class ObTestFTPluginHelper : public ::testing::Test | class ObTestFTPluginHelper : public ::testing::Test | ||||||
| @ -442,29 +473,35 @@ void ObTestFTParseHelper::TearDownTestCase() | |||||||
|  |  | ||||||
| TEST_F(ObTestFTParseHelper, test_parse_fulltext) | TEST_F(ObTestFTParseHelper, test_parse_fulltext) | ||||||
| { | { | ||||||
|   common::ObSEArray<ObFTWord, 16> words; |   ObFTWordMap ft_word_map; | ||||||
|  |   ASSERT_EQ(OB_SUCCESS, ft_word_map.create(10, "TestParse")); | ||||||
|   int64_t doc_length = 0; |   int64_t doc_length = 0; | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, | ||||||
|         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); |         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); | ||||||
|  |  | ||||||
|   ObTestAddWord test_add_word(cs_type_, allocator_); |   ObTestAddWord test_add_word(cs_type_, allocator_); | ||||||
|   for (int64_t i = 0; i < words.count(); ++i) { |   ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size()); | ||||||
|     ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length())); |   for (int64_t i = 0; i < ft_word_map.size(); ++i) { | ||||||
|  |     int64_t word_cnt = 0; | ||||||
|  |     ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_); | ||||||
|  |     ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt)); | ||||||
|  |     ASSERT_TRUE(word_cnt >= 1); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   ObFTWordMap ft_word_map; |   ft_word_map.clear(); | ||||||
|   ASSERT_EQ(OB_SUCCESS, ft_word_map.create(words.count(), "TestParse")); |  | ||||||
|   ASSERT_EQ(OB_SUCCESS, segment_and_calc_word_count(allocator_, &parse_helper_, |   ASSERT_EQ(OB_SUCCESS, segment_and_calc_word_count(allocator_, &parse_helper_, | ||||||
|         cs_type_, ObTestAddWord::TEST_FULLTEXT, ft_word_map)); |         cs_type_, ObTestAddWord::TEST_FULLTEXT, ft_word_map)); | ||||||
|   ASSERT_EQ(words.count(), ft_word_map.size()); |   ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size()); | ||||||
|  |  | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); |   ft_word_map.clear(); | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, words)); |   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, words)); |   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, ft_word_map)); | ||||||
|  |   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, ft_word_map)); | ||||||
|  |  | ||||||
|   parse_helper_.reset(); |   parse_helper_.reset(); | ||||||
|  |   ft_word_map.clear(); | ||||||
|   ASSERT_EQ(OB_NOT_INIT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, |   ASSERT_EQ(OB_NOT_INIT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, | ||||||
|         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); |         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); | ||||||
|  |  | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.init(nullptr, plugin_name_)); |   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.init(nullptr, plugin_name_)); | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.init(&allocator_, ObString())); |   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.init(&allocator_, ObString())); | ||||||
| @ -472,9 +509,9 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext) | |||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); |   ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); | ||||||
|  |  | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(CS_TYPE_INVALID, ObTestAddWord::TEST_FULLTEXT, |   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(CS_TYPE_INVALID, ObTestAddWord::TEST_FULLTEXT, | ||||||
|         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); |         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(CS_TYPE_EXTENDED_MARK, ObTestAddWord::TEST_FULLTEXT, |   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(CS_TYPE_EXTENDED_MARK, ObTestAddWord::TEST_FULLTEXT, | ||||||
|         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); |         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); | ||||||
|  |  | ||||||
|   ASSERT_EQ(OB_INIT_TWICE, parse_helper_.init(&allocator_, plugin_name_)); |   ASSERT_EQ(OB_INIT_TWICE, parse_helper_.init(&allocator_, plugin_name_)); | ||||||
|  |  | ||||||
| @ -484,57 +521,80 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext) | |||||||
|   parse_helper_.reset(); |   parse_helper_.reset(); | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); |   ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, | ||||||
|         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); |         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); | ||||||
|   for (int64_t i = 0; i < words.count(); ++i) { |   ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size()); | ||||||
|     ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length())); |   for (int64_t i = 0; i < ft_word_map.size(); ++i) { | ||||||
|  |     int64_t word_cnt = 0; | ||||||
|  |     ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_); | ||||||
|  |     ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt)); | ||||||
|  |     ASSERT_TRUE(word_cnt >= 1); | ||||||
|  |   } | ||||||
|  |   parse_helper_.reset(); | ||||||
|  |   ft_word_map.clear(); | ||||||
|  |   ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, "beng.1")); | ||||||
|  |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, | ||||||
|  |         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map)); | ||||||
|  |   ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size()); | ||||||
|  |   for (int64_t i = 0; i < ft_word_map.size(); ++i) { | ||||||
|  |     int64_t word_cnt = 0; | ||||||
|  |     ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_); | ||||||
|  |     ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt)); | ||||||
|  |     ASSERT_TRUE(word_cnt >= 1); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| TEST_F(ObTestFTParseHelper, test_min_and_max_word_len) | TEST_F(ObTestFTParseHelper, test_min_and_max_word_len) | ||||||
| { | { | ||||||
|   common::ObSEArray<ObFTWord, 16> words; |   ObFTWordMap words; | ||||||
|  |   ASSERT_EQ(OB_SUCCESS, words.create(10, "TestParse")); | ||||||
|   int64_t doc_length = 0; |   int64_t doc_length = 0; | ||||||
|  |  | ||||||
|   // word len = 2; |   // word len = 2; | ||||||
|   const char *word_len_2 = "ab"; |   const char *word_len_2 = "ab"; | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_2, std::strlen(word_len_2), doc_length, words)); |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_2, std::strlen(word_len_2), doc_length, words)); | ||||||
|   ASSERT_EQ(0, words.count()); |   ASSERT_EQ(0, words.size()); | ||||||
|  |  | ||||||
|   // word len = 3; |   // word len = 3; | ||||||
|   const char *word_len_3 = "abc"; |   const char *word_len_3 = "abc"; | ||||||
|  |   words.clear(); | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_3, std::strlen(word_len_3), doc_length, words)); |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_3, std::strlen(word_len_3), doc_length, words)); | ||||||
|   ASSERT_EQ(1, words.count()); |   ASSERT_EQ(1, words.size()); | ||||||
|  |  | ||||||
|   // word len = 4; |   // word len = 4; | ||||||
|   const char *word_len_4 = "abcd"; |   const char *word_len_4 = "abcd"; | ||||||
|  |   words.clear(); | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_4, std::strlen(word_len_4), doc_length, words)); |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_4, std::strlen(word_len_4), doc_length, words)); | ||||||
|   ASSERT_EQ(1, words.count()); |   ASSERT_EQ(1, words.size()); | ||||||
|  |  | ||||||
|   // word len = 76; |   // word len = 76; | ||||||
|   const char *word_len_76 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; |   const char *word_len_76 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; | ||||||
|  |   words.clear(); | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_76, std::strlen(word_len_76), doc_length, words)); |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_76, std::strlen(word_len_76), doc_length, words)); | ||||||
|   ASSERT_EQ(1, words.count()); |   ASSERT_EQ(1, words.size()); | ||||||
|  |  | ||||||
|   // word len = 84; |   // word len = 84; | ||||||
|   const char *word_len_84 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz123456"; |   const char *word_len_84 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz123456"; | ||||||
|  |   words.clear(); | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_84, std::strlen(word_len_84), doc_length, words)); |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_84, std::strlen(word_len_84), doc_length, words)); | ||||||
|   ASSERT_EQ(1, words.count()); |   ASSERT_EQ(1, words.size()); | ||||||
|  |  | ||||||
|   // word len = 85; |   // word len = 85; | ||||||
|   const char *word_len_85 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567"; |   const char *word_len_85 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567"; | ||||||
|  |   words.clear(); | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_85, std::strlen(word_len_85), doc_length, words)); |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_85, std::strlen(word_len_85), doc_length, words)); | ||||||
|   ASSERT_EQ(0, words.count()); |   ASSERT_EQ(0, words.size()); | ||||||
| } | } | ||||||
|  |  | ||||||
| class ObTestNgramFTParseHelper : public ::testing::Test | class ObTestNgramFTParseHelper : public ::testing::Test | ||||||
| { | { | ||||||
| public: | public: | ||||||
|   static const char *name_; |   static const char *name_; | ||||||
|   static const int64_t TEST_WORD_COUNT = 29; |   static const int64_t TEST_WORD_COUNT = 27; | ||||||
|   typedef common::hash::ObHashMap<ObFTWord, int64_t> ObFTWordMap; |   typedef common::hash::ObHashMap<ObFTWord, int64_t> ObFTWordMap; | ||||||
| public: | public: | ||||||
|   ObTestNgramFTParseHelper(); |   ObTestNgramFTParseHelper(); | ||||||
|   virtual ~ObTestNgramFTParseHelper() = default; |   virtual ~ObTestNgramFTParseHelper() = default; | ||||||
|  |   static int64_t get_word_count() { return TEST_WORD_COUNT; } | ||||||
|  |  | ||||||
|   static void SetUpTestCase(); |   static void SetUpTestCase(); | ||||||
|   static void TearDownTestCase(); |   static void TearDownTestCase(); | ||||||
| @ -553,7 +613,7 @@ const char *ObTestNgramFTParseHelper::name_ = "ngram.1"; | |||||||
|  |  | ||||||
| ObTestNgramFTParseHelper::ObTestNgramFTParseHelper() | ObTestNgramFTParseHelper::ObTestNgramFTParseHelper() | ||||||
|   : plugin_name_(STRLEN(name_), name_), |   : plugin_name_(STRLEN(name_), name_), | ||||||
|     ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"}, |     ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"}, | ||||||
|     cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN), |     cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN), | ||||||
|     allocator_() |     allocator_() | ||||||
| { | { | ||||||
| @ -583,26 +643,33 @@ void ObTestNgramFTParseHelper::TearDownTestCase() | |||||||
|  |  | ||||||
| TEST_F(ObTestNgramFTParseHelper, test_parse_fulltext) | TEST_F(ObTestNgramFTParseHelper, test_parse_fulltext) | ||||||
| { | { | ||||||
|  |   ObFTWordMap words; | ||||||
|  |   ASSERT_EQ(OB_SUCCESS, words.create(10, "TestParse")); | ||||||
|   int64_t doc_length = 0; |   int64_t doc_length = 0; | ||||||
|   common::ObSEArray<ObFTWord, 16> words; |  | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, | ||||||
|         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); |         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); | ||||||
|  |  | ||||||
|   for (int64_t i = 0; i < words.count(); ++i) { |   ASSERT_EQ(get_word_count(), words.size()); | ||||||
|     ASSERT_TRUE(0 == strncmp(ngram_words_[i], words[i].word_.ptr(), words[i].word_.length())); |   for (int64_t i = 0; i < words.size(); ++i) { | ||||||
|  |     int64_t word_cnt = 0; | ||||||
|  |     ObFTWord word(strlen(ngram_words_[i]), ngram_words_[i], cs_type_); | ||||||
|  |     ASSERT_EQ(OB_SUCCESS, words.get_refactored(word, word_cnt)); | ||||||
|  |     ASSERT_TRUE(word_cnt >= 1); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   ObFTWordMap ft_word_map; |   ObFTWordMap ft_word_map; | ||||||
|   ASSERT_EQ(OB_SUCCESS, ft_word_map.create(words.count(), "TestParse")); |   ASSERT_EQ(OB_SUCCESS, ft_word_map.create(10, "TestParse")); | ||||||
|   ASSERT_EQ(OB_SUCCESS, segment_and_calc_word_count(allocator_, &parse_helper_, |   ASSERT_EQ(OB_SUCCESS, segment_and_calc_word_count(allocator_, &parse_helper_, | ||||||
|         cs_type_, ObTestAddWord::TEST_FULLTEXT, ft_word_map)); |         cs_type_, ObTestAddWord::TEST_FULLTEXT, ft_word_map)); | ||||||
|   ASSERT_EQ(words.count(), ft_word_map.size() + 2); |   ASSERT_EQ(words.size(), ft_word_map.size()); | ||||||
|  |  | ||||||
|  |   words.clear(); | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); |   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, words)); |   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, words)); | ||||||
|   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, words)); |   ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, words)); | ||||||
|  |  | ||||||
|   parse_helper_.reset(); |   parse_helper_.reset(); | ||||||
|  |   words.clear(); | ||||||
|   ASSERT_EQ(OB_NOT_INIT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, |   ASSERT_EQ(OB_NOT_INIT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, | ||||||
|         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); |         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); | ||||||
|  |  | ||||||
| @ -620,14 +687,19 @@ TEST_F(ObTestNgramFTParseHelper, test_parse_fulltext) | |||||||
|   ASSERT_EQ(OB_INIT_TWICE, parse_helper_.init(&allocator_, plugin_name_)); |   ASSERT_EQ(OB_INIT_TWICE, parse_helper_.init(&allocator_, plugin_name_)); | ||||||
|  |  | ||||||
|   parse_helper_.reset(); |   parse_helper_.reset(); | ||||||
|  |   words.clear(); | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); |   ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); | ||||||
|  |  | ||||||
|   parse_helper_.reset(); |   parse_helper_.reset(); | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); |   ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_)); | ||||||
|   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, |   ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, | ||||||
|         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); |         std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words)); | ||||||
|   for (int64_t i = 0; i < words.count(); ++i) { |   ASSERT_EQ(get_word_count(), words.size()); | ||||||
|     ASSERT_TRUE(0 == strncmp(ngram_words_[i], words[i].word_.ptr(), words[i].word_.length())); |   for (int64_t i = 0; i < words.size(); ++i) { | ||||||
|  |     int64_t word_cnt = 0; | ||||||
|  |     ObFTWord word(strlen(ngram_words_[i]), ngram_words_[i], cs_type_); | ||||||
|  |     ASSERT_EQ(OB_SUCCESS, words.get_refactored(word, word_cnt)); | ||||||
|  |     ASSERT_TRUE(word_cnt >= 1); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| @ -638,7 +710,7 @@ int main(int argc, char **argv) | |||||||
| { | { | ||||||
|   system("rm -rf test_fts_plugin.log"); |   system("rm -rf test_fts_plugin.log"); | ||||||
|   OB_LOGGER.set_file_name("test_fts_plugin.log", true); |   OB_LOGGER.set_file_name("test_fts_plugin.log", true); | ||||||
|   OB_LOGGER.set_log_level("INFO"); |   OB_LOGGER.set_log_level("DEBUG"); | ||||||
|   oceanbase::storage::ObTestFTPluginHelper::file_name = argv[0]; |   oceanbase::storage::ObTestFTPluginHelper::file_name = argv[0]; | ||||||
|   testing::InitGoogleTest(&argc, argv); |   testing::InitGoogleTest(&argc, argv); | ||||||
|   return RUN_ALL_TESTS(); |   return RUN_ALL_TESTS(); | ||||||
|  | |||||||
		Reference in New Issue
	
	Block a user
	 Tyshawn
					Tyshawn