[FTS] Adjust plugin tokenizer interface for fulltext search

This commit is contained in:
Tyshawn
2024-06-17 09:38:01 +00:00
committed by ob-robot
parent 106d246f51
commit 89e696f309
19 changed files with 676 additions and 412 deletions

View File

@ -114,7 +114,7 @@ enum class ObPluginType : uint64_t
// define plugin license
enum class ObPluginLicenseType : uint64_t
{
OB_MULAN_V2_LICENSE = 1, // Mulan PubL v2 license
OB_Mulan_PubL_V2_LICENSE = 1, // Mulan PubL v2 license
OB_MAX_PLUGIN_LICENSE_TYPE = 2, // max plugin license type
};
@ -186,7 +186,7 @@ public:
&& nullptr != author_
&& nullptr != spec_
&& PLUGIN_VERSION == version_
&& (ObPluginLicenseType::OB_MULAN_V2_LICENSE <= license_
&& (ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE <= license_
&& license_ < ObPluginLicenseType::OB_MAX_PLUGIN_LICENSE_TYPE)
&& nullptr != desc_;
}
@ -217,24 +217,9 @@ public:
class ObFTParserParam final
{
public:
class ObIAddWord
{
public:
ObIAddWord() = default;
virtual ~ObIAddWord() = default;
virtual int operator()(
ObFTParserParam *param,
const char *word,
const int64_t word_len,
const int64_t char_cnt) = 0;
virtual int64_t get_add_word_count() const = 0;
DECLARE_PURE_VIRTUAL_TO_STRING;
};
public:
ObFTParserParam()
: allocator_(nullptr),
add_word_(nullptr),
cs_(nullptr),
fulltext_(nullptr),
ft_length_(0),
@ -245,36 +230,42 @@ public:
inline bool is_valid() const
{
return nullptr != allocator_
&& nullptr != add_word_
&& nullptr != cs_
&& nullptr != fulltext_
&& 0 < ft_length_
&& 0 <= parser_version_;
}
inline int add_word(ObFTParserParam *param, const char *word, const int64_t word_len, const int64_t char_cnt)
{
return (*add_word_)(param, word, word_len, char_cnt);
}
inline void reset()
{
allocator_ = nullptr;
add_word_ = nullptr;
cs_ = nullptr;
fulltext_ = nullptr;
ft_length_ = 0;
parser_version_ = 0;
}
TO_STRING_KV(KP_(allocator), KP_(add_word), KP_(cs), K_(fulltext), K_(ft_length), K_(parser_version));
TO_STRING_KV(KP_(allocator), KP_(cs), K_(fulltext), K_(ft_length), K_(parser_version));
public:
common::ObIAllocator *allocator_;
ObIAddWord *add_word_;
const ObCharsetInfo *cs_;
const char *fulltext_;
int64_t ft_length_;
int64_t parser_version_;
};
// Pull-style iterator over the tokens produced by a fulltext parser plugin.
// An instance is handed out through ObIFTParserDesc::segment() and is given
// back through ObIFTParserDesc::free_token_iter().
class ObITokenIterator
{
public:
ObITokenIterator() = default;
virtual ~ObITokenIterator() = default;
// Fetch the next token.
//
// @param[out] word      start of the token's bytes.
// @param[out] word_len  token length in bytes.
// @param[out] char_cnt  token length in characters.
// @param[out] word_freq number of occurrences this token stands for.
//
// @return OB_SUCCESS when a token was produced, OB_ITER_END once the input is
//         exhausted (the in-tree consumer, ObFTParseHelper::segment, maps
//         OB_ITER_END back to OB_SUCCESS), any other error code on failure.
//
// NOTE(review): the in-tree consumer (ObAddWord::process_word) rejects a null
// word or a non-positive word_len/char_cnt/word_freq with OB_INVALID_ARGUMENT,
// so implementations presumably must only report success with fully populated
// outputs -- confirm against the plugin contract.
virtual int get_next_token(
const char *&word,
int64_t &word_len,
int64_t &char_cnt,
int64_t &word_freq) = 0;
DECLARE_PURE_VIRTUAL_TO_STRING;
};
// Fulltext parser descriptor interface for domain indexes:
// - splits a document into a sequence of tokens.
class ObIFTParserDesc : public ObIPluginDesc
@ -286,12 +277,22 @@ public:
/**
* split fulltext into multiple word segments
*
* @param[in] fulltext, the document to be tokenized.
* @param[out] words, the word segmentation after splitting.
* @param[in] param, the document to be tokenized and parameters related to word segmentation.
* @param[out] iter, the tokenized words' iterator.
*
* @return error code, such as, OB_SUCCESS, OB_INVALID_ARGUMENT, ...
*/
virtual int segment(ObFTParserParam *param) const = 0;
virtual int segment(ObFTParserParam *param, ObITokenIterator *&iter) const = 0;
/**
* Release a token iterator previously returned by segment().
*
* This default implementation only runs the iterator's destructor when iter is
* non-null. It does not hand the iterator's memory back to any allocator, does
* not use param, and does not reset iter; the in-tree caller
* (ObFTParseHelper::segment) sets iter to nullptr itself after this call.
*
* NOTE(review): descriptors that allocate the iterator themselves presumably
* need to override this to also free the memory, as
* ObBasicEnglishFTParserDesc does -- confirm for other plugins.
*
* @param[in]     param, the parser parameters that were passed to segment().
* @param[in,out] iter, the iterator to destroy; may be nullptr.
*/
virtual void free_token_iter(ObFTParserParam *param, ObITokenIterator *&iter) const
{
if (OB_NOT_NULL(iter)) {
iter->~ObITokenIterator();
}
}
};
} // end namespace lib

View File

@ -207,30 +207,16 @@ int ObDASDomainUtils::generate_spatial_index_rows(
ObFTWordMap &words_count)
{
int ret = OB_SUCCESS;
common::ObSEArray<ObFTWord, 256> words;
if (OB_ISNULL(helper)
|| OB_UNLIKELY(ObCollationType::CS_TYPE_INVALID == type
|| ObCollationType::CS_TYPE_EXTENDED_MARK < type)
|| OB_UNLIKELY(!words_count.created())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(helper), K(type), K(words_count.created()));
} else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words))) {
} else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words_count))) {
LOG_WARN("fail to segment", K(ret), KPC(helper), K(type), K(fulltext));
} else {
for (int64_t i = 0; OB_SUCC(ret) && i < words.count(); ++i) {
const ObFTWord &ft_word = words.at(i);
int64_t word_count = 0;
if (OB_FAIL(words_count.get_refactored(ft_word, word_count)) && OB_HASH_NOT_EXIST != ret) {
LOG_WARN("fail to get ft word", K(ret), K(ft_word));
} else {
word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count;
if (OB_FAIL(words_count.set_refactored(ft_word, word_count, 1/*overwrite*/))) {
LOG_WARN("fail to set ft word and count", K(ret), K(ft_word));
}
}
}
}
STORAGE_FTS_LOG(DEBUG, "segment and calc word count", K(ret), K(words), K(type));
STORAGE_FTS_LOG(DEBUG, "segment and calc word count", K(ret), K(words_count.size()), K(type));
return ret;
}
@ -484,6 +470,7 @@ void ObDomainDMLIterator::reset()
row_projector_ = nullptr;
das_ctdef_ = nullptr;
main_ctdef_ = nullptr;
allocator_.reset();
}
void ObDomainDMLIterator::set_ctdef(
@ -520,10 +507,12 @@ int ObDomainDMLIterator::get_next_domain_row(ObNewRow *&row)
while (OB_SUCC(ret) && !got_row) {
if (row_idx_ >= rows_.count()) {
rows_.reuse();
allocator_.reuse();
row_idx_ = 0;
if (OB_UNLIKELY(!das_ctdef_->table_param_.get_data_table().is_domain_index())) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("unexpected error, not domain index", K(ret), K(das_ctdef_->table_param_.get_data_table()));
} else if (FAILEDx(write_iter_.get_next_row(sr))) {
if (OB_ITER_END != ret) {
LOG_WARN("get next row from result iterator failed", K(ret));
@ -562,6 +551,7 @@ int ObDomainDMLIterator::get_next_domain_rows(ObNewRow *&row, int64_t &row_count
while (OB_SUCC(ret) && !got_row) {
if (row_idx_ >= rows_.count()) {
rows_.reuse();
allocator_.reuse();
row_idx_ = 0;
if (OB_UNLIKELY(!das_ctdef_->table_param_.get_data_table().is_domain_index())) {
ret = OB_ERR_UNEXPECTED;
@ -757,7 +747,7 @@ int ObFTDMLIterator::get_ft_and_doc_id(
const ObChunkDatumStore::StoredRow *store_row,
ObString &doc_id,
ObString &ft,
common::ObObjMeta &ft_meta) const
common::ObObjMeta &ft_meta)
{
int ret = OB_SUCCESS;
const uint64_t doc_id_col_id = das_ctdef_->table_param_.get_data_table().get_doc_id_col_id();
@ -793,7 +783,7 @@ int ObFTDMLIterator::get_ft_and_doc_id_for_update(
const ObChunkDatumStore::StoredRow *store_row,
ObString &doc_id,
ObString &ft,
common::ObObjMeta &ft_meta) const
common::ObObjMeta &ft_meta)
{
int ret = OB_SUCCESS;
const uint64_t rowkey_col_cnt = das_ctdef_->table_param_.get_data_table().get_rowkey_column_num();
@ -863,7 +853,7 @@ int ObMultivalueDMLIterator::get_multivlaue_json_data(
const ObChunkDatumStore::StoredRow *store_row,
int64_t& multivalue_idx,
int64_t& multivalue_arr_idx,
ObString &multivalue_data) const
ObString &multivalue_data)
{
int ret = OB_SUCCESS;
multivalue_idx = OB_INVALID_ID;
@ -910,7 +900,7 @@ int ObMultivalueDMLIterator::get_multivlaue_json_data_for_update(
const ObChunkDatumStore::StoredRow *store_row,
int64_t& multivalue_idx,
int64_t& multivalue_arr_idx,
ObString &multivalue_data) const
ObString &multivalue_data)
{
int ret = OB_SUCCESS;
bool found = false;

View File

@ -13,6 +13,7 @@
#ifndef OCEANBASE_DAS_DOMAIN_UTILS_H
#define OCEANBASE_DAS_DOMAIN_UTILS_H
#include "lib/allocator/page_arena.h"
#include "lib/hash/ob_hashset.h"
#include "sql/das/ob_das_dml_ctx_define.h"
#include "storage/fts/ob_fts_plugin_helper.h"
@ -56,8 +57,6 @@ public:
const IntFixedArray &row_projector,
const ObDASWriteBuffer::DmlRow &dml_row,
ObDomainIndexRow &domain_rows);
private:
typedef common::hash::ObHashMap<ObFTWord, int64_t> ObFTWordMap;
private:
static int segment_and_calc_word_count(
common::ObIAllocator &allocator,
@ -126,7 +125,7 @@ protected:
ObDASWriteBuffer::Iterator &write_iter_;
const ObDASDMLBaseCtDef *das_ctdef_;
const ObDASDMLBaseCtDef *main_ctdef_;
common::ObIAllocator &allocator_;
common::ObArenaAllocator allocator_;
bool is_update_;
private:
DISALLOW_COPY_AND_ASSIGN(ObDomainDMLIterator);
@ -178,13 +177,13 @@ private:
const ObChunkDatumStore::StoredRow *store_row,
int64_t& multivalue_idx,
int64_t& multivalue_arr_idx,
ObString &multivalue_data) const;
ObString &multivalue_data);
int get_multivlaue_json_data_for_update(
const ObChunkDatumStore::StoredRow *store_row,
int64_t& multivalue_idx,
int64_t& multivalue_arr_idx,
ObString &multivalue_data) const;
ObString &multivalue_data);
};
@ -214,12 +213,12 @@ protected:
const ObChunkDatumStore::StoredRow *store_row,
ObString &doc_id,
ObString &ft,
common::ObObjMeta &ft_meta) const;
common::ObObjMeta &ft_meta);
int get_ft_and_doc_id_for_update(
const ObChunkDatumStore::StoredRow *store_row,
ObString &doc_id,
ObString &ft,
common::ObObjMeta &ft_meta) const;
common::ObObjMeta &ft_meta);
private:
storage::ObFTParseHelper ft_parse_helper_;

View File

@ -312,21 +312,9 @@ int ObTextRetrievalMerge::init_query_tokens(const ObDASIRScanCtDef *ir_ctdef, Ob
} else if (OB_FAIL(token_map.create(ft_word_bkt_cnt, common::ObMemAttr(MTL_ID(), "FTWordMap")))) {
LOG_WARN("failed to create token map", K(ret));
} else if (OB_FAIL(tokenize_helper.segment(
cs_type, search_text_string.ptr(), search_text_string.length(), doc_length, tokens))) {
cs_type, search_text_string.ptr(), search_text_string.length(), doc_length, token_map))) {
LOG_WARN("failed to segment");
} else {
for (int64_t i = 0; OB_SUCC(ret) && i < tokens.count(); ++i) {
const ObFTWord &token = tokens.at(i);
int64_t word_count = 0;
if (OB_FAIL(token_map.get_refactored(token, word_count)) && OB_HASH_NOT_EXIST != ret) {
LOG_WARN("fail to get ft word", K(ret), K(token));
} else {
word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count;
if (OB_FAIL(token_map.set_refactored(token, word_count, 1/*overwrite*/))) {
LOG_WARN("fail to set ft word and count", K(ret), K(token));
}
}
}
for (hash::ObHashMap<ObFTWord, int64_t>::const_iterator iter = token_map.begin();
OB_SUCC(ret) && iter != token_map.end();
++iter) {

View File

@ -22,70 +22,43 @@ namespace oceanbase
namespace storage
{
/*static*/ int ObBEngFTParser::segment(
lib::ObFTParserParam *param,
const char *ft,
const int64_t ft_len)
int ObBEngFTParser::get_next_token(
const char *&word,
int64_t &word_len,
int64_t &char_len,
int64_t &word_freq)
{
int ret = OB_SUCCESS;
ObDatum doc;
doc.set_string(ft, ft_len);
ObBEngFTParser parser;
share::ObITokenStream *token_stream = nullptr;
if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len));
} else if (OB_FAIL(parser.init(param))) {
LOG_WARN("fail to initialize basic english parser", K(ret), KPC(param));
} else if (FALSE_IT(doc.set_string(ft, ft_len))) {
} else if (OB_FAIL(parser.segment(doc, token_stream))) {
LOG_WARN("fail to segment fulltext by parser", K(ret), KP(ft), K(ft_len));
} else if (OB_ISNULL(token_stream)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("token stream is nullptr", K(ret), KP(token_stream));
} else {
ObDatum token;
int64_t token_freq = 0;
while (OB_SUCC(ret)) {
if (OB_FAIL(token_stream->get_next(token, token_freq))) {
if (OB_ITER_END != ret) {
LOG_WARN("fail to get next token", K(ret), KPC(token_stream));
}
} else if (OB_FAIL(add_word(param, param->allocator_, token.ptr_, token.len_))) {
LOG_WARN("fail to add word", K(ret), K(token), KPC(param));
}
}
if (OB_ITER_END == ret) {
ret = OB_SUCCESS;
}
}
return ret;
}
/*static*/ int ObBEngFTParser::add_word(
lib::ObFTParserParam *param,
common::ObIAllocator *allocator,
const char *word,
int64_t word_len)
{
int ret = OB_SUCCESS;
char *buf = nullptr;
if (OB_ISNULL(param)
|| OB_ISNULL(allocator)
|| OB_ISNULL(word)
|| OB_UNLIKELY(0 >= word_len)) {
word = nullptr;
word_len = 0;
char_len = 0;
word_freq = 0;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("beng ft parser isn't initialized", K(ret), K(is_inited_));
} else if (OB_ISNULL(token_stream_)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("token stream is nullptr", K(ret), KP(token_stream_));
} else if (OB_FAIL(token_stream_->get_next(token, token_freq))) {
if (OB_ITER_END != ret) {
LOG_WARN("fail to get next token", K(ret), KPC(token_stream_));
}
} else if (OB_ISNULL(token.ptr_) || OB_UNLIKELY(0 >= token.len_ || 0 >= token_freq)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(allocator), KP(word), K(word_len));
} else if (word_len < FT_MIN_WORD_LEN || word_len > FT_MAX_WORD_LEN) {
LOG_DEBUG("skip too small or large word", K(ret), K(word_len));
} else if (OB_ISNULL(buf = static_cast<char *>(allocator->alloc(word_len)))) {
LOG_WARN("invalid arguments", K(ret), KP(token.ptr_), K(token.len_), K(token_freq));
} else if (OB_ISNULL(buf = static_cast<char *>(allocator_.alloc(token.len_)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("fail to allocate word memory", K(ret), K(word_len));
} else if (FALSE_IT(MEMCPY(buf, word, word_len))) {
} else if (OB_FAIL(param->add_word(param, buf, word_len, word_len))) {
LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, buf)), K(ObString(word_len, word)));
LOG_WARN("fail to allocate word memory", K(ret), K(token.len_));
} else {
LOG_DEBUG("succeed to add word", K(ObString(word_len, word)));
MEMCPY(buf, token.ptr_, token.len_);
word = buf;
word_len = token.len_;
char_len = token.len_;
word_freq = token_freq;
LOG_DEBUG("succeed to add word", K(ObString(word_len, word)), K(word_freq));
}
return ret;
}
@ -103,13 +76,20 @@ int ObBEngFTParser::init(lib::ObFTParserParam *param)
ret = OB_NOT_SUPPORTED;
LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_));
} else {
doc_.set_string(param->fulltext_, param->ft_length_);
analysis_ctx_.cs_ = param->cs_;
analysis_ctx_.filter_stopword_ = false;
analysis_ctx_.need_grouping_ = false;
if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) {
LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_));
} else if (OB_FAIL(segment(doc_, token_stream_))) {
LOG_WARN("fail to segment fulltext by parser", K(ret), KP(param->fulltext_), K(param->ft_length_));
} else if (OB_ISNULL(token_stream_)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("token stream is nullptr", K(ret), KP(token_stream_));
} else {
is_inited_ = true;
LOG_DEBUG("succeed to init beng parser", K(ret), K(english_analyzer_), KPC(token_stream_), K(doc_));
}
}
if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) {
@ -139,6 +119,8 @@ void ObBEngFTParser::reset()
{
analysis_ctx_.reset();
english_analyzer_.reset();
doc_.reset();
token_stream_ = nullptr;
is_inited_ = false;
}
@ -159,20 +141,43 @@ int ObBasicEnglishFTParserDesc::deinit(lib::ObPluginParam *param)
return OB_SUCCESS;
}
int ObBasicEnglishFTParserDesc::segment(lib::ObFTParserParam *param) const
int ObBasicEnglishFTParserDesc::segment(
lib::ObFTParserParam *param,
lib::ObITokenIterator *&iter) const
{
int ret = OB_SUCCESS;
void *buf = nullptr;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_));
} else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid argument", K(ret), KPC(param));
} else if (OB_FAIL(ObBEngFTParser::segment(param, param->fulltext_, param->ft_length_))) {
LOG_WARN("fail to segment words for fulltext by beng", K(ret), KPC(param),
K(param->fulltext_), K(param->ft_length_));
} else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObBEngFTParser)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("fail to allocate basic english ft parser", K(ret));
} else {
  // Construct the parser in the buffer obtained from the caller's allocator.
  ObBEngFTParser *parser = new (buf) ObBEngFTParser(*(param->allocator_));
  if (OB_FAIL(parser->init(param))) {
    LOG_WARN("fail to init basic english parser", K(ret), KPC(param));
    // The parser is never published through `iter` on this path, so nobody else
    // can release it: destroy it and give the buffer back here, otherwise it is
    // leaked for allocators whose free() is not a no-op. `ret` is left untouched
    // so the init error is still reported to the caller.
    lib::ObITokenIterator *failed_iter = parser;
    free_token_iter(param, failed_iter);
  } else {
    // Ownership moves to the caller, who must return it via free_token_iter().
    iter = parser;
  }
}
return ret;
return ret;
}
// Destroy a token iterator created by segment() and return its memory to the
// allocator it was carved from (param->allocator_).
//
// @param[in]     param  the parser parameters used for segment(); when iter is
//                       non-null, param and param->allocator_ must be non-null
//                       (enforced with abort_unless).
// @param[in,out] iter   iterator to release; a nullptr is ignored. It is reset
//                       to nullptr on return so the caller's reference cannot be
//                       used after the object has been destroyed and freed.
void ObBasicEnglishFTParserDesc::free_token_iter(
    lib::ObFTParserParam *param,
    lib::ObITokenIterator *&iter) const
{
  if (OB_NOT_NULL(iter)) {
    abort_unless(nullptr != param);
    abort_unless(nullptr != param->allocator_);
    iter->~ObITokenIterator();
    param->allocator_->free(iter);
    // Do not leave a dangling pointer behind in the caller's variable.
    iter = nullptr;
  }
}
} // end namespace storage

View File

@ -23,40 +23,41 @@ namespace oceanbase
namespace storage
{
class ObBEngFTParser final
class ObBEngFTParser final : public lib::ObITokenIterator
{
public:
static const int64_t FT_MIN_WORD_LEN = 3;
static const int64_t FT_MAX_WORD_LEN = 84;
public:
static int segment(
lib::ObFTParserParam *param,
const char *fulltext,
const int64_t ft_len);
private:
ObBEngFTParser()
: analysis_ctx_(),
explicit ObBEngFTParser(common::ObIAllocator &allocator)
: allocator_(allocator),
analysis_ctx_(),
english_analyzer_(),
doc_(),
token_stream_(nullptr),
is_inited_(false)
{}
~ObBEngFTParser() = default;
~ObBEngFTParser() { reset(); }
static int add_word(
lib::ObFTParserParam *param,
common::ObIAllocator *allocator,
const char *word,
int64_t word_len);
int init(lib::ObFTParserParam *param);
void reset();
virtual int get_next_token(
const char *&word,
int64_t &word_len,
int64_t &char_len,
int64_t &word_freq) override;
VIRTUAL_TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), KP_(token_stream), K_(is_inited));
private:
int segment(
const common::ObDatum &doc,
share::ObITokenStream *&token_stream);
TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), K_(is_inited));
private:
common::ObIAllocator &allocator_;
share::ObTextAnalysisCtx analysis_ctx_;
share::ObEnglishTextAnalyzer english_analyzer_;
common::ObDatum doc_;
share::ObITokenStream *token_stream_;
bool is_inited_;
DISALLOW_COPY_AND_ASSIGN(ObBEngFTParser);
@ -69,7 +70,8 @@ public:
virtual ~ObBasicEnglishFTParserDesc() = default;
virtual int init(lib::ObPluginParam *param) override;
virtual int deinit(lib::ObPluginParam *param) override;
virtual int segment(lib::ObFTParserParam *param) const override;
virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
OB_INLINE void reset() { is_inited_ = false; }
private:
bool is_inited_;

View File

@ -26,7 +26,7 @@ OB_DECLARE_PLUGIN(whitespace_parser)
OB_PLUGIN_AUTHOR_OCEANBASE, // author
"This is a default whitespace parser plugin.", // brief specification
0x00001, // version
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license
&oceanbase::storage::whitespace_parser, // default space parser plugin instance
};
@ -41,13 +41,13 @@ OB_DECLARE_PLUGIN(ngram_parser)
OB_PLUGIN_AUTHOR_OCEANBASE, // author
"This is a ngram fulltext parser plugin.", // brief specification
0x00001, // version
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license
&oceanbase::storage::ngram_parser, // ngram parser plugin instance
};
OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInNgramFTParser, ngram_parser);
///////////////////////////////////// Default fulltext parser //////////////////////////////////////////
///////////////////////////////////// BEng fulltext parser //////////////////////////////////////////
OB_DECLARE_PLUGIN(beng_parser)
{
@ -56,8 +56,8 @@ OB_DECLARE_PLUGIN(beng_parser)
OB_PLUGIN_AUTHOR_OCEANBASE, // author
"This is a basic english parser plugin.", // brief specification
0x00001, // version
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
&oceanbase::storage::beng_parser, // default space parser plugin instance
oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license
&oceanbase::storage::beng_parser, // basic english parser plugin instance
};
OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInBEngFTParser, beng_parser);

View File

@ -119,7 +119,7 @@ int ObFTParseHelper::segment(
const char *ft,
const int64_t ft_len,
common::ObIAllocator &allocator,
lib::ObFTParserParam::ObIAddWord &add_word)
ObAddWord &add_word)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(parser_version < 0 || nullptr == parser_desc || nullptr == cs || nullptr == ft || 0 >= ft_len)) {
@ -127,14 +127,38 @@ int ObFTParseHelper::segment(
LOG_WARN("invalid arguments", K(ret), K(parser_version), KP(parser_desc), KP(cs), K(ft), K(ft_len));
} else {
lib::ObFTParserParam param;
lib::ObITokenIterator *iter = nullptr;
param.allocator_ = &allocator;
param.add_word_ = &add_word;
param.cs_ = cs;
param.fulltext_ = ft;
param.ft_length_ = ft_len;
param.parser_version_ = parser_version;
if (OB_FAIL(parser_desc->segment(&param))) {
if (OB_FAIL(parser_desc->segment(&param, iter))) {
LOG_WARN("fail to segment", K(ret), K(param));
} else if (OB_ISNULL(iter)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("unexpected error, token iterator is nullptr", K(ret), KP(iter));
} else {
const char *word = nullptr;
int64_t word_len = 0;
int64_t char_cnt = 0;
int64_t word_freq = 0;
while (OB_SUCC(ret)) {
if (OB_FAIL(iter->get_next_token(word, word_len, char_cnt, word_freq))) {
if (OB_ITER_END != ret) {
LOG_WARN("fail to get next token", K(ret), KPC(iter));
}
} else if (OB_FAIL(add_word.process_word(word, word_len, char_cnt, word_freq))) {
LOG_WARN("fail to process one word", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq));
}
}
if (OB_ITER_END == ret) {
ret = OB_SUCCESS;
}
}
if (OB_NOT_NULL(iter)) {
parser_desc->free_token_iter(&param, iter);
iter = nullptr;
}
}
return ret;
@ -176,11 +200,10 @@ int ObFTParseHelper::init(
LOG_WARN("unexpected error, parse handler is nullptr", K(ret), KP(parse_handler));
} else if (OB_FAIL(get_fulltext_parser_desc(*parse_handler, parser_desc_))) {
LOG_WARN("fail to get fulltext parser descriptor", K(ret), KPC(parse_handler));
} else if (OB_FAIL(set_add_word_flag(parser_name_))) {
LOG_WARN("fail to set add word flag", K(ret), K(parser_name_));
} else {
plugin_param_.desc_ = parser_desc_;
if (need_min_max_word(parser_name_)) { add_word_flag_.set_min_max_word(); }
if (need_castdn(parser_name_)) { add_word_flag_.set_casedown(); }
if (need_stopword_list(parser_name_)) { add_word_flag_.set_stop_word(); }
allocator_ = allocator;
is_inited_ = true;
}
@ -204,7 +227,7 @@ int ObFTParseHelper::segment(
const char *fulltext,
const int64_t fulltext_len,
int64_t &doc_length,
common::ObIArray<ObFTWord> &words) const
ObFTWordMap &words) const
{
int ret = OB_SUCCESS;
const ObCharsetInfo *cs = nullptr;
@ -231,29 +254,34 @@ int ObFTParseHelper::segment(
doc_length = add_word.get_add_word_count();
}
}
LOG_DEBUG("ft parse segment", K(ret), K(type), K(ObString(fulltext_len, fulltext)), K(words));
LOG_DEBUG("ft parse segment", K(ret), K(type), K(add_word_flag_), K(parser_name_),
K(ObString(fulltext_len, fulltext)), K(words.size()));
return ret;
}
bool ObFTParseHelper::need_stopword_list(const ObFTParser &parser)
int ObFTParseHelper::set_add_word_flag(const ObFTParser &parser)
{
share::ObPluginName space("space");
share::ObPluginName beng("beng");
return parser.get_parser_name() == space || parser.get_parser_name() == beng;
}
bool ObFTParseHelper::need_min_max_word(const ObFTParser &parser)
{
share::ObPluginName space("space");
share::ObPluginName beng("beng");
return parser.get_parser_name() == space || parser.get_parser_name() == beng;
}
bool ObFTParseHelper::need_castdn(const ObFTParser &parser)
{
share::ObPluginName space("space");
share::ObPluginName ngram("ngram");
return parser.get_parser_name() == space || parser.get_parser_name() == ngram;
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!parser.is_valid())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), K(parser));
} else if (share::ObPluginName("space") == parser.get_parser_name()) {
add_word_flag_.set_min_max_word();
add_word_flag_.set_stop_word();
add_word_flag_.set_casedown();
add_word_flag_.set_groupby_word();
} else if (share::ObPluginName("beng") == parser.get_parser_name()) {
add_word_flag_.set_min_max_word();
add_word_flag_.set_stop_word();
add_word_flag_.set_groupby_word();
} else if (share::ObPluginName("ngram") == parser.get_parser_name()) {
add_word_flag_.set_casedown();
add_word_flag_.set_groupby_word();
} else {
ret = OB_NOT_SUPPORTED;
LOG_WARN("unsupported parser for fulltext search", K(ret), K(parser));
}
return ret;
}
} // end namespace storage

View File

@ -25,6 +25,8 @@ namespace oceanbase
namespace storage
{
class ObAddWord;
class ObFTParser final
{
public:
@ -89,7 +91,7 @@ public:
const char *fulltext,
const int64_t fulltext_len,
int64_t &doc_length,
common::ObIArray<ObFTWord> &words) const;
ObFTWordMap &words) const;
const ObFTParser &get_parser_name() const { return parser_name_; }
void reset();
@ -105,17 +107,8 @@ private:
const char *fulltext,
const int64_t fulltext_len,
common::ObIAllocator &allocator,
lib::ObFTParserParam::ObIAddWord &add_word);
static bool need_stopword_list(const ObFTParser &parser);
static bool need_castdn(const ObFTParser &parser);
static bool need_min_max_word(const ObFTParser &parser);
int alloc_add_word(
const ObCollationType &type,
common::ObIArray<ObFTWord> &words,
lib::ObFTParserParam::ObIAddWord *&add_word) const;
void free_add_word(lib::ObFTParserParam::ObIAddWord *&add_word) const;
ObAddWord &add_word);
int set_add_word_flag(const ObFTParser &parser);
private:
lib::ObPluginParam plugin_param_;
common::ObIAllocator *allocator_;

View File

@ -24,10 +24,10 @@ ObAddWord::ObAddWord(
const ObCollationType &type,
const ObAddWordFlag &flag,
common::ObIAllocator &allocator,
common::ObIArray<ObFTWord> &word)
ObFTWordMap &word_map)
: collation_type_(type),
allocator_(allocator),
words_(word),
word_map_(word_map),
min_max_word_cnt_(0),
non_stopword_cnt_(0),
stopword_cnt_(0),
@ -35,19 +35,19 @@ ObAddWord::ObAddWord(
{
}
int ObAddWord::operator()(
lib::ObFTParserParam *param,
int ObAddWord::process_word(
const char *word,
const int64_t word_len,
const int64_t char_cnt)
const int64_t char_cnt,
const int64_t word_freq)
{
int ret = OB_SUCCESS;
bool is_stopword = false;
ObFTWord src_word(word_len, word, collation_type_);
ObFTWord dst_word;
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
if (OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt || 0 >= word_freq)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
LOG_WARN("invalid arguments", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq));
} else if (is_min_max_word(char_cnt)) {
++min_max_word_cnt_;
LOG_DEBUG("skip too small or large word", K(ret), K(src_word), K(char_cnt));
@ -58,11 +58,11 @@ int ObAddWord::operator()(
} else if (OB_UNLIKELY(is_stopword)) {
++stopword_cnt_;
LOG_DEBUG("skip stopword", K(ret), K(dst_word));
} else if (OB_FAIL(words_.push_back(dst_word))) {
LOG_WARN("fail to push word into words array", K(ret), K(dst_word));
} else if (OB_FAIL(groupby_word(dst_word, word_freq))) {
LOG_WARN("fail to groupby word into word map", K(ret), K(dst_word), K(word_freq));
} else {
++non_stopword_cnt_;
LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len), K(char_cnt), K(src_word), K(dst_word));
non_stopword_cnt_ += word_freq;
LOG_DEBUG("add word", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq), K(src_word), K(dst_word));
}
return ret;
}
@ -104,5 +104,31 @@ int ObAddWord::check_stopword(const ObFTWord &ft_word, bool &is_stopword)
return ret;
}
// Record one token in word_map_.
//
// With the groupby flag set, the occurrence count stored for `word` is increased
// by `word_freq` (a missing entry is created with `word_freq`). Without the
// flag, the word is inserted with a count of 1.
//
// @param[in] word       the (already normalised) token; must not be empty.
// @param[in] word_freq  number of occurrences this token stands for; must be > 0.
//
// @return OB_SUCCESS, OB_INVALID_ARGUMENT for an empty word or non-positive
//         frequency, or the error reported by the hash map.
int ObAddWord::groupby_word(const ObFTWord &word, const int64_t word_freq)
{
  int ret = OB_SUCCESS;
  int64_t word_count = 0;
  if (OB_UNLIKELY(word.empty() || word_freq <= 0)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid arguments", K(ret), K(word), K(word_freq));
  } else if (!flag_.groupby_word()) {
    // NOTE(review): this insert does not request overwrite, so a repeated word
    // presumably makes set_refactored fail -- confirm that this is intended for
    // parsers that do not ask for grouping.
    if (OB_FAIL(word_map_.set_refactored(word, 1/*word count*/))) {
      LOG_WARN("fail to set fulltext word and count", K(ret), K(word));
    }
  } else if (OB_FAIL(word_map_.get_refactored(word, word_count)) && OB_HASH_NOT_EXIST != ret) {
    LOG_WARN("fail to get fulltext word", K(ret), K(word));
  } else {
    if (OB_HASH_NOT_EXIST == ret) {
      // First time this word is seen: it already stands for `word_freq`
      // occurrences. Starting at 1 would under-count whenever the tokenizer
      // reports a frequency above 1 and would disagree with non_stopword_cnt_,
      // which the caller advances by word_freq.
      word_count = word_freq;
    } else {
      word_count += word_freq;
    }
    // OB_FAIL re-assigns ret, which also clears a pending OB_HASH_NOT_EXIST.
    if (OB_FAIL(word_map_.set_refactored(word, word_count, 1/*overwrite*/))) {
      LOG_WARN("fail to set fulltext word and count", K(ret), K(word), K(word_count));
    }
  }
  return ret;
}
} // end namespace storage
} // end namespace oceanbase

View File

@ -63,23 +63,23 @@ static const char ob_stop_word_list[][FTS_STOP_WORD_MAX_LENGTH] = {
"www"
};
class ObAddWord final : public lib::ObFTParserParam::ObIAddWord
class ObAddWord final
{
public:
ObAddWord(
const ObCollationType &type,
const ObAddWordFlag &flag,
common::ObIAllocator &allocator,
common::ObIArray<ObFTWord> &word);
virtual ~ObAddWord() = default;
virtual int operator()(
lib::ObFTParserParam *param,
ObFTWordMap &word_map);
~ObAddWord() = default;
int process_word(
const char *word,
const int64_t word_len,
const int64_t char_cnt) override;
virtual int64_t get_add_word_count() const override { return non_stopword_cnt_; }
const int64_t char_cnt,
const int64_t word_freq);
virtual int64_t get_add_word_count() const { return non_stopword_cnt_; }
VIRTUAL_TO_STRING_KV(K_(collation_type), K_(min_max_word_cnt), K_(non_stopword_cnt), K_(stopword_cnt),
K_(words));
K(word_map_.size()));
public:
static const int64_t FT_MIN_WORD_LEN = 3;
static const int64_t FT_MAX_WORD_LEN = 84;
@ -87,10 +87,11 @@ private:
bool is_min_max_word(const int64_t c_len) const;
int casedown_word(const ObFTWord &src, ObFTWord &dst);
int check_stopword(const ObFTWord &word, bool &is_stopword);
int groupby_word(const ObFTWord &word, const int64_t word_cnt);
private:
ObCollationType collation_type_;
common::ObIAllocator &allocator_;
common::ObIArray<ObFTWord> &words_;
ObFTWordMap &word_map_;
int64_t min_max_word_cnt_;
int64_t non_stopword_cnt_;
int64_t stopword_cnt_;

View File

@ -14,6 +14,7 @@
#define OB_FTS_STRUCT_H_
#include "lib/charset/ob_charset.h"
#include "lib/hash/ob_hashmap.h"
namespace oceanbase
{
@ -34,7 +35,7 @@ public:
hash_val = ObCharset::hash(type_, word_);
return common::OB_SUCCESS;
}
OB_INLINE uint64_t hash() const { return word_.hash(); }
OB_INLINE uint64_t hash() const { return ObCharset::hash(type_, word_); }
OB_INLINE bool empty() const { return word_.empty(); }
OB_INLINE bool operator ==(const ObFTWord &other) const
@ -76,6 +77,8 @@ public:
int64_t word_cnt_;
};
typedef common::hash::ObHashMap<ObFTWord, int64_t> ObFTWordMap;
class ObAddWordFlag final
{
private:
@ -84,6 +87,7 @@ private:
// than a maximum word length.
static const uint64_t AWF_STOPWORD = 1 << 1; // filter by stop word table.
static const uint64_t AWF_CASEDOWN = 1 << 2; // convert characters from uppercase to lowercase.
static const uint64_t AWF_GROUPBY_WORD = 1 << 3; // distinct and word aggregation
public:
ObAddWordFlag() : flag_(AWF_NONE) {}
~ObAddWordFlag() = default;
@ -95,13 +99,17 @@ public:
void set_min_max_word() { set_flag(AWF_MIN_MAX_WORD); }
void set_stop_word() { set_flag(AWF_STOPWORD); }
void set_casedown() { set_flag(AWF_CASEDOWN); }
void set_groupby_word() { set_flag(AWF_GROUPBY_WORD); }
void clear() { flag_ = AWF_NONE; }
void clear_min_max_word() { clear_flag(AWF_MIN_MAX_WORD); }
void clear_stop_word() { clear_flag(AWF_STOPWORD); }
void clear_casedown() { clear_flag(AWF_CASEDOWN); }
void clear_groupby_word() { clear_flag(AWF_GROUPBY_WORD); }
bool min_max_word() const { return has_flag(AWF_MIN_MAX_WORD); }
bool stopword() const { return has_flag(AWF_STOPWORD); }
bool casedown() const { return has_flag(AWF_CASEDOWN); }
bool groupby_word() const { return has_flag(AWF_GROUPBY_WORD); }
TO_STRING_KV(K_(flag));
private:
uint64_t flag_;
};

View File

@ -24,24 +24,81 @@ namespace storage
#define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_')
/*static*/ int ObNgramFTParser::segment(
lib::ObFTParserParam *param,
const char *fulltext,
const int64_t ft_len)
ObNgramFTParser::ObNgramFTParser()
: cs_(nullptr),
start_(nullptr),
next_(nullptr),
end_(nullptr),
c_nums_(0),
is_inited_(false)
{}
// Returns every member to its un-initialized value via reset().
ObNgramFTParser::~ObNgramFTParser()
{
reset();
}
// Puts the parser back into the state the constructor leaves it in, so that
// init() may be called again.
void ObNgramFTParser::reset()
{
  // Mark as unusable first, then drop the scan state.
  is_inited_ = false;
  c_nums_ = 0;
  // Forget the text window [start_, end_) and the read cursor.
  end_ = nullptr;
  next_ = nullptr;
  start_ = nullptr;
  cs_ = nullptr;
}
// Binds this parser to the charset and fulltext range carried by `param`.
//
// Returns OB_INIT_TWICE if the parser is already initialized, and
// OB_INVALID_ARGUMENT if `param`, its charset or its fulltext pointer is null
// or the fulltext length is not positive. On success start_/next_ point at
// the first byte of the fulltext, end_ is one past its last byte, and
// c_nums_ is zero.
//
// NOTE(review): this span is taken from a rendered diff, so lines of the
// previous static segment() body appear interleaved with the init() body.
int ObNgramFTParser::init(lib::ObFTParserParam *param)
{
int ret = OB_SUCCESS;
int64_t c_nums = 0;
const char *start = fulltext;
const char *next = start;
const char *end = start + ft_len;
if (OB_ISNULL(param) || OB_ISNULL(fulltext) || OB_UNLIKELY(ft_len <= 0)) {
if (OB_UNLIKELY(is_inited_)) {
ret = OB_INIT_TWICE;
LOG_WARN("init twice", K(ret), KPC(param), KPC(this));
} else if (OB_ISNULL(param)
|| OB_ISNULL(param->cs_)
|| OB_ISNULL(param->fulltext_)
|| OB_UNLIKELY(0 >= param->ft_length_)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KP(param), KP(fulltext), K(ft_len));
LOG_WARN("invalid arguments", K(ret), KPC(param));
} else {
const ObCharsetInfo *cs = param->cs_;
while (OB_SUCC(ret) && next < end) {
cs_ = param->cs_;
start_ = param->fulltext_;
next_ = start_;
end_ = start_ + param->ft_length_;
c_nums_ = 0;
is_inited_ = true;
}
// Clear state only when this call did not leave the parser initialized; an
// OB_INIT_TWICE failure therefore keeps the existing binding intact.
if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) {
reset();
}
return ret;
}
int ObNgramFTParser::get_next_token(
const char *&word,
int64_t &word_len,
int64_t &char_len,
int64_t &word_freq)
{
int ret = OB_SUCCESS;
word = nullptr;
word_len = 0;
char_len = 0;
word_freq = 0;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("ngram ft parser isn't initialized", K(ret), K(is_inited_));
} else {
int64_t c_nums = c_nums_;
const char *start = start_;
const char *next = next_;
const char *end = end_;
const ObCharsetInfo *cs = cs_;
do {
const int64_t c_len = ob_mbcharlen_ptr(cs, next, end);
if (next + c_len > end || 0 == c_len) { // if char is invalid, just skip the rest of doc.
ret = OB_ITER_END;
break;
} else {
int ctype;
@ -50,38 +107,31 @@ namespace storage
start = next + 1;
next = start;
c_nums = 0;
if (next == end) {
ret = OB_ITER_END;
}
continue;
}
next += c_len;
++c_nums;
}
if (NGRAM_TOKEN_SIZE == c_nums) {
if (OB_FAIL(add_word(param, start, next - start, c_nums))) {
LOG_WARN("fail to add word", K(ret), KP(param), KP(start), KP(next), K(c_nums));
} else {
word = start;
word_len = next - start;
char_len = c_nums;
word_freq = 1;
start += ob_mbcharlen_ptr(cs, start, end);
c_nums = NGRAM_TOKEN_SIZE - 1;
break;
}
} while (OB_SUCC(ret) && next < end);
if (OB_ITER_END == ret || OB_SUCCESS == ret) {
start_ = start;
next_ = next;
end_ = end;
c_nums_ = c_nums;
}
}
}
return ret;
}
/*static*/ int ObNgramFTParser::add_word(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len,
const int64_t char_cnt)
{
int ret = OB_SUCCESS;
if (OB_ISNULL(param)
|| OB_ISNULL(word)
|| OB_UNLIKELY(0 >= word_len)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
} else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) {
LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word)));
LOG_DEBUG("next word", K(ret), K(ObString(word_len, word)), KP(start_), KP(next_), KP(end_));
}
return ret;
}
@ -103,21 +153,43 @@ int ObNgramFTParserDesc::deinit(lib::ObPluginParam *param)
return OB_SUCCESS;
}
int ObNgramFTParserDesc::segment(lib::ObFTParserParam *param) const
int ObNgramFTParserDesc::segment(
lib::ObFTParserParam *param,
lib::ObITokenIterator *&iter) const
{
int ret = OB_SUCCESS;
void *buf = nullptr;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("ngram ft parser desc hasn't be initialized", K(ret), K(is_inited_));
} else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid argument", K(ret), KPC(param));
} else if (OB_FAIL(ObNgramFTParser::segment(param, param->fulltext_, param->ft_length_))) {
LOG_WARN("fail to segment words for fulltext by ngram", K(ret), KPC(param),
K(param->fulltext_), K(param->ft_length_));
} else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObNgramFTParser)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("fail to allocate ngram ft parser", K(ret));
} else {
ObNgramFTParser *parser = new (buf) ObNgramFTParser();
if (OB_FAIL(parser->init(param))) {
LOG_WARN("fail to init ngram fulltext parser", K(ret), KPC(param));
} else {
iter = parser;
}
}
return ret;
}
// Destroys an iterator produced by segment() and returns its memory to
// param->allocator_.
//
// @param param  the parameters used to create the iterator; aborts if it or
//               its allocator_ is null while `iter` is non-null.
// @param iter   iterator to release; a null `iter` is a no-op. Reset to
//               nullptr afterwards so the caller is not left with a dangling
//               pointer and a repeated call is harmless.
void ObNgramFTParserDesc::free_token_iter(
    lib::ObFTParserParam *param,
    lib::ObITokenIterator *&iter) const
{
  if (OB_NOT_NULL(iter)) {
    abort_unless(nullptr != param);
    abort_unless(nullptr != param->allocator_);
    iter->~ObITokenIterator();
    param->allocator_->free(iter);
    iter = nullptr;
  }
}
} // end namespace storage
} // end namespace oceanbase

View File

@ -22,23 +22,30 @@ namespace oceanbase
namespace storage
{
// Token iterator that walks a fulltext buffer and hands out tokens of
// NGRAM_TOKEN_SIZE characters through get_next_token().
//
// NOTE(review): this span is taken from a rendered diff, so the previous
// declaration (static segment()/add_word()) is interleaved with the new one.
class ObNgramFTParser final
class ObNgramFTParser final : public lib::ObITokenIterator
{
public:
static const int64_t NGRAM_TOKEN_SIZE = 2; // TODO: @jinzhu, please apply one system variable later, and keep the same as mysql.
public:
ObNgramFTParser() = default;
~ObNgramFTParser() = default;
static int segment(
lib::ObFTParserParam *param,
const char *fulltext,
const int64_t ft_len);
ObNgramFTParser();
virtual ~ObNgramFTParser();
// Binds the parser to param->cs_ and param->fulltext_; see the definition
// for the error codes.
int init(lib::ObFTParserParam *param);
// Clears all members and marks the parser uninitialized.
void reset();
// Outputs the next token (pointer, byte length, character count, frequency).
virtual int get_next_token(
const char *&word,
int64_t &word_len,
int64_t &char_len,
int64_t &word_freq) override;
VIRTUAL_TO_STRING_KV(KP_(cs), KP_(start), KP_(next), KP_(end), K_(is_inited));
private:
static int add_word(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len,
const int64_t char_cnt);
const ObCharsetInfo *cs_;   // charset used to measure character lengths
const char *start_;         // first byte of the token currently being built
const char *next_;          // scan cursor
const char *end_;           // one past the last byte of the fulltext
int64_t c_nums_;            // characters accumulated for the token in progress
bool is_inited_;            // set by init(), cleared by reset()
private:
DISABLE_COPY_ASSIGN(ObNgramFTParser);
};
@ -50,7 +57,8 @@ public:
virtual ~ObNgramFTParserDesc() = default;
virtual int init(lib::ObPluginParam *param) override;
virtual int deinit(lib::ObPluginParam *param) override;
virtual int segment(lib::ObFTParserParam *param) const override;
virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
OB_INLINE void reset() { is_inited_ = false; }
private:
bool is_inited_;

View File

@ -24,22 +24,74 @@ namespace storage
#define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_')
int ObSpaceFTParser::segment(
lib::ObFTParserParam *param,
const char *ft,
const int64_t ft_len)
// Builds an unbound parser: no charset, no text range, and not yet
// initialized. init() must be called before use.
ObSpaceFTParser::ObSpaceFTParser()
  : cs_(nullptr), start_(nullptr), next_(nullptr), end_(nullptr), is_inited_(false)
{
}
// Destructor: clears every member through reset(). Nothing is freed here; the
// members touched by reset() are plain pointers and a flag.
ObSpaceFTParser::~ObSpaceFTParser()
{
reset();
}
// Detaches the parser from its charset and text range and marks it
// uninitialized, so a later init() is accepted again.
void ObSpaceFTParser::reset()
{
  is_inited_ = false;
  end_ = nullptr;
  next_ = nullptr;
  start_ = nullptr;
  cs_ = nullptr;
}
// Binds this parser to the charset and fulltext range carried by `param`.
//
// Returns OB_INIT_TWICE if the parser is already initialized, and
// OB_INVALID_ARGUMENT if `param`, its charset or its fulltext pointer is null
// or the fulltext length is not positive. On success start_/next_ point at
// the first byte of the fulltext and end_ is one past its last byte.
//
// NOTE(review): this span is taken from a rendered diff, so lines of the
// previous static segment() body appear interleaved with the init() body.
int ObSpaceFTParser::init(lib::ObFTParserParam *param)
{
int ret = OB_SUCCESS;
const char *start = ft;
const char *next = start;
const char *end = start + ft_len;
int mbl = 0;
if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) {
if (OB_UNLIKELY(is_inited_)) {
ret = OB_INIT_TWICE;
LOG_WARN("init twice", K(ret), KPC(param), KPC(this));
} else if (OB_ISNULL(param)
|| OB_ISNULL(param->cs_)
|| OB_ISNULL(param->fulltext_)
|| OB_UNLIKELY(0 >= param->ft_length_)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len));
LOG_WARN("invalid arguments", K(ret), KPC(param));
} else {
const ObCharsetInfo *cs = param->cs_;
while (OB_SUCC(ret) && next < end) {
cs_ = param->cs_;
start_ = param->fulltext_;
next_ = start_;
end_ = start_ + param->ft_length_;
is_inited_ = true;
}
// Clear state only when this call did not leave the parser initialized; an
// OB_INIT_TWICE failure therefore keeps the existing binding intact.
if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) {
reset();
}
return ret;
}
int ObSpaceFTParser::get_next_token(
const char *&word,
int64_t &word_len,
int64_t &char_len,
int64_t &word_freq)
{
int ret = OB_SUCCESS;
int mbl = 0;
word = nullptr;
word_len = 0;
char_len = 0;
word_freq = 0;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("space ft parser isn't initialized", K(ret), K(is_inited_));
} else {
const char *start = start_;
const char *next = next_;
const char *end = end_;
const ObCharsetInfo *cs = cs_;
do {
while (next < end) {
int ctype;
mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end);
@ -62,34 +114,24 @@ int ObSpaceFTParser::segment(
++c_nums;
next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
}
if (0 < c_nums && OB_FAIL(add_word(param, start, next - start, c_nums))) {
LOG_WARN("fail to add word", K(ret), KPC(param), KP(start), K(next));
if (0 < c_nums) {
word = start;
word_len = next - start;
char_len = c_nums;
word_freq = 1;
start = next;
break;
} else {
start = next;
}
}
} while (OB_SUCC(ret) && next < end);
if (OB_ITER_END == ret || OB_SUCCESS == ret) {
start_ = start;
next_ = next;
end_ = end;
}
if (OB_ITER_END == ret) {
ret = OB_SUCCESS;
}
}
return ret;
}
int ObSpaceFTParser::add_word(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len,
const int64_t char_cnt)
{
int ret = OB_SUCCESS;
if (OB_ISNULL(param)
|| OB_ISNULL(word)
|| OB_UNLIKELY(0 >= word_len)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
} else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) {
LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word)));
LOG_DEBUG("next word", K(ObString(word_len, word)), KP(start_), KP(next_), KP(end_));
}
return ret;
}
@ -111,21 +153,43 @@ int ObWhiteSpaceFTParserDesc::deinit(lib::ObPluginParam *param)
return OB_SUCCESS;
}
int ObWhiteSpaceFTParserDesc::segment(lib::ObFTParserParam *param) const
int ObWhiteSpaceFTParserDesc::segment(
lib::ObFTParserParam *param,
lib::ObITokenIterator *&iter) const
{
int ret = OB_SUCCESS;
void *buf = nullptr;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_));
} else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid argument", K(ret), KPC(param));
} else if (OB_FAIL(ObSpaceFTParser::segment(param, param->fulltext_, param->ft_length_))) {
LOG_WARN("fail to segment words for fulltext by spaces", K(ret), KPC(param),
K(param->fulltext_), K(param->ft_length_));
} else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObSpaceFTParser)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("fail to allocate space ft parser", K(ret));
} else {
ObSpaceFTParser *parser = new (buf) ObSpaceFTParser();
if (OB_FAIL(parser->init(param))) {
LOG_WARN("fail to init whitespace fulltext parser", K(ret), KPC(param));
} else {
iter = parser;
}
}
return ret;
}
// Destroys an iterator produced by segment() and returns its memory to
// param->allocator_.
//
// @param param  the parameters used to create the iterator; aborts if it or
//               its allocator_ is null while `iter` is non-null.
// @param iter   iterator to release; a null `iter` is a no-op. Reset to
//               nullptr afterwards so the caller is not left with a dangling
//               pointer and a repeated call is harmless.
void ObWhiteSpaceFTParserDesc::free_token_iter(
    lib::ObFTParserParam *param,
    lib::ObITokenIterator *&iter) const
{
  if (OB_NOT_NULL(iter)) {
    abort_unless(nullptr != param);
    abort_unless(nullptr != param->allocator_);
    iter->~ObITokenIterator();
    param->allocator_->free(iter);
    iter = nullptr;
  }
}
} // end namespace storage
} // end namespace oceanbase

View File

@ -23,21 +23,27 @@ namespace oceanbase
namespace storage
{
// Token iterator that walks a fulltext buffer and hands out one word at a
// time through get_next_token().
//
// NOTE(review): this span is taken from a rendered diff, so the previous
// declaration (static segment()/add_word()) is interleaved with the new one.
class ObSpaceFTParser final
class ObSpaceFTParser final : public lib::ObITokenIterator
{
public:
ObSpaceFTParser() = default;
~ObSpaceFTParser() = default;
static int segment(
lib::ObFTParserParam *param,
const char *fulltext,
const int64_t ft_len);
ObSpaceFTParser();
virtual ~ObSpaceFTParser();
// Binds the parser to param->cs_ and param->fulltext_; see the definition
// for the error codes.
int init(lib::ObFTParserParam *param);
// Clears all members and marks the parser uninitialized.
void reset();
// Outputs the next token (pointer, byte length, character count, frequency).
virtual int get_next_token(
const char *&word,
int64_t &word_len,
int64_t &char_len,
int64_t &word_freq) override;
VIRTUAL_TO_STRING_KV(KP_(cs), KP_(start), KP_(next), KP_(end), K_(is_inited));
private:
static int add_word(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len,
const int64_t char_cnt);
const ObCharsetInfo *cs_;   // charset used to classify characters
const char *start_;         // first byte of the word currently being built
const char *next_;          // scan cursor
const char *end_;           // one past the last byte of the fulltext
bool is_inited_;            // set by init(), cleared by reset()
};
class ObWhiteSpaceFTParserDesc final : public lib::ObIFTParserDesc
@ -47,7 +53,8 @@ public:
virtual ~ObWhiteSpaceFTParserDesc() = default;
virtual int init(lib::ObPluginParam *param) override;
virtual int deinit(lib::ObPluginParam *param) override;
virtual int segment(lib::ObFTParserParam *param) const override;
virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
OB_INLINE void reset() { is_inited_ = false; }
private:
bool is_inited_;

View File

@ -19,6 +19,6 @@ OB_DECLARE_PLUGIN(mock_ft_parser)
OB_PLUGIN_AUTHOR_OCEANBASE,
"This is mock fulltext parser plugin.",
0x00001,
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE,
oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE,
&oceanbase::storage::mock_ft_parser,
};

View File

@ -27,7 +27,7 @@ public:
virtual ~ObMockFTParserDesc() = default;
virtual int init(lib::ObPluginParam *param) override;
virtual int deinit(lib::ObPluginParam *param) override;
virtual int segment(lib::ObFTParserParam *param) const override;
virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
};
int ObMockFTParserDesc::init(lib::ObPluginParam *param)
@ -42,7 +42,7 @@ int ObMockFTParserDesc::deinit(lib::ObPluginParam *param)
return OB_SUCCESS;
}
int ObMockFTParserDesc::segment(lib::ObFTParserParam *param) const
int ObMockFTParserDesc::segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const
{
UNUSED(param);
return OB_SUCCESS;

View File

@ -49,33 +49,19 @@ int segment_and_calc_word_count(
{
int ret = OB_SUCCESS;
int64_t doc_length = 0;
common::ObSEArray<ObFTWord, 256> words;
if (OB_ISNULL(helper)
|| OB_UNLIKELY(ObCollationType::CS_TYPE_INVALID == type
|| ObCollationType::CS_TYPE_EXTENDED_MARK < type)
|| OB_UNLIKELY(!words_count.created())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(helper), K(type), K(words_count.created()));
} else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words))) {
} else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words_count))) {
LOG_WARN("fail to segment", K(ret), KPC(helper), K(type), K(fulltext));
} else {
for (int64_t i = 0; OB_SUCC(ret) && i < words.count(); ++i) {
const ObFTWord &ft_word = words.at(i);
int64_t word_count = 0;
if (OB_FAIL(words_count.get_refactored(ft_word, word_count)) && OB_HASH_NOT_EXIST != ret) {
LOG_WARN("fail to get ft word", K(ret), K(ft_word));
} else {
word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count;
if (OB_FAIL(words_count.set_refactored(ft_word, word_count, 1/*overwrite*/))) {
LOG_WARN("fail to set ft word and count", K(ret), K(ft_word));
}
}
}
}
return ret;
}
class ObTestAddWord final : public lib::ObFTParserParam::ObIAddWord
class ObTestAddWord final
{
public:
static const char *TEST_FULLTEXT;
@ -85,14 +71,16 @@ public:
static const int64_t FT_MAX_WORD_LEN = 84;
public:
ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator);
virtual ~ObTestAddWord() = default;
virtual int operator()(
lib::ObFTParserParam *param,
~ObTestAddWord() = default;
int check_words(lib::ObITokenIterator *iter);
int64_t get_add_word_count() const { return ith_word_; }
static int64_t get_word_cnt_without_stopword() { return TEST_WORD_COUNT_WITHOUT_STOPWORD; }
VIRTUAL_TO_STRING_KV(K_(ith_word));
private:
int check_ith_word(
const char *word,
const int64_t word_len,
const int64_t char_cnt) override;
virtual int64_t get_add_word_count() const override { return ith_word_; }
VIRTUAL_TO_STRING_KV(K_(ith_word));
const int64_t char_cnt);
private:
bool is_min_max_word(const int64_t c_len) const;
int casedown_word(const ObFTWord &src, ObFTWord &dst);
@ -137,8 +125,32 @@ int ObTestAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst)
return ret;
}
int ObTestAddWord::operator()(
lib::ObFTParserParam *param,
// Drains `iter` and validates every token with check_ith_word().
//
// @param iter  token iterator to consume; must not be null.
// @return OB_SUCCESS once the iterator reports OB_ITER_END,
//         OB_INVALID_ARGUMENT for a null `iter`, or the first error returned
//         by get_next_token() / check_ith_word().
int ObTestAddWord::check_words(lib::ObITokenIterator *iter)
{
  int ret = OB_SUCCESS;
  if (OB_ISNULL(iter)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid arguments", K(ret), KP(iter));
  } else {
    const char *word = nullptr;
    int64_t word_len = 0;
    int64_t char_len = 0;
    int64_t word_freq = 0;
    while (OB_SUCC(ret)) {
      if (OB_FAIL(iter->get_next_token(word, word_len, char_len, word_freq))) {
        // OB_ITER_END is the normal end of iteration, not a failure worth a
        // warning; only real errors are logged.
        if (OB_ITER_END != ret) {
          LOG_WARN("fail to get next token", K(ret), KPC(iter));
        }
      } else if (OB_FAIL(check_ith_word(word, word_len, char_len))) {
        LOG_WARN("fail to check ith word", K(ret), KP(word), K(word_len), K(char_len));
      }
    }
    if (OB_ITER_END == ret) {
      ret = OB_SUCCESS;
    }
  }
  return ret;
}
int ObTestAddWord::check_ith_word(
const char *word,
const int64_t word_len,
const int64_t char_cnt)
@ -146,9 +158,9 @@ int ObTestAddWord::operator()(
int ret = OB_SUCCESS;
ObFTWord src_word(word_len, word, collation_type_);
ObFTWord dst_word;
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) {
if (OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len), K(char_cnt));
LOG_WARN("invalid arguments", K(ret), KP(word), K(word_len), K(char_cnt));
} else if (is_min_max_word(char_cnt)) {
// skip min/max word
} else if (OB_FAIL(casedown_word(src_word, dst_word))) {
@ -194,7 +206,6 @@ void TestDefaultFTParser::SetUp()
ASSERT_EQ(OB_SUCCESS, desc_.init(&plugin_param_));
ft_parser_param_.allocator_ = &allocator_;
ft_parser_param_.add_word_ = &add_word_;
ft_parser_param_.cs_ = common::ObCharset::get_charset(ObCollationType::CS_TYPE_UTF8MB4_BIN);
ft_parser_param_.parser_version_ = 0x00001;
ASSERT_TRUE(nullptr != ft_parser_param_.cs_);
@ -209,54 +220,74 @@ void TestDefaultFTParser::TearDown()
// Checks that ObSpaceFTParser::init() rejects a null param, a null fulltext,
// and a zero or negative length, then initializes over TEST_FULLTEXT and
// verifies the produced tokens with ObTestAddWord::check_words().
//
// NOTE(review): this span is taken from a rendered diff, so assertions against
// the previous static segment() are interleaved with the new ones.
TEST_F(TestDefaultFTParser, test_space_ft_parser_segment)
{
ObSpaceFTParser parser;
const char *fulltext = ObTestAddWord::TEST_FULLTEXT;
const int64_t ft_len = strlen(fulltext);
ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(nullptr, nullptr, 0));
ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, nullptr, 0));
ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, 0));
ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, -1));
ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(nullptr));
ft_parser_param_.fulltext_ = nullptr;
ft_parser_param_.ft_length_ = 0;
ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_));
ft_parser_param_.fulltext_ = fulltext;
ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_));
ft_parser_param_.ft_length_ = -1;
ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_));
ft_parser_param_.fulltext_ = fulltext;
ft_parser_param_.ft_length_ = ft_len;
LOG_INFO("before space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_));
ASSERT_EQ(OB_SUCCESS, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, ft_len));
ASSERT_EQ(OB_SUCCESS, parser.init(&ft_parser_param_));
ASSERT_EQ(OB_SUCCESS, add_word_.check_words(&parser));
LOG_INFO("after space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_));
}
// Regression test: feeds a byte sequence containing high-bit and control
// bytes to the space parser under the latin1_swedish_ci charset and iterates
// tokens until get_next_token() stops returning OB_SUCCESS. Tokens are only
// logged; the final return code is not asserted.
//
// NOTE(review): this span is taken from a rendered diff, so lines of the
// previous ObAddWord-based version are interleaved with the new ones.
TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268)
{
common::ObArray<ObFTWord> words;
ObAddWordFlag flag;
ObAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, flag, allocator_, words);
ObSpaceFTParser parser;
const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 ";
const int64_t ft_len = strlen(fulltext);
ft_parser_param_.fulltext_ = fulltext;
ft_parser_param_.ft_length_ = ft_len;
ft_parser_param_.add_word_ = &add_word;
ft_parser_param_.cs_ = common::ObCharset::get_charset(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI);
LOG_INFO("before space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_));
ASSERT_EQ(OB_SUCCESS, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, ft_len));
LOG_INFO("after space segment", KCSTRING(fulltext), K(words), K(ft_len), K(ft_parser_param_));
ASSERT_EQ(OB_SUCCESS, parser.init(&ft_parser_param_));
const char *word = nullptr;
int64_t word_len = 0;
int64_t char_len = 0;
int64_t word_freq = 0;
int ret = OB_SUCCESS;
// Drain the iterator; the loop ends on the first non-success return code.
while (OB_SUCC(ret)) {
if (OB_FAIL(parser.get_next_token(word, word_len, char_len, word_freq))) {
LOG_WARN("fail to get next token", K(ret), K(parser));
} else {
LOG_INFO("succeed to get next token", K(ret), K(ObString(word_len, word)), K(char_len));
}
}
LOG_INFO("after space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_));
}
// Exercises the parser descriptor end to end: segment() must reject a param
// without fulltext, hand out a working iterator for TEST_FULLTEXT, report
// OB_NOT_INIT after deinit(), and reject a null param after re-init.
TEST_F(TestDefaultFTParser, test_default_ft_parser_desc)
{
  ObITokenIterator *iter = nullptr;
  ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(&ft_parser_param_, iter));

  ft_parser_param_.fulltext_ = ObTestAddWord::TEST_FULLTEXT;
  ft_parser_param_.ft_length_ = strlen(ft_parser_param_.fulltext_);

  ASSERT_EQ(OB_SUCCESS, desc_.segment(&ft_parser_param_, iter));
  ASSERT_TRUE(nullptr != iter);
  ASSERT_EQ(OB_SUCCESS, add_word_.check_words(iter));
  // Release the iterator through the descriptor that created it; without this
  // the parser allocated by segment() is never destroyed or freed.
  desc_.free_token_iter(&ft_parser_param_, iter);
  iter = nullptr;

  ASSERT_EQ(OB_SUCCESS, desc_.deinit(&plugin_param_));
  ASSERT_EQ(OB_NOT_INIT, desc_.segment(&ft_parser_param_, iter));

  ASSERT_EQ(OB_SUCCESS, desc_.init(&plugin_param_));
  ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(nullptr, iter));
}
class ObTestFTPluginHelper : public ::testing::Test
@ -442,29 +473,35 @@ void ObTestFTParseHelper::TearDownTestCase()
TEST_F(ObTestFTParseHelper, test_parse_fulltext)
{
common::ObSEArray<ObFTWord, 16> words;
ObFTWordMap ft_word_map;
ASSERT_EQ(OB_SUCCESS, ft_word_map.create(10, "TestParse"));
int64_t doc_length = 0;
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
ObTestAddWord test_add_word(cs_type_, allocator_);
for (int64_t i = 0; i < words.count(); ++i) {
ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length()));
ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size());
for (int64_t i = 0; i < ft_word_map.size(); ++i) {
int64_t word_cnt = 0;
ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_);
ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt));
ASSERT_TRUE(word_cnt >= 1);
}
ObFTWordMap ft_word_map;
ASSERT_EQ(OB_SUCCESS, ft_word_map.create(words.count(), "TestParse"));
ft_word_map.clear();
ASSERT_EQ(OB_SUCCESS, segment_and_calc_word_count(allocator_, &parse_helper_,
cs_type_, ObTestAddWord::TEST_FULLTEXT, ft_word_map));
ASSERT_EQ(words.count(), ft_word_map.size());
ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size());
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, words));
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, words));
ft_word_map.clear();
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, ft_word_map));
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, ft_word_map));
parse_helper_.reset();
ft_word_map.clear();
ASSERT_EQ(OB_NOT_INIT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.init(nullptr, plugin_name_));
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.init(&allocator_, ObString()));
@ -472,9 +509,9 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext)
ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_));
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(CS_TYPE_INVALID, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(CS_TYPE_EXTENDED_MARK, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
ASSERT_EQ(OB_INIT_TWICE, parse_helper_.init(&allocator_, plugin_name_));
@ -484,57 +521,80 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext)
parse_helper_.reset();
ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_));
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
for (int64_t i = 0; i < words.count(); ++i) {
ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length()));
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size());
for (int64_t i = 0; i < ft_word_map.size(); ++i) {
int64_t word_cnt = 0;
ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_);
ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt));
ASSERT_TRUE(word_cnt >= 1);
}
parse_helper_.reset();
ft_word_map.clear();
ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, "beng.1"));
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size());
for (int64_t i = 0; i < ft_word_map.size(); ++i) {
int64_t word_cnt = 0;
ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_);
ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt));
ASSERT_TRUE(word_cnt >= 1);
}
}
// Checks word-length filtering in parse_helper_.segment(): each input is a
// single word, and the assertions expect no entry for the shortest and the
// longest input and exactly one entry for the inputs in between.
//
// NOTE(review): this span is taken from a rendered diff, so the previous
// array-based declarations and count() assertions are interleaved with the
// new ObFTWordMap-based ones.
TEST_F(ObTestFTParseHelper, test_min_and_max_word_len)
{
common::ObSEArray<ObFTWord, 16> words;
ObFTWordMap words;
ASSERT_EQ(OB_SUCCESS, words.create(10, "TestParse"));
int64_t doc_length = 0;
// word len = 2;
const char *word_len_2 = "ab";
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_2, std::strlen(word_len_2), doc_length, words));
ASSERT_EQ(0, words.count());
ASSERT_EQ(0, words.size());
// word len = 3;
const char *word_len_3 = "abc";
words.clear();
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_3, std::strlen(word_len_3), doc_length, words));
ASSERT_EQ(1, words.count());
ASSERT_EQ(1, words.size());
// word len = 4;
const char *word_len_4 = "abcd";
words.clear();
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_4, std::strlen(word_len_4), doc_length, words));
ASSERT_EQ(1, words.count());
ASSERT_EQ(1, words.size());
// word len = 76;
const char *word_len_76 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
words.clear();
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_76, std::strlen(word_len_76), doc_length, words));
ASSERT_EQ(1, words.count());
ASSERT_EQ(1, words.size());
// word len = 84;
const char *word_len_84 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz123456";
words.clear();
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_84, std::strlen(word_len_84), doc_length, words));
ASSERT_EQ(1, words.count());
ASSERT_EQ(1, words.size());
// word len = 85;
const char *word_len_85 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567";
words.clear();
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_85, std::strlen(word_len_85), doc_length, words));
ASSERT_EQ(0, words.count());
ASSERT_EQ(0, words.size());
}
class ObTestNgramFTParseHelper : public ::testing::Test
{
public:
static const char *name_;
static const int64_t TEST_WORD_COUNT = 29;
static const int64_t TEST_WORD_COUNT = 27;
typedef common::hash::ObHashMap<ObFTWord, int64_t> ObFTWordMap;
public:
ObTestNgramFTParseHelper();
virtual ~ObTestNgramFTParseHelper() = default;
static int64_t get_word_count() { return TEST_WORD_COUNT; }
static void SetUpTestCase();
static void TearDownTestCase();
@ -553,7 +613,7 @@ const char *ObTestNgramFTParseHelper::name_ = "ngram.1";
ObTestNgramFTParseHelper::ObTestNgramFTParseHelper()
: plugin_name_(STRLEN(name_), name_),
ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"},
ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"},
cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN),
allocator_()
{
@ -583,26 +643,33 @@ void ObTestNgramFTParseHelper::TearDownTestCase()
TEST_F(ObTestNgramFTParseHelper, test_parse_fulltext)
{
ObFTWordMap words;
ASSERT_EQ(OB_SUCCESS, words.create(10, "TestParse"));
int64_t doc_length = 0;
common::ObSEArray<ObFTWord, 16> words;
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
for (int64_t i = 0; i < words.count(); ++i) {
ASSERT_TRUE(0 == strncmp(ngram_words_[i], words[i].word_.ptr(), words[i].word_.length()));
ASSERT_EQ(get_word_count(), words.size());
for (int64_t i = 0; i < words.size(); ++i) {
int64_t word_cnt = 0;
ObFTWord word(strlen(ngram_words_[i]), ngram_words_[i], cs_type_);
ASSERT_EQ(OB_SUCCESS, words.get_refactored(word, word_cnt));
ASSERT_TRUE(word_cnt >= 1);
}
ObFTWordMap ft_word_map;
ASSERT_EQ(OB_SUCCESS, ft_word_map.create(words.count(), "TestParse"));
ASSERT_EQ(OB_SUCCESS, ft_word_map.create(10, "TestParse"));
ASSERT_EQ(OB_SUCCESS, segment_and_calc_word_count(allocator_, &parse_helper_,
cs_type_, ObTestAddWord::TEST_FULLTEXT, ft_word_map));
ASSERT_EQ(words.count(), ft_word_map.size() + 2);
ASSERT_EQ(words.size(), ft_word_map.size());
words.clear();
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, words));
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, words));
parse_helper_.reset();
words.clear();
ASSERT_EQ(OB_NOT_INIT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
@ -620,14 +687,19 @@ TEST_F(ObTestNgramFTParseHelper, test_parse_fulltext)
ASSERT_EQ(OB_INIT_TWICE, parse_helper_.init(&allocator_, plugin_name_));
parse_helper_.reset();
words.clear();
ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_));
parse_helper_.reset();
ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_));
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
for (int64_t i = 0; i < words.count(); ++i) {
ASSERT_TRUE(0 == strncmp(ngram_words_[i], words[i].word_.ptr(), words[i].word_.length()));
ASSERT_EQ(get_word_count(), words.size());
for (int64_t i = 0; i < words.size(); ++i) {
int64_t word_cnt = 0;
ObFTWord word(strlen(ngram_words_[i]), ngram_words_[i], cs_type_);
ASSERT_EQ(OB_SUCCESS, words.get_refactored(word, word_cnt));
ASSERT_TRUE(word_cnt >= 1);
}
}
@ -638,7 +710,7 @@ int main(int argc, char **argv)
{
system("rm -rf test_fts_plugin.log");
OB_LOGGER.set_file_name("test_fts_plugin.log", true);
OB_LOGGER.set_log_level("INFO");
OB_LOGGER.set_log_level("DEBUG");
oceanbase::storage::ObTestFTPluginHelper::file_name = argv[0];
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();