[FTS] Adjust plugin tokenizer interface for fulltext search
This commit is contained in:
57
deps/oblib/src/lib/ob_plugin.h
vendored
57
deps/oblib/src/lib/ob_plugin.h
vendored
@ -114,7 +114,7 @@ enum class ObPluginType : uint64_t
|
||||
// Plugin license types.
// A license value is accepted by the descriptor validity check when it lies in
// [OB_Mulan_PubL_V2_LICENSE, OB_MAX_PLUGIN_LICENSE_TYPE).
enum class ObPluginLicenseType : uint64_t
{
  // Previous spelling, kept as an alias with the same value so that plugins
  // written against the old name still compile.
  OB_MULAN_V2_LICENSE = 1,
  OB_Mulan_PubL_V2_LICENSE = OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
  OB_MAX_PLUGIN_LICENSE_TYPE = 2, // max plugin license type (exclusive upper bound)
};
|
||||
|
||||
@ -186,7 +186,7 @@ public:
|
||||
&& nullptr != author_
|
||||
&& nullptr != spec_
|
||||
&& PLUGIN_VERSION == version_
|
||||
&& (ObPluginLicenseType::OB_MULAN_V2_LICENSE <= license_
|
||||
&& (ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE <= license_
|
||||
&& license_ < ObPluginLicenseType::OB_MAX_PLUGIN_LICENSE_TYPE)
|
||||
&& nullptr != desc_;
|
||||
}
|
||||
@ -217,24 +217,9 @@ public:
|
||||
|
||||
class ObFTParserParam final
|
||||
{
|
||||
public:
|
||||
class ObIAddWord
|
||||
{
|
||||
public:
|
||||
ObIAddWord() = default;
|
||||
virtual ~ObIAddWord() = default;
|
||||
virtual int operator()(
|
||||
ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt) = 0;
|
||||
virtual int64_t get_add_word_count() const = 0;
|
||||
DECLARE_PURE_VIRTUAL_TO_STRING;
|
||||
};
|
||||
public:
|
||||
ObFTParserParam()
|
||||
: allocator_(nullptr),
|
||||
add_word_(nullptr),
|
||||
cs_(nullptr),
|
||||
fulltext_(nullptr),
|
||||
ft_length_(0),
|
||||
@ -245,36 +230,42 @@ public:
|
||||
inline bool is_valid() const
|
||||
{
|
||||
return nullptr != allocator_
|
||||
&& nullptr != add_word_
|
||||
&& nullptr != cs_
|
||||
&& nullptr != fulltext_
|
||||
&& 0 < ft_length_
|
||||
&& 0 <= parser_version_;
|
||||
}
|
||||
inline int add_word(ObFTParserParam *param, const char *word, const int64_t word_len, const int64_t char_cnt)
|
||||
{
|
||||
return (*add_word_)(param, word, word_len, char_cnt);
|
||||
}
|
||||
inline void reset()
|
||||
{
|
||||
allocator_ = nullptr;
|
||||
add_word_ = nullptr;
|
||||
cs_ = nullptr;
|
||||
fulltext_ = nullptr;
|
||||
ft_length_ = 0;
|
||||
parser_version_ = 0;
|
||||
}
|
||||
|
||||
TO_STRING_KV(KP_(allocator), KP_(add_word), KP_(cs), K_(fulltext), K_(ft_length), K_(parser_version));
|
||||
TO_STRING_KV(KP_(allocator), KP_(cs), K_(fulltext), K_(ft_length), K_(parser_version));
|
||||
public:
|
||||
common::ObIAllocator *allocator_;
|
||||
ObIAddWord *add_word_;
|
||||
const ObCharsetInfo *cs_;
|
||||
const char *fulltext_;
|
||||
int64_t ft_length_;
|
||||
int64_t parser_version_;
|
||||
};
|
||||
|
||||
class ObITokenIterator
|
||||
{
|
||||
public:
|
||||
ObITokenIterator() = default;
|
||||
virtual ~ObITokenIterator() = default;
|
||||
virtual int get_next_token(
|
||||
const char *&word,
|
||||
int64_t &word_len,
|
||||
int64_t &char_cnt,
|
||||
int64_t &word_freq) = 0;
|
||||
DECLARE_PURE_VIRTUAL_TO_STRING;
|
||||
};
|
||||
|
||||
// fulltext parser descriptor interface for domain index
|
||||
// - splits a document into a sequence of tokens.
|
||||
class ObIFTParserDesc : public ObIPluginDesc
|
||||
@ -286,12 +277,22 @@ public:
|
||||
/**
|
||||
* split fulltext into multiple word segments
|
||||
*
|
||||
* @param[in] fulltext, the document to be tokenized.
|
||||
* @param[out] words, the word segmentation after splitting.
|
||||
* @param[in] param, the document to be tokenized and parameters related to word segmentation.
|
||||
* @param[out] iter, the tokenized words' iterator.
|
||||
*
|
||||
* @return error code, such as, OB_SUCCESS, OB_INVALID_ARGUMENT, ...
|
||||
*/
|
||||
virtual int segment(ObFTParserParam *param) const = 0;
|
||||
virtual int segment(ObFTParserParam *param, ObITokenIterator *&iter) const = 0;
|
||||
|
||||
/**
|
||||
* Release resources held by the iterator and free token iterator.
|
||||
*/
|
||||
virtual void free_token_iter(ObFTParserParam *param, ObITokenIterator *&iter) const
|
||||
{
|
||||
if (OB_NOT_NULL(iter)) {
|
||||
iter->~ObITokenIterator();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace lib
|
||||
|
||||
@ -207,30 +207,16 @@ int ObDASDomainUtils::generate_spatial_index_rows(
|
||||
ObFTWordMap &words_count)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
common::ObSEArray<ObFTWord, 256> words;
|
||||
if (OB_ISNULL(helper)
|
||||
|| OB_UNLIKELY(ObCollationType::CS_TYPE_INVALID == type
|
||||
|| ObCollationType::CS_TYPE_EXTENDED_MARK < type)
|
||||
|| OB_UNLIKELY(!words_count.created())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(helper), K(type), K(words_count.created()));
|
||||
} else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words))) {
|
||||
} else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words_count))) {
|
||||
LOG_WARN("fail to segment", K(ret), KPC(helper), K(type), K(fulltext));
|
||||
} else {
|
||||
for (int64_t i = 0; OB_SUCC(ret) && i < words.count(); ++i) {
|
||||
const ObFTWord &ft_word = words.at(i);
|
||||
int64_t word_count = 0;
|
||||
if (OB_FAIL(words_count.get_refactored(ft_word, word_count)) && OB_HASH_NOT_EXIST != ret) {
|
||||
LOG_WARN("fail to get ft word", K(ret), K(ft_word));
|
||||
} else {
|
||||
word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count;
|
||||
if (OB_FAIL(words_count.set_refactored(ft_word, word_count, 1/*overwrite*/))) {
|
||||
LOG_WARN("fail to set ft word and count", K(ret), K(ft_word));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
STORAGE_FTS_LOG(DEBUG, "segment and calc word count", K(ret), K(words), K(type));
|
||||
STORAGE_FTS_LOG(DEBUG, "segment and calc word count", K(ret), K(words_count.size()), K(type));
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -484,6 +470,7 @@ void ObDomainDMLIterator::reset()
|
||||
row_projector_ = nullptr;
|
||||
das_ctdef_ = nullptr;
|
||||
main_ctdef_ = nullptr;
|
||||
allocator_.reset();
|
||||
}
|
||||
|
||||
void ObDomainDMLIterator::set_ctdef(
|
||||
@ -520,10 +507,12 @@ int ObDomainDMLIterator::get_next_domain_row(ObNewRow *&row)
|
||||
while (OB_SUCC(ret) && !got_row) {
|
||||
if (row_idx_ >= rows_.count()) {
|
||||
rows_.reuse();
|
||||
allocator_.reuse();
|
||||
row_idx_ = 0;
|
||||
if (OB_UNLIKELY(!das_ctdef_->table_param_.get_data_table().is_domain_index())) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("unexpected error, not domain index", K(ret), K(das_ctdef_->table_param_.get_data_table()));
|
||||
|
||||
} else if (FAILEDx(write_iter_.get_next_row(sr))) {
|
||||
if (OB_ITER_END != ret) {
|
||||
LOG_WARN("get next row from result iterator failed", K(ret));
|
||||
@ -562,6 +551,7 @@ int ObDomainDMLIterator::get_next_domain_rows(ObNewRow *&row, int64_t &row_count
|
||||
while (OB_SUCC(ret) && !got_row) {
|
||||
if (row_idx_ >= rows_.count()) {
|
||||
rows_.reuse();
|
||||
allocator_.reuse();
|
||||
row_idx_ = 0;
|
||||
if (OB_UNLIKELY(!das_ctdef_->table_param_.get_data_table().is_domain_index())) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
@ -757,7 +747,7 @@ int ObFTDMLIterator::get_ft_and_doc_id(
|
||||
const ObChunkDatumStore::StoredRow *store_row,
|
||||
ObString &doc_id,
|
||||
ObString &ft,
|
||||
common::ObObjMeta &ft_meta) const
|
||||
common::ObObjMeta &ft_meta)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
const uint64_t doc_id_col_id = das_ctdef_->table_param_.get_data_table().get_doc_id_col_id();
|
||||
@ -793,7 +783,7 @@ int ObFTDMLIterator::get_ft_and_doc_id_for_update(
|
||||
const ObChunkDatumStore::StoredRow *store_row,
|
||||
ObString &doc_id,
|
||||
ObString &ft,
|
||||
common::ObObjMeta &ft_meta) const
|
||||
common::ObObjMeta &ft_meta)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
const uint64_t rowkey_col_cnt = das_ctdef_->table_param_.get_data_table().get_rowkey_column_num();
|
||||
@ -863,7 +853,7 @@ int ObMultivalueDMLIterator::get_multivlaue_json_data(
|
||||
const ObChunkDatumStore::StoredRow *store_row,
|
||||
int64_t& multivalue_idx,
|
||||
int64_t& multivalue_arr_idx,
|
||||
ObString &multivalue_data) const
|
||||
ObString &multivalue_data)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
multivalue_idx = OB_INVALID_ID;
|
||||
@ -910,7 +900,7 @@ int ObMultivalueDMLIterator::get_multivlaue_json_data_for_update(
|
||||
const ObChunkDatumStore::StoredRow *store_row,
|
||||
int64_t& multivalue_idx,
|
||||
int64_t& multivalue_arr_idx,
|
||||
ObString &multivalue_data) const
|
||||
ObString &multivalue_data)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
bool found = false;
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
#ifndef OCEANBASE_DAS_DOMAIN_UTILS_H
|
||||
#define OCEANBASE_DAS_DOMAIN_UTILS_H
|
||||
|
||||
#include "lib/allocator/page_arena.h"
|
||||
#include "lib/hash/ob_hashset.h"
|
||||
#include "sql/das/ob_das_dml_ctx_define.h"
|
||||
#include "storage/fts/ob_fts_plugin_helper.h"
|
||||
@ -56,8 +57,6 @@ public:
|
||||
const IntFixedArray &row_projector,
|
||||
const ObDASWriteBuffer::DmlRow &dml_row,
|
||||
ObDomainIndexRow &domain_rows);
|
||||
private:
|
||||
typedef common::hash::ObHashMap<ObFTWord, int64_t> ObFTWordMap;
|
||||
private:
|
||||
static int segment_and_calc_word_count(
|
||||
common::ObIAllocator &allocator,
|
||||
@ -126,7 +125,7 @@ protected:
|
||||
ObDASWriteBuffer::Iterator &write_iter_;
|
||||
const ObDASDMLBaseCtDef *das_ctdef_;
|
||||
const ObDASDMLBaseCtDef *main_ctdef_;
|
||||
common::ObIAllocator &allocator_;
|
||||
common::ObArenaAllocator allocator_;
|
||||
bool is_update_;
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(ObDomainDMLIterator);
|
||||
@ -178,13 +177,13 @@ private:
|
||||
const ObChunkDatumStore::StoredRow *store_row,
|
||||
int64_t& multivalue_idx,
|
||||
int64_t& multivalue_arr_idx,
|
||||
ObString &multivalue_data) const;
|
||||
ObString &multivalue_data);
|
||||
|
||||
int get_multivlaue_json_data_for_update(
|
||||
const ObChunkDatumStore::StoredRow *store_row,
|
||||
int64_t& multivalue_idx,
|
||||
int64_t& multivalue_arr_idx,
|
||||
ObString &multivalue_data) const;
|
||||
ObString &multivalue_data);
|
||||
};
|
||||
|
||||
|
||||
@ -214,12 +213,12 @@ protected:
|
||||
const ObChunkDatumStore::StoredRow *store_row,
|
||||
ObString &doc_id,
|
||||
ObString &ft,
|
||||
common::ObObjMeta &ft_meta) const;
|
||||
common::ObObjMeta &ft_meta);
|
||||
int get_ft_and_doc_id_for_update(
|
||||
const ObChunkDatumStore::StoredRow *store_row,
|
||||
ObString &doc_id,
|
||||
ObString &ft,
|
||||
common::ObObjMeta &ft_meta) const;
|
||||
common::ObObjMeta &ft_meta);
|
||||
|
||||
private:
|
||||
storage::ObFTParseHelper ft_parse_helper_;
|
||||
|
||||
@ -312,21 +312,9 @@ int ObTextRetrievalMerge::init_query_tokens(const ObDASIRScanCtDef *ir_ctdef, Ob
|
||||
} else if (OB_FAIL(token_map.create(ft_word_bkt_cnt, common::ObMemAttr(MTL_ID(), "FTWordMap")))) {
|
||||
LOG_WARN("failed to create token map", K(ret));
|
||||
} else if (OB_FAIL(tokenize_helper.segment(
|
||||
cs_type, search_text_string.ptr(), search_text_string.length(), doc_length, tokens))) {
|
||||
cs_type, search_text_string.ptr(), search_text_string.length(), doc_length, token_map))) {
|
||||
LOG_WARN("failed to segment");
|
||||
} else {
|
||||
for (int64_t i = 0; OB_SUCC(ret) && i < tokens.count(); ++i) {
|
||||
const ObFTWord &token = tokens.at(i);
|
||||
int64_t word_count = 0;
|
||||
if (OB_FAIL(token_map.get_refactored(token, word_count)) && OB_HASH_NOT_EXIST != ret) {
|
||||
LOG_WARN("fail to get ft word", K(ret), K(token));
|
||||
} else {
|
||||
word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count;
|
||||
if (OB_FAIL(token_map.set_refactored(token, word_count, 1/*overwrite*/))) {
|
||||
LOG_WARN("fail to set ft word and count", K(ret), K(token));
|
||||
}
|
||||
}
|
||||
}
|
||||
for (hash::ObHashMap<ObFTWord, int64_t>::const_iterator iter = token_map.begin();
|
||||
OB_SUCC(ret) && iter != token_map.end();
|
||||
++iter) {
|
||||
|
||||
@ -22,70 +22,43 @@ namespace oceanbase
|
||||
namespace storage
|
||||
{
|
||||
|
||||
/*static*/ int ObBEngFTParser::segment(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *ft,
|
||||
const int64_t ft_len)
|
||||
int ObBEngFTParser::get_next_token(
|
||||
const char *&word,
|
||||
int64_t &word_len,
|
||||
int64_t &char_len,
|
||||
int64_t &word_freq)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
ObDatum doc;
|
||||
doc.set_string(ft, ft_len);
|
||||
ObBEngFTParser parser;
|
||||
share::ObITokenStream *token_stream = nullptr;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len));
|
||||
} else if (OB_FAIL(parser.init(param))) {
|
||||
LOG_WARN("fail to initialize basic english parser", K(ret), KPC(param));
|
||||
} else if (FALSE_IT(doc.set_string(ft, ft_len))) {
|
||||
} else if (OB_FAIL(parser.segment(doc, token_stream))) {
|
||||
LOG_WARN("fail to segment fulltext by parser", K(ret), KP(ft), K(ft_len));
|
||||
} else if (OB_ISNULL(token_stream)) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("token stream is nullptr", K(ret), KP(token_stream));
|
||||
} else {
|
||||
ObDatum token;
|
||||
int64_t token_freq = 0;
|
||||
while (OB_SUCC(ret)) {
|
||||
if (OB_FAIL(token_stream->get_next(token, token_freq))) {
|
||||
if (OB_ITER_END != ret) {
|
||||
LOG_WARN("fail to get next token", K(ret), KPC(token_stream));
|
||||
}
|
||||
} else if (OB_FAIL(add_word(param, param->allocator_, token.ptr_, token.len_))) {
|
||||
LOG_WARN("fail to add word", K(ret), K(token), KPC(param));
|
||||
}
|
||||
}
|
||||
if (OB_ITER_END == ret) {
|
||||
ret = OB_SUCCESS;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*static*/ int ObBEngFTParser::add_word(
|
||||
lib::ObFTParserParam *param,
|
||||
common::ObIAllocator *allocator,
|
||||
const char *word,
|
||||
int64_t word_len)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
char *buf = nullptr;
|
||||
if (OB_ISNULL(param)
|
||||
|| OB_ISNULL(allocator)
|
||||
|| OB_ISNULL(word)
|
||||
|| OB_UNLIKELY(0 >= word_len)) {
|
||||
word = nullptr;
|
||||
word_len = 0;
|
||||
char_len = 0;
|
||||
word_freq = 0;
|
||||
if (OB_UNLIKELY(!is_inited_)) {
|
||||
ret = OB_NOT_INIT;
|
||||
LOG_WARN("beng ft parser isn't initialized", K(ret), K(is_inited_));
|
||||
} else if (OB_ISNULL(token_stream_)) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("token stream is nullptr", K(ret), KP(token_stream_));
|
||||
} else if (OB_FAIL(token_stream_->get_next(token, token_freq))) {
|
||||
if (OB_ITER_END != ret) {
|
||||
LOG_WARN("fail to get next token", K(ret), KPC(token_stream_));
|
||||
}
|
||||
} else if (OB_ISNULL(token.ptr_) || OB_UNLIKELY(0 >= token.len_ || 0 >= token_freq)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(allocator), KP(word), K(word_len));
|
||||
} else if (word_len < FT_MIN_WORD_LEN || word_len > FT_MAX_WORD_LEN) {
|
||||
LOG_DEBUG("skip too small or large word", K(ret), K(word_len));
|
||||
} else if (OB_ISNULL(buf = static_cast<char *>(allocator->alloc(word_len)))) {
|
||||
LOG_WARN("invalid arguments", K(ret), KP(token.ptr_), K(token.len_), K(token_freq));
|
||||
} else if (OB_ISNULL(buf = static_cast<char *>(allocator_.alloc(token.len_)))) {
|
||||
ret = OB_ALLOCATE_MEMORY_FAILED;
|
||||
LOG_WARN("fail to allocate word memory", K(ret), K(word_len));
|
||||
} else if (FALSE_IT(MEMCPY(buf, word, word_len))) {
|
||||
} else if (OB_FAIL(param->add_word(param, buf, word_len, word_len))) {
|
||||
LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, buf)), K(ObString(word_len, word)));
|
||||
LOG_WARN("fail to allocate word memory", K(ret), K(token.len_));
|
||||
} else {
|
||||
LOG_DEBUG("succeed to add word", K(ObString(word_len, word)));
|
||||
MEMCPY(buf, token.ptr_, token.len_);
|
||||
word = buf;
|
||||
word_len = token.len_;
|
||||
char_len = token.len_;
|
||||
word_freq = token_freq;
|
||||
LOG_DEBUG("succeed to add word", K(ObString(word_len, word)), K(word_freq));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@ -103,13 +76,20 @@ int ObBEngFTParser::init(lib::ObFTParserParam *param)
|
||||
ret = OB_NOT_SUPPORTED;
|
||||
LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_));
|
||||
} else {
|
||||
doc_.set_string(param->fulltext_, param->ft_length_);
|
||||
analysis_ctx_.cs_ = param->cs_;
|
||||
analysis_ctx_.filter_stopword_ = false;
|
||||
analysis_ctx_.need_grouping_ = false;
|
||||
if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) {
|
||||
LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_));
|
||||
} else if (OB_FAIL(segment(doc_, token_stream_))) {
|
||||
LOG_WARN("fail to segment fulltext by parser", K(ret), KP(param->fulltext_), K(param->ft_length_));
|
||||
} else if (OB_ISNULL(token_stream_)) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("token stream is nullptr", K(ret), KP(token_stream_));
|
||||
} else {
|
||||
is_inited_ = true;
|
||||
LOG_DEBUG("succeed to init beng parser", K(ret), K(english_analyzer_), KPC(token_stream_), K(doc_));
|
||||
}
|
||||
}
|
||||
if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) {
|
||||
@ -139,6 +119,8 @@ void ObBEngFTParser::reset()
|
||||
{
|
||||
analysis_ctx_.reset();
|
||||
english_analyzer_.reset();
|
||||
doc_.reset();
|
||||
token_stream_ = nullptr;
|
||||
is_inited_ = false;
|
||||
}
|
||||
|
||||
@ -159,20 +141,43 @@ int ObBasicEnglishFTParserDesc::deinit(lib::ObPluginParam *param)
|
||||
return OB_SUCCESS;
|
||||
}
|
||||
|
||||
int ObBasicEnglishFTParserDesc::segment(lib::ObFTParserParam *param) const
|
||||
int ObBasicEnglishFTParserDesc::segment(
|
||||
lib::ObFTParserParam *param,
|
||||
lib::ObITokenIterator *&iter) const
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
void *buf = nullptr;
|
||||
if (OB_UNLIKELY(!is_inited_)) {
|
||||
ret = OB_NOT_INIT;
|
||||
LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_));
|
||||
} else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid argument", K(ret), KPC(param));
|
||||
} else if (OB_FAIL(ObBEngFTParser::segment(param, param->fulltext_, param->ft_length_))) {
|
||||
LOG_WARN("fail to segment words for fulltext by beng", K(ret), KPC(param),
|
||||
K(param->fulltext_), K(param->ft_length_));
|
||||
} else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObBEngFTParser)))) {
|
||||
ret = OB_ALLOCATE_MEMORY_FAILED;
|
||||
LOG_WARN("fail to allocate basic english ft parser", K(ret));
|
||||
} else {
|
||||
ObBEngFTParser *parser = new (buf) ObBEngFTParser(*(param->allocator_));
|
||||
if (OB_FAIL(parser->init(param))) {
|
||||
LOG_WARN("fail to init basic english parser", K(ret), KPC(param));
|
||||
} else {
|
||||
iter = parser;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
 * Destroy a token iterator created by segment() and return its memory to
 * param->allocator_.
 *
 * A null iter is a no-op. For a non-null iter, param and param->allocator_
 * must be non-null; the process aborts otherwise.
 *
 * @param[in] param, the parameter object whose allocator_ is used to free the iterator.
 * @param[in,out] iter, the iterator to release; set to nullptr on return.
 */
void ObBasicEnglishFTParserDesc::free_token_iter(
    lib::ObFTParserParam *param,
    lib::ObITokenIterator *&iter) const
{
  if (OB_NOT_NULL(iter)) {
    abort_unless(nullptr != param);
    abort_unless(nullptr != param->allocator_);
    iter->~ObITokenIterator();
    param->allocator_->free(iter);
    // The memory is gone; do not leave a dangling pointer in the caller's variable.
    iter = nullptr;
  }
}
|
||||
|
||||
} // end namespace storage
|
||||
|
||||
@ -23,40 +23,41 @@ namespace oceanbase
|
||||
namespace storage
|
||||
{
|
||||
|
||||
class ObBEngFTParser final
|
||||
class ObBEngFTParser final : public lib::ObITokenIterator
|
||||
{
|
||||
public:
|
||||
static const int64_t FT_MIN_WORD_LEN = 3;
|
||||
static const int64_t FT_MAX_WORD_LEN = 84;
|
||||
public:
|
||||
static int segment(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *fulltext,
|
||||
const int64_t ft_len);
|
||||
|
||||
private:
|
||||
ObBEngFTParser()
|
||||
: analysis_ctx_(),
|
||||
explicit ObBEngFTParser(common::ObIAllocator &allocator)
|
||||
: allocator_(allocator),
|
||||
analysis_ctx_(),
|
||||
english_analyzer_(),
|
||||
doc_(),
|
||||
token_stream_(nullptr),
|
||||
is_inited_(false)
|
||||
{}
|
||||
~ObBEngFTParser() = default;
|
||||
~ObBEngFTParser() { reset(); }
|
||||
|
||||
static int add_word(
|
||||
lib::ObFTParserParam *param,
|
||||
common::ObIAllocator *allocator,
|
||||
const char *word,
|
||||
int64_t word_len);
|
||||
int init(lib::ObFTParserParam *param);
|
||||
void reset();
|
||||
virtual int get_next_token(
|
||||
const char *&word,
|
||||
int64_t &word_len,
|
||||
int64_t &char_len,
|
||||
int64_t &word_freq) override;
|
||||
|
||||
VIRTUAL_TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), KP_(token_stream), K_(is_inited));
|
||||
private:
|
||||
int segment(
|
||||
const common::ObDatum &doc,
|
||||
share::ObITokenStream *&token_stream);
|
||||
TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), K_(is_inited));
|
||||
|
||||
private:
|
||||
common::ObIAllocator &allocator_;
|
||||
share::ObTextAnalysisCtx analysis_ctx_;
|
||||
share::ObEnglishTextAnalyzer english_analyzer_;
|
||||
common::ObDatum doc_;
|
||||
share::ObITokenStream *token_stream_;
|
||||
bool is_inited_;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(ObBEngFTParser);
|
||||
@ -69,7 +70,8 @@ public:
|
||||
virtual ~ObBasicEnglishFTParserDesc() = default;
|
||||
virtual int init(lib::ObPluginParam *param) override;
|
||||
virtual int deinit(lib::ObPluginParam *param) override;
|
||||
virtual int segment(lib::ObFTParserParam *param) const override;
|
||||
virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
|
||||
virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
|
||||
OB_INLINE void reset() { is_inited_ = false; }
|
||||
private:
|
||||
bool is_inited_;
|
||||
|
||||
@ -26,7 +26,7 @@ OB_DECLARE_PLUGIN(whitespace_parser)
|
||||
OB_PLUGIN_AUTHOR_OCEANBASE, // author
|
||||
"This is a default whitespace parser plugin.", // brief specification
|
||||
0x00001, // version
|
||||
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
|
||||
oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license
|
||||
&oceanbase::storage::whitespace_parser, // default space parser plugin instance
|
||||
};
|
||||
|
||||
@ -41,13 +41,13 @@ OB_DECLARE_PLUGIN(ngram_parser)
|
||||
OB_PLUGIN_AUTHOR_OCEANBASE, // author
|
||||
"This is a ngram fulltext parser plugin.", // brief specification
|
||||
0x00001, // version
|
||||
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
|
||||
oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license
|
||||
&oceanbase::storage::ngram_parser, // ngram parser plugin instance
|
||||
};
|
||||
|
||||
OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInNgramFTParser, ngram_parser);
|
||||
|
||||
///////////////////////////////////// Default fulltext parser //////////////////////////////////////////
|
||||
///////////////////////////////////// BEng fulltext parser //////////////////////////////////////////
|
||||
|
||||
OB_DECLARE_PLUGIN(beng_parser)
|
||||
{
|
||||
@ -56,8 +56,8 @@ OB_DECLARE_PLUGIN(beng_parser)
|
||||
OB_PLUGIN_AUTHOR_OCEANBASE, // author
|
||||
"This is a basic english parser plugin.", // brief specification
|
||||
0x00001, // version
|
||||
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
|
||||
&oceanbase::storage::beng_parser, // default space parser plugin instance
|
||||
oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE, // Mulan PubL v2 license
|
||||
&oceanbase::storage::beng_parser, // basic english parser plugin instance
|
||||
};
|
||||
|
||||
OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInBEngFTParser, beng_parser);
|
||||
|
||||
@ -119,7 +119,7 @@ int ObFTParseHelper::segment(
|
||||
const char *ft,
|
||||
const int64_t ft_len,
|
||||
common::ObIAllocator &allocator,
|
||||
lib::ObFTParserParam::ObIAddWord &add_word)
|
||||
ObAddWord &add_word)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_UNLIKELY(parser_version < 0 || nullptr == parser_desc || nullptr == cs || nullptr == ft || 0 >= ft_len)) {
|
||||
@ -127,14 +127,38 @@ int ObFTParseHelper::segment(
|
||||
LOG_WARN("invalid arguments", K(ret), K(parser_version), KP(parser_desc), KP(cs), K(ft), K(ft_len));
|
||||
} else {
|
||||
lib::ObFTParserParam param;
|
||||
lib::ObITokenIterator *iter = nullptr;
|
||||
param.allocator_ = &allocator;
|
||||
param.add_word_ = &add_word;
|
||||
param.cs_ = cs;
|
||||
param.fulltext_ = ft;
|
||||
param.ft_length_ = ft_len;
|
||||
param.parser_version_ = parser_version;
|
||||
if (OB_FAIL(parser_desc->segment(¶m))) {
|
||||
if (OB_FAIL(parser_desc->segment(¶m, iter))) {
|
||||
LOG_WARN("fail to segment", K(ret), K(param));
|
||||
} else if (OB_ISNULL(iter)) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("unexpected error, token iterator is nullptr", K(ret), KP(iter));
|
||||
} else {
|
||||
const char *word = nullptr;
|
||||
int64_t word_len = 0;
|
||||
int64_t char_cnt = 0;
|
||||
int64_t word_freq = 0;
|
||||
while (OB_SUCC(ret)) {
|
||||
if (OB_FAIL(iter->get_next_token(word, word_len, char_cnt, word_freq))) {
|
||||
if (OB_ITER_END != ret) {
|
||||
LOG_WARN("fail to get next token", K(ret), KPC(iter));
|
||||
}
|
||||
} else if (OB_FAIL(add_word.process_word(word, word_len, char_cnt, word_freq))) {
|
||||
LOG_WARN("fail to process one word", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq));
|
||||
}
|
||||
}
|
||||
if (OB_ITER_END == ret) {
|
||||
ret = OB_SUCCESS;
|
||||
}
|
||||
}
|
||||
if (OB_NOT_NULL(iter)) {
|
||||
parser_desc->free_token_iter(¶m, iter);
|
||||
iter = nullptr;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
@ -176,11 +200,10 @@ int ObFTParseHelper::init(
|
||||
LOG_WARN("unexpected error, parse handler is nullptr", K(ret), KP(parse_handler));
|
||||
} else if (OB_FAIL(get_fulltext_parser_desc(*parse_handler, parser_desc_))) {
|
||||
LOG_WARN("fail to get fulltext parser descriptor", K(ret), KPC(parse_handler));
|
||||
} else if (OB_FAIL(set_add_word_flag(parser_name_))) {
|
||||
LOG_WARN("fail to set add word flag", K(ret), K(parser_name_));
|
||||
} else {
|
||||
plugin_param_.desc_ = parser_desc_;
|
||||
if (need_min_max_word(parser_name_)) { add_word_flag_.set_min_max_word(); }
|
||||
if (need_castdn(parser_name_)) { add_word_flag_.set_casedown(); }
|
||||
if (need_stopword_list(parser_name_)) { add_word_flag_.set_stop_word(); }
|
||||
allocator_ = allocator;
|
||||
is_inited_ = true;
|
||||
}
|
||||
@ -204,7 +227,7 @@ int ObFTParseHelper::segment(
|
||||
const char *fulltext,
|
||||
const int64_t fulltext_len,
|
||||
int64_t &doc_length,
|
||||
common::ObIArray<ObFTWord> &words) const
|
||||
ObFTWordMap &words) const
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
const ObCharsetInfo *cs = nullptr;
|
||||
@ -231,29 +254,34 @@ int ObFTParseHelper::segment(
|
||||
doc_length = add_word.get_add_word_count();
|
||||
}
|
||||
}
|
||||
LOG_DEBUG("ft parse segment", K(ret), K(type), K(ObString(fulltext_len, fulltext)), K(words));
|
||||
LOG_DEBUG("ft parse segment", K(ret), K(type), K(add_word_flag_), K(parser_name_),
|
||||
K(ObString(fulltext_len, fulltext)), K(words.size()));
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool ObFTParseHelper::need_stopword_list(const ObFTParser &parser)
|
||||
int ObFTParseHelper::set_add_word_flag(const ObFTParser &parser)
|
||||
{
|
||||
share::ObPluginName space("space");
|
||||
share::ObPluginName beng("beng");
|
||||
return parser.get_parser_name() == space || parser.get_parser_name() == beng;
|
||||
}
|
||||
|
||||
bool ObFTParseHelper::need_min_max_word(const ObFTParser &parser)
|
||||
{
|
||||
share::ObPluginName space("space");
|
||||
share::ObPluginName beng("beng");
|
||||
return parser.get_parser_name() == space || parser.get_parser_name() == beng;
|
||||
}
|
||||
|
||||
bool ObFTParseHelper::need_castdn(const ObFTParser &parser)
|
||||
{
|
||||
share::ObPluginName space("space");
|
||||
share::ObPluginName ngram("ngram");
|
||||
return parser.get_parser_name() == space || parser.get_parser_name() == ngram;
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_UNLIKELY(!parser.is_valid())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), K(parser));
|
||||
} else if (share::ObPluginName("space") == parser.get_parser_name()) {
|
||||
add_word_flag_.set_min_max_word();
|
||||
add_word_flag_.set_stop_word();
|
||||
add_word_flag_.set_casedown();
|
||||
add_word_flag_.set_groupby_word();
|
||||
} else if (share::ObPluginName("beng") == parser.get_parser_name()) {
|
||||
add_word_flag_.set_min_max_word();
|
||||
add_word_flag_.set_stop_word();
|
||||
add_word_flag_.set_groupby_word();
|
||||
} else if (share::ObPluginName("ngram") == parser.get_parser_name()) {
|
||||
add_word_flag_.set_casedown();
|
||||
add_word_flag_.set_groupby_word();
|
||||
} else {
|
||||
ret = OB_NOT_SUPPORTED;
|
||||
LOG_WARN("unsupported parser for fulltext search", K(ret), K(parser));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // end namespace storage
|
||||
|
||||
@ -25,6 +25,8 @@ namespace oceanbase
|
||||
namespace storage
|
||||
{
|
||||
|
||||
class ObAddWord;
|
||||
|
||||
class ObFTParser final
|
||||
{
|
||||
public:
|
||||
@ -89,7 +91,7 @@ public:
|
||||
const char *fulltext,
|
||||
const int64_t fulltext_len,
|
||||
int64_t &doc_length,
|
||||
common::ObIArray<ObFTWord> &words) const;
|
||||
ObFTWordMap &words) const;
|
||||
const ObFTParser &get_parser_name() const { return parser_name_; }
|
||||
void reset();
|
||||
|
||||
@ -105,17 +107,8 @@ private:
|
||||
const char *fulltext,
|
||||
const int64_t fulltext_len,
|
||||
common::ObIAllocator &allocator,
|
||||
lib::ObFTParserParam::ObIAddWord &add_word);
|
||||
static bool need_stopword_list(const ObFTParser &parser);
|
||||
static bool need_castdn(const ObFTParser &parser);
|
||||
static bool need_min_max_word(const ObFTParser &parser);
|
||||
|
||||
int alloc_add_word(
|
||||
const ObCollationType &type,
|
||||
common::ObIArray<ObFTWord> &words,
|
||||
lib::ObFTParserParam::ObIAddWord *&add_word) const;
|
||||
void free_add_word(lib::ObFTParserParam::ObIAddWord *&add_word) const;
|
||||
|
||||
ObAddWord &add_word);
|
||||
int set_add_word_flag(const ObFTParser &parser);
|
||||
private:
|
||||
lib::ObPluginParam plugin_param_;
|
||||
common::ObIAllocator *allocator_;
|
||||
|
||||
@ -24,10 +24,10 @@ ObAddWord::ObAddWord(
|
||||
const ObCollationType &type,
|
||||
const ObAddWordFlag &flag,
|
||||
common::ObIAllocator &allocator,
|
||||
common::ObIArray<ObFTWord> &word)
|
||||
ObFTWordMap &word_map)
|
||||
: collation_type_(type),
|
||||
allocator_(allocator),
|
||||
words_(word),
|
||||
word_map_(word_map),
|
||||
min_max_word_cnt_(0),
|
||||
non_stopword_cnt_(0),
|
||||
stopword_cnt_(0),
|
||||
@ -35,19 +35,19 @@ ObAddWord::ObAddWord(
|
||||
{
|
||||
}
|
||||
|
||||
int ObAddWord::operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
int ObAddWord::process_word(
|
||||
const char *word,
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt)
|
||||
const int64_t char_cnt,
|
||||
const int64_t word_freq)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
bool is_stopword = false;
|
||||
ObFTWord src_word(word_len, word, collation_type_);
|
||||
ObFTWord dst_word;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
|
||||
if (OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt || 0 >= word_freq)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
|
||||
LOG_WARN("invalid arguments", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq));
|
||||
} else if (is_min_max_word(char_cnt)) {
|
||||
++min_max_word_cnt_;
|
||||
LOG_DEBUG("skip too small or large word", K(ret), K(src_word), K(char_cnt));
|
||||
@ -58,11 +58,11 @@ int ObAddWord::operator()(
|
||||
} else if (OB_UNLIKELY(is_stopword)) {
|
||||
++stopword_cnt_;
|
||||
LOG_DEBUG("skip stopword", K(ret), K(dst_word));
|
||||
} else if (OB_FAIL(words_.push_back(dst_word))) {
|
||||
LOG_WARN("fail to push word into words array", K(ret), K(dst_word));
|
||||
} else if (OB_FAIL(groupby_word(dst_word, word_freq))) {
|
||||
LOG_WARN("fail to groupby word into word map", K(ret), K(dst_word), K(word_freq));
|
||||
} else {
|
||||
++non_stopword_cnt_;
|
||||
LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len), K(char_cnt), K(src_word), K(dst_word));
|
||||
non_stopword_cnt_ += word_freq;
|
||||
LOG_DEBUG("add word", K(ret), KP(word), K(word_len), K(char_cnt), K(word_freq), K(src_word), K(dst_word));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@ -104,5 +104,31 @@ int ObAddWord::check_stopword(const ObFTWord &ft_word, bool &is_stopword)
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObAddWord::groupby_word(const ObFTWord &word, const int64_t word_freq)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
int64_t word_count = 0;
|
||||
if (OB_UNLIKELY(word.empty() || word_freq <= 0)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), K(word), K(word_freq));
|
||||
} else if (!flag_.groupby_word()) {
|
||||
if (OB_FAIL(word_map_.set_refactored(word, 1/*word count*/))) {
|
||||
LOG_WARN("fail to set fulltext word and count", K(ret), K(word));
|
||||
}
|
||||
} else if (OB_FAIL(word_map_.get_refactored(word, word_count)) && OB_HASH_NOT_EXIST != ret) {
|
||||
LOG_WARN("fail to get fulltext word", K(ret), K(word));
|
||||
} else {
|
||||
if (OB_HASH_NOT_EXIST == ret) {
|
||||
word_count = 1;
|
||||
} else {
|
||||
word_count += word_freq;
|
||||
}
|
||||
if (OB_FAIL(word_map_.set_refactored(word, word_count, 1/*overwrite*/))) {
|
||||
LOG_WARN("fail to set fulltext word and count", K(ret), K(word), K(word_count));
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // end namespace storage
|
||||
} // end namespace oceanbase
|
||||
|
||||
@ -63,23 +63,23 @@ static const char ob_stop_word_list[][FTS_STOP_WORD_MAX_LENGTH] = {
|
||||
"www"
|
||||
};
|
||||
|
||||
class ObAddWord final : public lib::ObFTParserParam::ObIAddWord
|
||||
class ObAddWord final
|
||||
{
|
||||
public:
|
||||
ObAddWord(
|
||||
const ObCollationType &type,
|
||||
const ObAddWordFlag &flag,
|
||||
common::ObIAllocator &allocator,
|
||||
common::ObIArray<ObFTWord> &word);
|
||||
virtual ~ObAddWord() = default;
|
||||
virtual int operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
ObFTWordMap &word_map);
|
||||
~ObAddWord() = default;
|
||||
int process_word(
|
||||
const char *word,
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt) override;
|
||||
virtual int64_t get_add_word_count() const override { return non_stopword_cnt_; }
|
||||
const int64_t char_cnt,
|
||||
const int64_t word_freq);
|
||||
virtual int64_t get_add_word_count() const { return non_stopword_cnt_; }
|
||||
VIRTUAL_TO_STRING_KV(K_(collation_type), K_(min_max_word_cnt), K_(non_stopword_cnt), K_(stopword_cnt),
|
||||
K_(words));
|
||||
K(word_map_.size()));
|
||||
public:
|
||||
static const int64_t FT_MIN_WORD_LEN = 3;
|
||||
static const int64_t FT_MAX_WORD_LEN = 84;
|
||||
@ -87,10 +87,11 @@ private:
|
||||
bool is_min_max_word(const int64_t c_len) const;
|
||||
int casedown_word(const ObFTWord &src, ObFTWord &dst);
|
||||
int check_stopword(const ObFTWord &word, bool &is_stopword);
|
||||
int groupby_word(const ObFTWord &word, const int64_t word_cnt);
|
||||
private:
|
||||
ObCollationType collation_type_;
|
||||
common::ObIAllocator &allocator_;
|
||||
common::ObIArray<ObFTWord> &words_;
|
||||
ObFTWordMap &word_map_;
|
||||
int64_t min_max_word_cnt_;
|
||||
int64_t non_stopword_cnt_;
|
||||
int64_t stopword_cnt_;
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
#define OB_FTS_STRUCT_H_
|
||||
|
||||
#include "lib/charset/ob_charset.h"
|
||||
#include "lib/hash/ob_hashmap.h"
|
||||
|
||||
namespace oceanbase
|
||||
{
|
||||
@ -34,7 +35,7 @@ public:
|
||||
hash_val = ObCharset::hash(type_, word_);
|
||||
return common::OB_SUCCESS;
|
||||
}
|
||||
OB_INLINE uint64_t hash() const { return word_.hash(); }
|
||||
OB_INLINE uint64_t hash() const { return ObCharset::hash(type_, word_); }
|
||||
OB_INLINE bool empty() const { return word_.empty(); }
|
||||
|
||||
OB_INLINE bool operator ==(const ObFTWord &other) const
|
||||
@ -76,6 +77,8 @@ public:
|
||||
int64_t word_cnt_;
|
||||
};
|
||||
|
||||
typedef common::hash::ObHashMap<ObFTWord, int64_t> ObFTWordMap;
|
||||
|
||||
class ObAddWordFlag final
|
||||
{
|
||||
private:
|
||||
@ -84,6 +87,7 @@ private:
|
||||
// than a maximum word length.
|
||||
static const uint64_t AWF_STOPWORD = 1 << 1; // filter by sotp word table.
|
||||
static const uint64_t AWF_CASEDOWN = 1 << 2; // convert characters from uppercase to lowercase.
|
||||
static const uint64_t AWF_GROUPBY_WORD = 1 << 3; // distinct and word aggregation
|
||||
public:
|
||||
ObAddWordFlag() : flag_(AWF_NONE) {}
|
||||
~ObAddWordFlag() = default;
|
||||
@ -95,13 +99,17 @@ public:
|
||||
void set_min_max_word() { set_flag(AWF_MIN_MAX_WORD); }
|
||||
void set_stop_word() { set_flag(AWF_STOPWORD); }
|
||||
void set_casedown() { set_flag(AWF_CASEDOWN); }
|
||||
void set_groupby_word() { set_flag(AWF_GROUPBY_WORD); }
|
||||
void clear() { flag_ = AWF_NONE; }
|
||||
void clear_min_max_word() { clear_flag(AWF_MIN_MAX_WORD); }
|
||||
void clear_stop_word() { clear_flag(AWF_STOPWORD); }
|
||||
void clear_casedown() { clear_flag(AWF_CASEDOWN); }
|
||||
void clear_groupby_word() { clear_flag(AWF_GROUPBY_WORD); }
|
||||
bool min_max_word() const { return has_flag(AWF_MIN_MAX_WORD); }
|
||||
bool stopword() const { return has_flag(AWF_STOPWORD); }
|
||||
bool casedown() const { return has_flag(AWF_CASEDOWN); }
|
||||
bool groupby_word() const { return has_flag(AWF_GROUPBY_WORD); }
|
||||
TO_STRING_KV(K_(flag));
|
||||
private:
|
||||
uint64_t flag_;
|
||||
};
|
||||
|
||||
@ -24,24 +24,81 @@ namespace storage
|
||||
|
||||
#define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_')
|
||||
|
||||
/*static*/ int ObNgramFTParser::segment(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *fulltext,
|
||||
const int64_t ft_len)
|
||||
|
||||
// Creates a parser in the not-initialized state: no charset, no text window
// and no buffered characters. get_next_token() reports OB_NOT_INIT until
// init() has succeeded.
ObNgramFTParser::ObNgramFTParser()
  : cs_(nullptr), start_(nullptr), next_(nullptr), end_(nullptr),
    c_nums_(0), is_inited_(false)
{
}
|
||||
|
||||
// The destructor only puts every member back to its idle value through reset().
ObNgramFTParser::~ObNgramFTParser()
{
  this->reset();
}
|
||||
|
||||
void ObNgramFTParser::reset()
|
||||
{
|
||||
cs_ = nullptr;
|
||||
start_ = nullptr;
|
||||
next_ = nullptr;
|
||||
end_ = nullptr;
|
||||
c_nums_ = 0;
|
||||
is_inited_ = false;
|
||||
}
|
||||
|
||||
int ObNgramFTParser::init(lib::ObFTParserParam *param)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
int64_t c_nums = 0;
|
||||
const char *start = fulltext;
|
||||
const char *next = start;
|
||||
const char *end = start + ft_len;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(fulltext) || OB_UNLIKELY(ft_len <= 0)) {
|
||||
if (OB_UNLIKELY(is_inited_)) {
|
||||
ret = OB_INIT_TWICE;
|
||||
LOG_WARN("init twice", K(ret), KPC(param), KPC(this));
|
||||
} else if (OB_ISNULL(param)
|
||||
|| OB_ISNULL(param->cs_)
|
||||
|| OB_ISNULL(param->fulltext_)
|
||||
|| OB_UNLIKELY(0 >= param->ft_length_)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KP(param), KP(fulltext), K(ft_len));
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param));
|
||||
} else {
|
||||
const ObCharsetInfo *cs = param->cs_;
|
||||
while (OB_SUCC(ret) && next < end) {
|
||||
cs_ = param->cs_;
|
||||
start_ = param->fulltext_;
|
||||
next_ = start_;
|
||||
end_ = start_ + param->ft_length_;
|
||||
c_nums_ = 0;
|
||||
is_inited_ = true;
|
||||
}
|
||||
if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) {
|
||||
reset();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObNgramFTParser::get_next_token(
|
||||
const char *&word,
|
||||
int64_t &word_len,
|
||||
int64_t &char_len,
|
||||
int64_t &word_freq)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
word = nullptr;
|
||||
word_len = 0;
|
||||
char_len = 0;
|
||||
word_freq = 0;
|
||||
if (OB_UNLIKELY(!is_inited_)) {
|
||||
ret = OB_NOT_INIT;
|
||||
LOG_WARN("ngram ft parser isn't initialized", K(ret), K(is_inited_));
|
||||
} else {
|
||||
int64_t c_nums = c_nums_;
|
||||
const char *start = start_;
|
||||
const char *next = next_;
|
||||
const char *end = end_;
|
||||
const ObCharsetInfo *cs = cs_;
|
||||
do {
|
||||
const int64_t c_len = ob_mbcharlen_ptr(cs, next, end);
|
||||
if (next + c_len > end || 0 == c_len) { // if char is invalid, just skip the rest of doc.
|
||||
ret = OB_ITER_END;
|
||||
break;
|
||||
} else {
|
||||
int ctype;
|
||||
@ -50,38 +107,31 @@ namespace storage
|
||||
start = next + 1;
|
||||
next = start;
|
||||
c_nums = 0;
|
||||
if (next == end) {
|
||||
ret = OB_ITER_END;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
next += c_len;
|
||||
++c_nums;
|
||||
}
|
||||
if (NGRAM_TOKEN_SIZE == c_nums) {
|
||||
if (OB_FAIL(add_word(param, start, next - start, c_nums))) {
|
||||
LOG_WARN("fail to add word", K(ret), KP(param), KP(start), KP(next), K(c_nums));
|
||||
} else {
|
||||
word = start;
|
||||
word_len = next - start;
|
||||
char_len = c_nums;
|
||||
word_freq = 1;
|
||||
start += ob_mbcharlen_ptr(cs, start, end);
|
||||
c_nums = NGRAM_TOKEN_SIZE - 1;
|
||||
break;
|
||||
}
|
||||
} while (OB_SUCC(ret) && next < end);
|
||||
if (OB_ITER_END == ret || OB_SUCCESS == ret) {
|
||||
start_ = start;
|
||||
next_ = next;
|
||||
end_ = end;
|
||||
c_nums_ = c_nums;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*static*/ int ObNgramFTParser::add_word(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_ISNULL(param)
|
||||
|| OB_ISNULL(word)
|
||||
|| OB_UNLIKELY(0 >= word_len)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
|
||||
} else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) {
|
||||
LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word)));
|
||||
LOG_DEBUG("next word", K(ret), K(ObString(word_len, word)), KP(start_), KP(next_), KP(end_));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@ -103,21 +153,43 @@ int ObNgramFTParserDesc::deinit(lib::ObPluginParam *param)
|
||||
return OB_SUCCESS;
|
||||
}
|
||||
|
||||
int ObNgramFTParserDesc::segment(lib::ObFTParserParam *param) const
|
||||
int ObNgramFTParserDesc::segment(
|
||||
lib::ObFTParserParam *param,
|
||||
lib::ObITokenIterator *&iter) const
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
void *buf = nullptr;
|
||||
if (OB_UNLIKELY(!is_inited_)) {
|
||||
ret = OB_NOT_INIT;
|
||||
LOG_WARN("ngram ft parser desc hasn't be initialized", K(ret), K(is_inited_));
|
||||
} else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid argument", K(ret), KPC(param));
|
||||
} else if (OB_FAIL(ObNgramFTParser::segment(param, param->fulltext_, param->ft_length_))) {
|
||||
LOG_WARN("fail to segment words for fulltext by ngram", K(ret), KPC(param),
|
||||
K(param->fulltext_), K(param->ft_length_));
|
||||
} else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObNgramFTParser)))) {
|
||||
ret = OB_ALLOCATE_MEMORY_FAILED;
|
||||
LOG_WARN("fail to allocate ngram ft parser", K(ret));
|
||||
} else {
|
||||
ObNgramFTParser *parser = new (buf) ObNgramFTParser();
|
||||
if (OB_FAIL(parser->init(param))) {
|
||||
LOG_WARN("fail to init ngram fulltext parser", K(ret), KPC(param));
|
||||
} else {
|
||||
iter = parser;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Destroys a token iterator and gives its memory back to param->allocator_.
//
// @param param  parser parameter whose allocator_ is used to free the memory;
//               must be non-null (and carry a non-null allocator_) whenever
//               iter is non-null, otherwise the process aborts.
// @param iter   iterator to release; a null iterator is a no-op. On return it
//               is set to nullptr so the caller is not left holding a pointer
//               to freed memory.
void ObNgramFTParserDesc::free_token_iter(
    lib::ObFTParserParam *param,
    lib::ObITokenIterator *&iter) const
{
  if (OB_NOT_NULL(iter)) {
    abort_unless(nullptr != param);
    abort_unless(nullptr != param->allocator_);
    // Explicit destructor call followed by a raw free of the storage.
    iter->~ObITokenIterator();
    param->allocator_->free(iter);
    iter = nullptr;
  }
}
|
||||
|
||||
} // end namespace storage
|
||||
} // end namespace oceanbase
|
||||
|
||||
@ -22,23 +22,30 @@ namespace oceanbase
|
||||
namespace storage
|
||||
{
|
||||
|
||||
class ObNgramFTParser final
|
||||
class ObNgramFTParser final : public lib::ObITokenIterator
|
||||
{
|
||||
public:
|
||||
static const int64_t NGRAM_TOKEN_SIZE = 2; // TODO: @jinzhu, please apply one system variable later, and keep the same as mysql.
|
||||
public:
|
||||
ObNgramFTParser() = default;
|
||||
~ObNgramFTParser() = default;
|
||||
static int segment(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *fulltext,
|
||||
const int64_t ft_len);
|
||||
ObNgramFTParser();
|
||||
virtual ~ObNgramFTParser();
|
||||
|
||||
int init(lib::ObFTParserParam *param);
|
||||
void reset();
|
||||
virtual int get_next_token(
|
||||
const char *&word,
|
||||
int64_t &word_len,
|
||||
int64_t &char_len,
|
||||
int64_t &word_freq) override;
|
||||
|
||||
VIRTUAL_TO_STRING_KV(KP_(cs), KP_(start), KP_(next), KP_(end), K_(is_inited));
|
||||
private:
|
||||
static int add_word(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt);
|
||||
const ObCharsetInfo *cs_;
|
||||
const char *start_;
|
||||
const char *next_;
|
||||
const char *end_;
|
||||
int64_t c_nums_;
|
||||
bool is_inited_;
|
||||
private:
|
||||
DISABLE_COPY_ASSIGN(ObNgramFTParser);
|
||||
};
|
||||
@ -50,7 +57,8 @@ public:
|
||||
virtual ~ObNgramFTParserDesc() = default;
|
||||
virtual int init(lib::ObPluginParam *param) override;
|
||||
virtual int deinit(lib::ObPluginParam *param) override;
|
||||
virtual int segment(lib::ObFTParserParam *param) const override;
|
||||
virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
|
||||
virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
|
||||
OB_INLINE void reset() { is_inited_ = false; }
|
||||
private:
|
||||
bool is_inited_;
|
||||
|
||||
@ -24,22 +24,74 @@ namespace storage
|
||||
|
||||
#define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_')
|
||||
|
||||
int ObSpaceFTParser::segment(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *ft,
|
||||
const int64_t ft_len)
|
||||
// Creates a parser in the not-initialized state: no charset and no text
// window. get_next_token() reports OB_NOT_INIT until init() has succeeded.
ObSpaceFTParser::ObSpaceFTParser()
  : cs_(nullptr), start_(nullptr), next_(nullptr), end_(nullptr),
    is_inited_(false)
{
}
|
||||
|
||||
// The destructor only puts every member back to its idle value through reset().
ObSpaceFTParser::~ObSpaceFTParser()
{
  this->reset();
}
|
||||
|
||||
void ObSpaceFTParser::reset()
|
||||
{
|
||||
cs_ = nullptr;
|
||||
start_ = nullptr;
|
||||
next_ = nullptr;
|
||||
end_ = nullptr;
|
||||
is_inited_ = false;
|
||||
}
|
||||
|
||||
int ObSpaceFTParser::init(lib::ObFTParserParam *param)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
const char *start = ft;
|
||||
const char *next = start;
|
||||
const char *end = start + ft_len;
|
||||
int mbl = 0;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) {
|
||||
if (OB_UNLIKELY(is_inited_)) {
|
||||
ret = OB_INIT_TWICE;
|
||||
LOG_WARN("init twice", K(ret), KPC(param), KPC(this));
|
||||
} else if (OB_ISNULL(param)
|
||||
|| OB_ISNULL(param->cs_)
|
||||
|| OB_ISNULL(param->fulltext_)
|
||||
|| OB_UNLIKELY(0 >= param->ft_length_)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len));
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param));
|
||||
} else {
|
||||
const ObCharsetInfo *cs = param->cs_;
|
||||
while (OB_SUCC(ret) && next < end) {
|
||||
cs_ = param->cs_;
|
||||
start_ = param->fulltext_;
|
||||
next_ = start_;
|
||||
end_ = start_ + param->ft_length_;
|
||||
is_inited_ = true;
|
||||
}
|
||||
if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) {
|
||||
reset();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObSpaceFTParser::get_next_token(
|
||||
const char *&word,
|
||||
int64_t &word_len,
|
||||
int64_t &char_len,
|
||||
int64_t &word_freq)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
int mbl = 0;
|
||||
word = nullptr;
|
||||
word_len = 0;
|
||||
char_len = 0;
|
||||
word_freq = 0;
|
||||
if (OB_UNLIKELY(!is_inited_)) {
|
||||
ret = OB_NOT_INIT;
|
||||
LOG_WARN("space ft parser isn't initialized", K(ret), K(is_inited_));
|
||||
} else {
|
||||
const char *start = start_;
|
||||
const char *next = next_;
|
||||
const char *end = end_;
|
||||
const ObCharsetInfo *cs = cs_;
|
||||
do {
|
||||
while (next < end) {
|
||||
int ctype;
|
||||
mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end);
|
||||
@ -62,34 +114,24 @@ int ObSpaceFTParser::segment(
|
||||
++c_nums;
|
||||
next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
|
||||
}
|
||||
if (0 < c_nums && OB_FAIL(add_word(param, start, next - start, c_nums))) {
|
||||
LOG_WARN("fail to add word", K(ret), KPC(param), KP(start), K(next));
|
||||
if (0 < c_nums) {
|
||||
word = start;
|
||||
word_len = next - start;
|
||||
char_len = c_nums;
|
||||
word_freq = 1;
|
||||
start = next;
|
||||
break;
|
||||
} else {
|
||||
start = next;
|
||||
}
|
||||
}
|
||||
} while (OB_SUCC(ret) && next < end);
|
||||
if (OB_ITER_END == ret || OB_SUCCESS == ret) {
|
||||
start_ = start;
|
||||
next_ = next;
|
||||
end_ = end;
|
||||
}
|
||||
if (OB_ITER_END == ret) {
|
||||
ret = OB_SUCCESS;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObSpaceFTParser::add_word(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_ISNULL(param)
|
||||
|| OB_ISNULL(word)
|
||||
|| OB_UNLIKELY(0 >= word_len)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
|
||||
} else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) {
|
||||
LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word)));
|
||||
LOG_DEBUG("next word", K(ObString(word_len, word)), KP(start_), KP(next_), KP(end_));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@ -111,21 +153,43 @@ int ObWhiteSpaceFTParserDesc::deinit(lib::ObPluginParam *param)
|
||||
return OB_SUCCESS;
|
||||
}
|
||||
|
||||
int ObWhiteSpaceFTParserDesc::segment(lib::ObFTParserParam *param) const
|
||||
int ObWhiteSpaceFTParserDesc::segment(
|
||||
lib::ObFTParserParam *param,
|
||||
lib::ObITokenIterator *&iter) const
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
void *buf = nullptr;
|
||||
if (OB_UNLIKELY(!is_inited_)) {
|
||||
ret = OB_NOT_INIT;
|
||||
LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_));
|
||||
} else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid argument", K(ret), KPC(param));
|
||||
} else if (OB_FAIL(ObSpaceFTParser::segment(param, param->fulltext_, param->ft_length_))) {
|
||||
LOG_WARN("fail to segment words for fulltext by spaces", K(ret), KPC(param),
|
||||
K(param->fulltext_), K(param->ft_length_));
|
||||
} else if (OB_ISNULL(buf = param->allocator_->alloc(sizeof(ObSpaceFTParser)))) {
|
||||
ret = OB_ALLOCATE_MEMORY_FAILED;
|
||||
LOG_WARN("fail to allocate space ft parser", K(ret));
|
||||
} else {
|
||||
ObSpaceFTParser *parser = new (buf) ObSpaceFTParser();
|
||||
if (OB_FAIL(parser->init(param))) {
|
||||
LOG_WARN("fail to init whitespace fulltext parser", K(ret), KPC(param));
|
||||
} else {
|
||||
iter = parser;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Destroys a token iterator and gives its memory back to param->allocator_.
//
// @param param  parser parameter whose allocator_ is used to free the memory;
//               must be non-null (and carry a non-null allocator_) whenever
//               iter is non-null, otherwise the process aborts.
// @param iter   iterator to release; a null iterator is a no-op. On return it
//               is set to nullptr so the caller is not left holding a pointer
//               to freed memory.
void ObWhiteSpaceFTParserDesc::free_token_iter(
    lib::ObFTParserParam *param,
    lib::ObITokenIterator *&iter) const
{
  if (OB_NOT_NULL(iter)) {
    abort_unless(nullptr != param);
    abort_unless(nullptr != param->allocator_);
    // Explicit destructor call followed by a raw free of the storage.
    iter->~ObITokenIterator();
    param->allocator_->free(iter);
    iter = nullptr;
  }
}
|
||||
|
||||
} // end namespace storage
|
||||
} // end namespace oceanbase
|
||||
|
||||
@ -23,21 +23,27 @@ namespace oceanbase
|
||||
namespace storage
|
||||
{
|
||||
|
||||
class ObSpaceFTParser final
|
||||
class ObSpaceFTParser final : public lib::ObITokenIterator
|
||||
{
|
||||
public:
|
||||
ObSpaceFTParser() = default;
|
||||
~ObSpaceFTParser() = default;
|
||||
static int segment(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *fulltext,
|
||||
const int64_t ft_len);
|
||||
ObSpaceFTParser();
|
||||
virtual ~ObSpaceFTParser();
|
||||
|
||||
int init(lib::ObFTParserParam *param);
|
||||
void reset();
|
||||
virtual int get_next_token(
|
||||
const char *&word,
|
||||
int64_t &word_len,
|
||||
int64_t &char_len,
|
||||
int64_t &word_freq) override;
|
||||
|
||||
VIRTUAL_TO_STRING_KV(KP_(cs), KP_(start), KP_(next), KP_(end), K_(is_inited));
|
||||
private:
|
||||
static int add_word(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt);
|
||||
const ObCharsetInfo *cs_;
|
||||
const char *start_;
|
||||
const char *next_;
|
||||
const char *end_;
|
||||
bool is_inited_;
|
||||
};
|
||||
|
||||
class ObWhiteSpaceFTParserDesc final : public lib::ObIFTParserDesc
|
||||
@ -47,7 +53,8 @@ public:
|
||||
virtual ~ObWhiteSpaceFTParserDesc() = default;
|
||||
virtual int init(lib::ObPluginParam *param) override;
|
||||
virtual int deinit(lib::ObPluginParam *param) override;
|
||||
virtual int segment(lib::ObFTParserParam *param) const override;
|
||||
virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
|
||||
virtual void free_token_iter(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
|
||||
OB_INLINE void reset() { is_inited_ = false; }
|
||||
private:
|
||||
bool is_inited_;
|
||||
|
||||
@ -19,6 +19,6 @@ OB_DECLARE_PLUGIN(mock_ft_parser)
|
||||
OB_PLUGIN_AUTHOR_OCEANBASE,
|
||||
"This is mock fulltext parser plugin.",
|
||||
0x00001,
|
||||
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE,
|
||||
oceanbase::lib::ObPluginLicenseType::OB_Mulan_PubL_V2_LICENSE,
|
||||
&oceanbase::storage::mock_ft_parser,
|
||||
};
|
||||
|
||||
@ -27,7 +27,7 @@ public:
|
||||
virtual ~ObMockFTParserDesc() = default;
|
||||
virtual int init(lib::ObPluginParam *param) override;
|
||||
virtual int deinit(lib::ObPluginParam *param) override;
|
||||
virtual int segment(lib::ObFTParserParam *param) const override;
|
||||
virtual int segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const override;
|
||||
};
|
||||
|
||||
int ObMockFTParserDesc::init(lib::ObPluginParam *param)
|
||||
@ -42,7 +42,7 @@ int ObMockFTParserDesc::deinit(lib::ObPluginParam *param)
|
||||
return OB_SUCCESS;
|
||||
}
|
||||
|
||||
int ObMockFTParserDesc::segment(lib::ObFTParserParam *param) const
|
||||
int ObMockFTParserDesc::segment(lib::ObFTParserParam *param, lib::ObITokenIterator *&iter) const
|
||||
{
|
||||
UNUSED(param);
|
||||
return OB_SUCCESS;
|
||||
|
||||
@ -49,33 +49,19 @@ int segment_and_calc_word_count(
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
int64_t doc_length = 0;
|
||||
common::ObSEArray<ObFTWord, 256> words;
|
||||
if (OB_ISNULL(helper)
|
||||
|| OB_UNLIKELY(ObCollationType::CS_TYPE_INVALID == type
|
||||
|| ObCollationType::CS_TYPE_EXTENDED_MARK < type)
|
||||
|| OB_UNLIKELY(!words_count.created())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(helper), K(type), K(words_count.created()));
|
||||
} else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words))) {
|
||||
} else if (OB_FAIL(helper->segment(type, fulltext.ptr(), fulltext.length(), doc_length, words_count))) {
|
||||
LOG_WARN("fail to segment", K(ret), KPC(helper), K(type), K(fulltext));
|
||||
} else {
|
||||
for (int64_t i = 0; OB_SUCC(ret) && i < words.count(); ++i) {
|
||||
const ObFTWord &ft_word = words.at(i);
|
||||
int64_t word_count = 0;
|
||||
if (OB_FAIL(words_count.get_refactored(ft_word, word_count)) && OB_HASH_NOT_EXIST != ret) {
|
||||
LOG_WARN("fail to get ft word", K(ret), K(ft_word));
|
||||
} else {
|
||||
word_count = OB_HASH_NOT_EXIST == ret ? 1 : ++word_count;
|
||||
if (OB_FAIL(words_count.set_refactored(ft_word, word_count, 1/*overwrite*/))) {
|
||||
LOG_WARN("fail to set ft word and count", K(ret), K(ft_word));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
class ObTestAddWord final : public lib::ObFTParserParam::ObIAddWord
|
||||
class ObTestAddWord final
|
||||
{
|
||||
public:
|
||||
static const char *TEST_FULLTEXT;
|
||||
@ -85,14 +71,16 @@ public:
|
||||
static const int64_t FT_MAX_WORD_LEN = 84;
|
||||
public:
|
||||
ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator);
|
||||
virtual ~ObTestAddWord() = default;
|
||||
virtual int operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
~ObTestAddWord() = default;
|
||||
int check_words(lib::ObITokenIterator *iter);
|
||||
int64_t get_add_word_count() const { return ith_word_; }
|
||||
static int64_t get_word_cnt_without_stopword() { return TEST_WORD_COUNT_WITHOUT_STOPWORD; }
|
||||
VIRTUAL_TO_STRING_KV(K_(ith_word));
|
||||
private:
|
||||
int check_ith_word(
|
||||
const char *word,
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt) override;
|
||||
virtual int64_t get_add_word_count() const override { return ith_word_; }
|
||||
VIRTUAL_TO_STRING_KV(K_(ith_word));
|
||||
const int64_t char_cnt);
|
||||
private:
|
||||
bool is_min_max_word(const int64_t c_len) const;
|
||||
int casedown_word(const ObFTWord &src, ObFTWord &dst);
|
||||
@ -137,8 +125,32 @@ int ObTestAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst)
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObTestAddWord::operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
// Drains a token iterator and validates every token with check_ith_word().
//
// @param iter  iterator to consume; must not be null.
// @return OB_INVALID_ARGUMENT when iter is null, OB_SUCCESS when the iterator
//         was consumed up to OB_ITER_END and every token passed the check,
//         otherwise the first error from get_next_token() or check_ith_word().
int ObTestAddWord::check_words(lib::ObITokenIterator *iter)
{
  int ret = OB_SUCCESS;
  if (OB_ISNULL(iter)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid arguments", K(ret), KP(iter));
  } else {
    const char *word = nullptr;
    int64_t word_len = 0;
    int64_t char_len = 0;
    int64_t word_freq = 0;
    while (OB_SUCC(ret)) {
      if (OB_FAIL(iter->get_next_token(word, word_len, char_len, word_freq))) {
        // OB_ITER_END is the regular end of iteration, not a failure: keep it
        // out of the warning log and let it terminate the loop.
        if (OB_ITER_END != ret) {
          LOG_WARN("fail to get next token", K(ret), KPC(iter));
        }
      } else if (OB_FAIL(check_ith_word(word, word_len, char_len))) {
        LOG_WARN("fail to check ith word", K(ret), KP(word), K(word_len), K(char_len));
      }
    }
    if (OB_ITER_END == ret) {
      ret = OB_SUCCESS;
    }
  }
  return ret;
}
|
||||
|
||||
int ObTestAddWord::check_ith_word(
|
||||
const char *word,
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt)
|
||||
@ -146,9 +158,9 @@ int ObTestAddWord::operator()(
|
||||
int ret = OB_SUCCESS;
|
||||
ObFTWord src_word(word_len, word, collation_type_);
|
||||
ObFTWord dst_word;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) {
|
||||
if (OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len), K(char_cnt));
|
||||
LOG_WARN("invalid arguments", K(ret), KP(word), K(word_len), K(char_cnt));
|
||||
} else if (is_min_max_word(char_cnt)) {
|
||||
// skip min/max word
|
||||
} else if (OB_FAIL(casedown_word(src_word, dst_word))) {
|
||||
@ -194,7 +206,6 @@ void TestDefaultFTParser::SetUp()
|
||||
ASSERT_EQ(OB_SUCCESS, desc_.init(&plugin_param_));
|
||||
|
||||
ft_parser_param_.allocator_ = &allocator_;
|
||||
ft_parser_param_.add_word_ = &add_word_;
|
||||
ft_parser_param_.cs_ = common::ObCharset::get_charset(ObCollationType::CS_TYPE_UTF8MB4_BIN);
|
||||
ft_parser_param_.parser_version_ = 0x00001;
|
||||
ASSERT_TRUE(nullptr != ft_parser_param_.cs_);
|
||||
@ -209,54 +220,74 @@ void TestDefaultFTParser::TearDown()
|
||||
|
||||
TEST_F(TestDefaultFTParser, test_space_ft_parser_segment)
|
||||
{
|
||||
ObSpaceFTParser parser;
|
||||
const char *fulltext = ObTestAddWord::TEST_FULLTEXT;
|
||||
const int64_t ft_len = strlen(fulltext);
|
||||
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(nullptr, nullptr, 0));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, nullptr, 0));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, 0));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, -1));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(nullptr));
|
||||
|
||||
ft_parser_param_.fulltext_ = nullptr;
|
||||
ft_parser_param_.ft_length_ = 0;
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_));
|
||||
|
||||
ft_parser_param_.fulltext_ = fulltext;
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_));
|
||||
|
||||
ft_parser_param_.ft_length_ = -1;
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parser.init(&ft_parser_param_));
|
||||
|
||||
ft_parser_param_.fulltext_ = fulltext;
|
||||
ft_parser_param_.ft_length_ = ft_len;
|
||||
|
||||
LOG_INFO("before space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_));
|
||||
ASSERT_EQ(OB_SUCCESS, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, ft_len));
|
||||
ASSERT_EQ(OB_SUCCESS, parser.init(&ft_parser_param_));
|
||||
ASSERT_EQ(OB_SUCCESS, add_word_.check_words(&parser));
|
||||
LOG_INFO("after space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_));
|
||||
}
|
||||
|
||||
TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268)
|
||||
{
|
||||
common::ObArray<ObFTWord> words;
|
||||
ObAddWordFlag flag;
|
||||
ObAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, flag, allocator_, words);
|
||||
ObSpaceFTParser parser;
|
||||
const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 ";
|
||||
const int64_t ft_len = strlen(fulltext);
|
||||
|
||||
ft_parser_param_.fulltext_ = fulltext;
|
||||
ft_parser_param_.ft_length_ = ft_len;
|
||||
ft_parser_param_.add_word_ = &add_word;
|
||||
ft_parser_param_.cs_ = common::ObCharset::get_charset(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI);
|
||||
|
||||
LOG_INFO("before space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_));
|
||||
ASSERT_EQ(OB_SUCCESS, ObSpaceFTParser::segment(&ft_parser_param_, fulltext, ft_len));
|
||||
LOG_INFO("after space segment", KCSTRING(fulltext), K(words), K(ft_len), K(ft_parser_param_));
|
||||
ASSERT_EQ(OB_SUCCESS, parser.init(&ft_parser_param_));
|
||||
const char *word = nullptr;
|
||||
int64_t word_len = 0;
|
||||
int64_t char_len = 0;
|
||||
int64_t word_freq = 0;
|
||||
int ret = OB_SUCCESS;
|
||||
while (OB_SUCC(ret)) {
|
||||
if (OB_FAIL(parser.get_next_token(word, word_len, char_len, word_freq))) {
|
||||
LOG_WARN("fail to get next token", K(ret), K(parser));
|
||||
} else {
|
||||
LOG_INFO("succeed to get next token", K(ret), K(ObString(word_len, word)), K(char_len));
|
||||
}
|
||||
}
|
||||
LOG_INFO("after space segment", KCSTRING(fulltext), K(ft_len), K(ft_parser_param_));
|
||||
}
|
||||
|
||||
TEST_F(TestDefaultFTParser, test_default_ft_parser_desc)
|
||||
{
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(&ft_parser_param_));
|
||||
ObITokenIterator *iter = nullptr;
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(&ft_parser_param_, iter));
|
||||
|
||||
ft_parser_param_.fulltext_ = ObTestAddWord::TEST_FULLTEXT;
|
||||
ft_parser_param_.ft_length_ = strlen(ft_parser_param_.fulltext_);
|
||||
|
||||
ASSERT_EQ(OB_SUCCESS, desc_.segment(&ft_parser_param_));
|
||||
ASSERT_EQ(OB_SUCCESS, desc_.segment(&ft_parser_param_, iter));
|
||||
ASSERT_EQ(OB_SUCCESS, add_word_.check_words(iter));
|
||||
|
||||
ASSERT_EQ(OB_SUCCESS, desc_.deinit(&plugin_param_));
|
||||
ASSERT_EQ(OB_NOT_INIT, desc_.segment(&ft_parser_param_));
|
||||
ASSERT_EQ(OB_NOT_INIT, desc_.segment(&ft_parser_param_, iter));
|
||||
|
||||
ASSERT_EQ(OB_SUCCESS, desc_.init(&plugin_param_));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(nullptr));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, desc_.segment(nullptr, iter));
|
||||
}
|
||||
|
||||
class ObTestFTPluginHelper : public ::testing::Test
|
||||
@ -442,29 +473,35 @@ void ObTestFTParseHelper::TearDownTestCase()
|
||||
|
||||
TEST_F(ObTestFTParseHelper, test_parse_fulltext)
|
||||
{
|
||||
common::ObSEArray<ObFTWord, 16> words;
|
||||
ObFTWordMap ft_word_map;
|
||||
ASSERT_EQ(OB_SUCCESS, ft_word_map.create(10, "TestParse"));
|
||||
int64_t doc_length = 0;
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
|
||||
|
||||
ObTestAddWord test_add_word(cs_type_, allocator_);
|
||||
for (int64_t i = 0; i < words.count(); ++i) {
|
||||
ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length()));
|
||||
ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size());
|
||||
for (int64_t i = 0; i < ft_word_map.size(); ++i) {
|
||||
int64_t word_cnt = 0;
|
||||
ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_);
|
||||
ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt));
|
||||
ASSERT_TRUE(word_cnt >= 1);
|
||||
}
|
||||
|
||||
ObFTWordMap ft_word_map;
|
||||
ASSERT_EQ(OB_SUCCESS, ft_word_map.create(words.count(), "TestParse"));
|
||||
ft_word_map.clear();
|
||||
ASSERT_EQ(OB_SUCCESS, segment_and_calc_word_count(allocator_, &parse_helper_,
|
||||
cs_type_, ObTestAddWord::TEST_FULLTEXT, ft_word_map));
|
||||
ASSERT_EQ(words.count(), ft_word_map.size());
|
||||
ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size());
|
||||
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, words));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, words));
|
||||
ft_word_map.clear();
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, ft_word_map));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, ft_word_map));
|
||||
|
||||
parse_helper_.reset();
|
||||
ft_word_map.clear();
|
||||
ASSERT_EQ(OB_NOT_INIT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
|
||||
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.init(nullptr, plugin_name_));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.init(&allocator_, ObString()));
|
||||
@ -472,9 +509,9 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext)
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_));
|
||||
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(CS_TYPE_INVALID, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(CS_TYPE_EXTENDED_MARK, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
|
||||
|
||||
ASSERT_EQ(OB_INIT_TWICE, parse_helper_.init(&allocator_, plugin_name_));
|
||||
|
||||
@ -484,57 +521,80 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext)
|
||||
parse_helper_.reset();
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_));
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
for (int64_t i = 0; i < words.count(); ++i) {
|
||||
ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length()));
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
|
||||
ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size());
|
||||
for (int64_t i = 0; i < ft_word_map.size(); ++i) {
|
||||
int64_t word_cnt = 0;
|
||||
ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_);
|
||||
ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt));
|
||||
ASSERT_TRUE(word_cnt >= 1);
|
||||
}
|
||||
parse_helper_.reset();
|
||||
ft_word_map.clear();
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, "beng.1"));
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, ft_word_map));
|
||||
ASSERT_EQ(ObTestAddWord::get_word_cnt_without_stopword(), ft_word_map.size());
|
||||
for (int64_t i = 0; i < ft_word_map.size(); ++i) {
|
||||
int64_t word_cnt = 0;
|
||||
ObFTWord word(strlen(test_add_word.words_without_stopword_[i]), test_add_word.words_without_stopword_[i], cs_type_);
|
||||
ASSERT_EQ(OB_SUCCESS, ft_word_map.get_refactored(word, word_cnt));
|
||||
ASSERT_TRUE(word_cnt >= 1);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(ObTestFTParseHelper, test_min_and_max_word_len)
|
||||
{
|
||||
common::ObSEArray<ObFTWord, 16> words;
|
||||
ObFTWordMap words;
|
||||
ASSERT_EQ(OB_SUCCESS, words.create(10, "TestParse"));
|
||||
int64_t doc_length = 0;
|
||||
|
||||
// word len = 2;
|
||||
const char *word_len_2 = "ab";
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_2, std::strlen(word_len_2), doc_length, words));
|
||||
ASSERT_EQ(0, words.count());
|
||||
ASSERT_EQ(0, words.size());
|
||||
|
||||
// word len = 3;
|
||||
const char *word_len_3 = "abc";
|
||||
words.clear();
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_3, std::strlen(word_len_3), doc_length, words));
|
||||
ASSERT_EQ(1, words.count());
|
||||
ASSERT_EQ(1, words.size());
|
||||
|
||||
// word len = 4;
|
||||
const char *word_len_4 = "abcd";
|
||||
words.clear();
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_4, std::strlen(word_len_4), doc_length, words));
|
||||
ASSERT_EQ(1, words.count());
|
||||
ASSERT_EQ(1, words.size());
|
||||
|
||||
// word len = 76;
|
||||
const char *word_len_76 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
|
||||
words.clear();
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_76, std::strlen(word_len_76), doc_length, words));
|
||||
ASSERT_EQ(1, words.count());
|
||||
ASSERT_EQ(1, words.size());
|
||||
|
||||
// word len = 84;
|
||||
const char *word_len_84 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz123456";
|
||||
words.clear();
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_84, std::strlen(word_len_84), doc_length, words));
|
||||
ASSERT_EQ(1, words.count());
|
||||
ASSERT_EQ(1, words.size());
|
||||
|
||||
// word len = 85;
|
||||
const char *word_len_85 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz1234567";
|
||||
words.clear();
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, word_len_85, std::strlen(word_len_85), doc_length, words));
|
||||
ASSERT_EQ(0, words.count());
|
||||
ASSERT_EQ(0, words.size());
|
||||
}
|
||||
|
||||
class ObTestNgramFTParseHelper : public ::testing::Test
|
||||
{
|
||||
public:
|
||||
static const char *name_;
|
||||
static const int64_t TEST_WORD_COUNT = 29;
|
||||
static const int64_t TEST_WORD_COUNT = 27;
|
||||
typedef common::hash::ObHashMap<ObFTWord, int64_t> ObFTWordMap;
|
||||
public:
|
||||
ObTestNgramFTParseHelper();
|
||||
virtual ~ObTestNgramFTParseHelper() = default;
|
||||
static int64_t get_word_count() { return TEST_WORD_COUNT; }
|
||||
|
||||
static void SetUpTestCase();
|
||||
static void TearDownTestCase();
|
||||
@ -553,7 +613,7 @@ const char *ObTestNgramFTParseHelper::name_ = "ngram.1";
|
||||
|
||||
ObTestNgramFTParseHelper::ObTestNgramFTParseHelper()
|
||||
: plugin_name_(STRLEN(name_), name_),
|
||||
ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"},
|
||||
ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"},
|
||||
cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN),
|
||||
allocator_()
|
||||
{
|
||||
@ -583,26 +643,33 @@ void ObTestNgramFTParseHelper::TearDownTestCase()
|
||||
|
||||
TEST_F(ObTestNgramFTParseHelper, test_parse_fulltext)
|
||||
{
|
||||
ObFTWordMap words;
|
||||
ASSERT_EQ(OB_SUCCESS, words.create(10, "TestParse"));
|
||||
int64_t doc_length = 0;
|
||||
common::ObSEArray<ObFTWord, 16> words;
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
|
||||
for (int64_t i = 0; i < words.count(); ++i) {
|
||||
ASSERT_TRUE(0 == strncmp(ngram_words_[i], words[i].word_.ptr(), words[i].word_.length()));
|
||||
ASSERT_EQ(get_word_count(), words.size());
|
||||
for (int64_t i = 0; i < words.size(); ++i) {
|
||||
int64_t word_cnt = 0;
|
||||
ObFTWord word(strlen(ngram_words_[i]), ngram_words_[i], cs_type_);
|
||||
ASSERT_EQ(OB_SUCCESS, words.get_refactored(word, word_cnt));
|
||||
ASSERT_TRUE(word_cnt >= 1);
|
||||
}
|
||||
|
||||
ObFTWordMap ft_word_map;
|
||||
ASSERT_EQ(OB_SUCCESS, ft_word_map.create(words.count(), "TestParse"));
|
||||
ASSERT_EQ(OB_SUCCESS, ft_word_map.create(10, "TestParse"));
|
||||
ASSERT_EQ(OB_SUCCESS, segment_and_calc_word_count(allocator_, &parse_helper_,
|
||||
cs_type_, ObTestAddWord::TEST_FULLTEXT, ft_word_map));
|
||||
ASSERT_EQ(words.count(), ft_word_map.size() + 2);
|
||||
ASSERT_EQ(words.size(), ft_word_map.size());
|
||||
|
||||
words.clear();
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, nullptr, std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, 0, doc_length, words));
|
||||
ASSERT_EQ(OB_INVALID_ARGUMENT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT, -1, doc_length, words));
|
||||
|
||||
parse_helper_.reset();
|
||||
words.clear();
|
||||
ASSERT_EQ(OB_NOT_INIT, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
|
||||
@ -620,14 +687,19 @@ TEST_F(ObTestNgramFTParseHelper, test_parse_fulltext)
|
||||
ASSERT_EQ(OB_INIT_TWICE, parse_helper_.init(&allocator_, plugin_name_));
|
||||
|
||||
parse_helper_.reset();
|
||||
words.clear();
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_));
|
||||
|
||||
parse_helper_.reset();
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.init(&allocator_, plugin_name_));
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
for (int64_t i = 0; i < words.count(); ++i) {
|
||||
ASSERT_TRUE(0 == strncmp(ngram_words_[i], words[i].word_.ptr(), words[i].word_.length()));
|
||||
ASSERT_EQ(get_word_count(), words.size());
|
||||
for (int64_t i = 0; i < words.size(); ++i) {
|
||||
int64_t word_cnt = 0;
|
||||
ObFTWord word(strlen(ngram_words_[i]), ngram_words_[i], cs_type_);
|
||||
ASSERT_EQ(OB_SUCCESS, words.get_refactored(word, word_cnt));
|
||||
ASSERT_TRUE(word_cnt >= 1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -638,7 +710,7 @@ int main(int argc, char **argv)
|
||||
{
|
||||
system("rm -rf test_fts_plugin.log");
|
||||
OB_LOGGER.set_file_name("test_fts_plugin.log", true);
|
||||
OB_LOGGER.set_log_level("INFO");
|
||||
OB_LOGGER.set_log_level("DEBUG");
|
||||
oceanbase::storage::ObTestFTPluginHelper::file_name = argv[0];
|
||||
testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
|
||||
Reference in New Issue
Block a user