[FTS.BUGFIX] fix compatibility of MySQL default fulltext parser

This commit is contained in:
Tyshawn 2024-04-28 03:58:17 +00:00 committed by ob-robot
parent 1442b06e77
commit c4ed3f10af
16 changed files with 541 additions and 296 deletions

View File

@ -223,7 +223,11 @@ public:
public:
ObIAddWord() = default;
virtual ~ObIAddWord() = default;
virtual int operator()(ObFTParserParam *param, const char *word, const int64_t word_len) = 0;
virtual int operator()(
ObFTParserParam *param,
const char *word,
const int64_t word_len,
const int64_t char_cnt) = 0;
virtual int64_t get_add_word_count() const = 0;
DECLARE_PURE_VIRTUAL_TO_STRING;
};
@ -247,9 +251,9 @@ public:
&& 0 < ft_length_
&& 0 <= parser_version_;
}
inline int add_word(ObFTParserParam *param, const char *word, int64_t word_len)
inline int add_word(ObFTParserParam *param, const char *word, const int64_t word_len, const int64_t char_cnt)
{
return (*add_word_)(param, word, word_len);
return (*add_word_)(param, word, word_len, char_cnt);
}
inline void reset()
{

View File

@ -170,6 +170,7 @@ ob_set_subtarget(ob_storage ckpt
)
ob_set_subtarget(ob_storage fts
fts/ob_beng_ft_parser.cpp
fts/ob_fts_plugin_mgr.cpp
fts/ob_fts_plugin_helper.cpp
fts/ob_fts_stop_word.cpp

View File

@ -0,0 +1,179 @@
/**
* Copyright (c) 2023 OceanBase
* OceanBase is licensed under Mulan PubL v2.
* You can use this software according to the terms and conditions of the Mulan PubL v2.
* You may obtain a copy of Mulan PubL v2 at:
* http://license.coscl.org.cn/MulanPubL-2.0
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PubL v2 for more details.
*/
#define USING_LOG_PREFIX STORAGE_FTS
#include "lib/string/ob_string.h"
#include "storage/fts/ob_beng_ft_parser.h"
using namespace oceanbase::common;
namespace oceanbase
{
namespace storage
{
/**
 * Tokenize the fulltext `ft` with the basic english analyzer and hand every
 * produced token to add_word().
 *
 * @param param   parser parameter; must be non-null and carry a charset (cs_).
 * @param ft      start of the fulltext buffer; must be non-null.
 * @param ft_len  length of `ft`; must be positive.
 * @return OB_SUCCESS once the token stream has been drained (the OB_ITER_END
 *         reported by the stream is translated to OB_SUCCESS),
 *         OB_INVALID_ARGUMENT for bad arguments, OB_ERR_UNEXPECTED when the
 *         analyzer hands back no token stream, otherwise the first error
 *         returned by init / segment / get_next / add_word.
 */
/*static*/ int ObBEngFTParser::segment(
    lib::ObFTParserParam *param,
    const char *ft,
    const int64_t ft_len)
{
  int ret = OB_SUCCESS;
  // The datum is filled exactly once, after the arguments have been validated
  // (see the FALSE_IT step below); it is not touched with unchecked input.
  ObDatum doc;
  ObBEngFTParser parser;
  share::ObITokenStream *token_stream = nullptr;
  if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len));
  } else if (OB_FAIL(parser.init(param))) {
    LOG_WARN("fail to initialize basic english parser", K(ret), KPC(param));
  } else if (FALSE_IT(doc.set_string(ft, ft_len))) {
  } else if (OB_FAIL(parser.segment(doc, token_stream))) {
    LOG_WARN("fail to segment fulltext by parser", K(ret), KP(ft), K(ft_len));
  } else if (OB_ISNULL(token_stream)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("token stream is nullptr", K(ret), KP(token_stream));
  } else {
    ObDatum token;
    int64_t token_freq = 0;
    // Drain the stream; the loop ends on the first non-success code.
    while (OB_SUCC(ret)) {
      if (OB_FAIL(token_stream->get_next(token, token_freq))) {
        if (OB_ITER_END != ret) {
          LOG_WARN("fail to get next token", K(ret), KPC(token_stream));
        }
      } else if (OB_FAIL(add_word(param, param->allocator_, token.ptr_, token.len_))) {
        LOG_WARN("fail to add word", K(ret), K(token), KPC(param));
      }
    }
    // Reaching the end of the stream is the normal way out of the loop.
    if (OB_ITER_END == ret) {
      ret = OB_SUCCESS;
    }
  }
  return ret;
}
/**
 * Copy one token into memory obtained from `allocator` and forward the copy
 * to the add-word callback of `param`.
 *
 * Tokens whose length lies outside [FT_MIN_WORD_LEN, FT_MAX_WORD_LEN] are
 * skipped and OB_SUCCESS is returned for them.
 *
 * NOTE(review): word_len is forwarded as both the word length and the
 * character count; these presumably differ for multi-byte charsets - confirm
 * against the callback's expectation of char_cnt.
 *
 * @return OB_SUCCESS, OB_INVALID_ARGUMENT for null / non-positive input,
 *         OB_ALLOCATE_MEMORY_FAILED when the copy cannot be allocated, or the
 *         error returned by the callback.
 */
/*static*/ int ObBEngFTParser::add_word(
    lib::ObFTParserParam *param,
    common::ObIAllocator *allocator,
    const char *word,
    int64_t word_len)
{
  int ret = OB_SUCCESS;
  char *buf = nullptr;
  if (OB_ISNULL(param)
      || OB_ISNULL(allocator)
      || OB_ISNULL(word)
      || OB_UNLIKELY(0 >= word_len)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid arguments", K(ret), KPC(param), KP(allocator), KP(word), K(word_len));
  } else if (FT_MIN_WORD_LEN <= word_len && word_len <= FT_MAX_WORD_LEN) {
    // Length is acceptable: duplicate the token before handing it over.
    buf = static_cast<char *>(allocator->alloc(word_len));
    if (OB_ISNULL(buf)) {
      ret = OB_ALLOCATE_MEMORY_FAILED;
      LOG_WARN("fail to allocate word memory", K(ret), K(word_len));
    } else {
      MEMCPY(buf, word, word_len);
      if (OB_FAIL(param->add_word(param, buf, word_len, word_len))) {
        LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, buf)), K(ObString(word_len, word)));
      } else {
        LOG_DEBUG("succeed to add word", K(ObString(word_len, word)));
      }
    }
  } else {
    LOG_DEBUG("skip too small or large word", K(ret), K(word_len));
  }
  return ret;
}
/**
 * Prepare the parser for one segmentation run.
 *
 * Copies the charset of `param` into the analysis context, turns stopword
 * filtering and grouping off in that context, and initializes the english
 * analyzer with param's allocator.
 *
 * @return OB_SUCCESS, OB_INIT_TWICE when already initialized,
 *         OB_INVALID_ARGUMENT for a null / invalid param, OB_NOT_SUPPORTED when
 *         the fulltext is longer than UINT32_MAX, or the analyzer's error.
 *         On failure of a not-yet-initialized parser the object is reset().
 */
int ObBEngFTParser::init(lib::ObFTParserParam *param)
{
  int ret = OB_SUCCESS;
  // Step 1: precondition checks, each of which sets an error code.
  if (OB_UNLIKELY(is_inited_)) {
    ret = OB_INIT_TWICE;
    LOG_WARN("init twice", K(ret), K(is_inited_));
  } else if (OB_ISNULL(param) || OB_UNLIKELY(!param->is_valid())) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("param is nullptr", K(ret), KPC(param));
  } else if (OB_UNLIKELY(UINT32_MAX < param->ft_length_)) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_));
  }
  // Step 2: only reached with ret still OB_SUCCESS when every check passed.
  if (OB_SUCC(ret)) {
    analysis_ctx_.cs_ = param->cs_;
    analysis_ctx_.filter_stopword_ = false;
    analysis_ctx_.need_grouping_ = false;
    if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) {
      LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_));
    } else {
      is_inited_ = true;
    }
  }
  // Step 3: roll back partial state, but never tear down an already
  // initialized parser (the OB_INIT_TWICE case).
  if (OB_SUCCESS != ret && !is_inited_) {
    reset();
  }
  return ret;
}
/**
 * Run the english analyzer over `doc` and return its token stream.
 *
 * @param doc           document to analyze; ptr_ must be non-null and len_
 *                      positive and not larger than UINT32_MAX.
 * @param token_stream  [out] set by the analyzer on success.
 * @return OB_SUCCESS, OB_NOT_INIT when init() has not succeeded yet,
 *         OB_INVALID_ARGUMENT for an empty document, OB_NOT_SUPPORTED for a
 *         too large document, or the analyzer's error.
 */
int ObBEngFTParser::segment(
    const common::ObDatum &doc,
    share::ObITokenStream *&token_stream)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(!is_inited_)) {
    // The analyzer is only set up by init(); refuse to use it before that.
    ret = OB_NOT_INIT;
    LOG_WARN("basic english parser hasn't been initialized", K(ret), K(is_inited_));
  } else if (OB_ISNULL(doc.ptr_) || OB_UNLIKELY(0 >= doc.len_)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid arguments", K(ret), KP(doc.ptr_), K(doc.len_));
  } else if (OB_UNLIKELY(UINT32_MAX < doc.len_)) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(doc.len_));
  } else if (OB_FAIL(english_analyzer_.analyze(doc, token_stream))) {
    LOG_WARN("fail to analyze document", K(ret), K(english_analyzer_), KP(doc.ptr_), K(doc.len_));
  }
  return ret;
}
// Return the parser to its un-initialized state: resets the analysis context
// and the english analyzer, then clears is_inited_ so init() may run again.
void ObBEngFTParser::reset()
{
analysis_ctx_.reset();
english_analyzer_.reset();
is_inited_ = false;
}
// Construct the descriptor in the un-initialized state; segment() rejects
// calls with OB_NOT_INIT until init() has been called.
ObBasicEnglishFTParserDesc::ObBasicEnglishFTParserDesc()
: is_inited_(false)
{
}
// Mark the descriptor as initialized. `param` is not used and the call always
// returns OB_SUCCESS.
int ObBasicEnglishFTParserDesc::init(lib::ObPluginParam *param)
{
is_inited_ = true;
return OB_SUCCESS;
}
// Put the descriptor back into the un-initialized state via reset(). `param`
// is not used and the call always returns OB_SUCCESS.
int ObBasicEnglishFTParserDesc::deinit(lib::ObPluginParam *param)
{
reset();
return OB_SUCCESS;
}
/**
 * Plugin entry point: segment the fulltext carried by `param` with the basic
 * english parser.
 *
 * @param param  parser parameter; must be non-null, valid, and carry a
 *               non-null fulltext_.
 * @return OB_SUCCESS, OB_NOT_INIT when init() has not been called,
 *         OB_INVALID_ARGUMENT for a bad param, or the error returned by
 *         ObBEngFTParser::segment().
 */
int ObBasicEnglishFTParserDesc::segment(lib::ObFTParserParam *param) const
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(!is_inited_)) {
    ret = OB_NOT_INIT;
    LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_));
  } else if (OB_ISNULL(param) || OB_ISNULL(param->fulltext_) || OB_UNLIKELY(!param->is_valid())) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument", K(ret), KPC(param));
  } else if (OB_FAIL(ObBEngFTParser::segment(param, param->fulltext_, param->ft_length_))) {
    // fulltext_ is handed around as a (pointer, length) pair and is not known
    // to be NUL-terminated, so only its address is logged (KP), the same way
    // ObBEngFTParser::segment() logs its `ft` argument.
    LOG_WARN("fail to segment words for fulltext by beng", K(ret), KPC(param),
        KP(param->fulltext_), K(param->ft_length_));
  }
  return ret;
}
} // end namespace storage
} // end namespace oceanbase

View File

@ -0,0 +1,83 @@
/**
* Copyright (c) 2023 OceanBase
* OceanBase is licensed under Mulan PubL v2.
* You can use this software according to the terms and conditions of the Mulan PubL v2.
* You may obtain a copy of Mulan PubL v2 at:
* http://license.coscl.org.cn/MulanPubL-2.0
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PubL v2 for more details.
*/
#ifndef OB_BENG_FT_PARSER_H_
#define OB_BENG_FT_PARSER_H_
#include "lib/ob_plugin.h"
#include "lib/utility/ob_macro_utils.h"
#include "lib/utility/ob_print_utils.h"
#include "share/text_analysis/ob_text_analyzer.h"
namespace oceanbase
{
namespace storage
{
// Basic english fulltext parser. The only public operation is the static
// segment(), which creates a parser instance internally; the constructor and
// destructor are private, so the class cannot be instantiated from outside.
class ObBEngFTParser final
{
public:
// Words shorter than FT_MIN_WORD_LEN or longer than FT_MAX_WORD_LEN are
// skipped by add_word().
static const int64_t FT_MIN_WORD_LEN = 3;
static const int64_t FT_MAX_WORD_LEN = 84;
public:
// Segment `fulltext` (of length `ft_len`) and report each accepted word
// through the add-word callback of `param`.
static int segment(
lib::ObFTParserParam *param,
const char *fulltext,
const int64_t ft_len);
private:
ObBEngFTParser()
: analysis_ctx_(),
english_analyzer_(),
is_inited_(false)
{}
~ObBEngFTParser() = default;
// Copy `word` into memory from `allocator` and pass the copy to param's
// add-word callback; words outside the min/max length are skipped.
static int add_word(
lib::ObFTParserParam *param,
common::ObIAllocator *allocator,
const char *word,
int64_t word_len);
// Set up the analysis context and the english analyzer from `param`.
int init(lib::ObFTParserParam *param);
// Reset context and analyzer and clear is_inited_.
void reset();
// Analyze `doc` with the english analyzer; the resulting stream is returned
// through `token_stream`.
int segment(
const common::ObDatum &doc,
share::ObITokenStream *&token_stream);
TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), K_(is_inited));
private:
share::ObTextAnalysisCtx analysis_ctx_;
share::ObEnglishTextAnalyzer english_analyzer_;
bool is_inited_; // set by a successful init(), cleared by reset()
DISALLOW_COPY_AND_ASSIGN(ObBEngFTParser);
};
// Plugin descriptor (lib::ObIFTParserDesc implementation) for the basic
// english fulltext parser.
class ObBasicEnglishFTParserDesc final : public lib::ObIFTParserDesc
{
public:
ObBasicEnglishFTParserDesc();
virtual ~ObBasicEnglishFTParserDesc() = default;
// Mark the descriptor as initialized.
virtual int init(lib::ObPluginParam *param) override;
// Mark the descriptor as un-initialized again.
virtual int deinit(lib::ObPluginParam *param) override;
// Segment the fulltext carried by `param`; fails with OB_NOT_INIT before init().
virtual int segment(lib::ObFTParserParam *param) const override;
OB_INLINE void reset() { is_inited_ = false; }
private:
bool is_inited_; // true between init() and deinit()/reset()
};
// Descriptor instance for the basic english parser plugin.
// NOTE(review): a `static` object defined in a header has internal linkage, so
// every translation unit that includes this header gets its own copy with its
// own is_inited_ state - confirm the header is included by a single
// registration unit only.
static ObBasicEnglishFTParserDesc beng_parser;
} // end namespace storage
} // end namespace oceanbase
#endif // OB_BENG_FT_PARSER_H_

View File

@ -15,6 +15,7 @@
#include "storage/fts/ob_whitespace_ft_parser.h"
#include "storage/fts/ob_ngram_ft_parser.h"
#include "storage/fts/ob_beng_ft_parser.h"
///////////////////////////////////// Default fulltext parser //////////////////////////////////////////
@ -23,10 +24,10 @@ OB_DECLARE_PLUGIN(whitespace_parser)
oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type
"space", // name
OB_PLUGIN_AUTHOR_OCEANBASE, // author
"This is a default space parser plugin.", // brief specification
"This is a default whitespace parser plugin.", // brief specification
0x00001, // version
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
&oceanbase::storage::whitespace_parser, // default space parser plugin instance
&oceanbase::storage::whitespace_parser, // default space parser plugin instance
};
OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInWhitespaceFTParser, whitespace_parser);
@ -46,4 +47,19 @@ OB_DECLARE_PLUGIN(ngram_parser)
OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInNgramFTParser, ngram_parser);
///////////////////////////////////// Basic english fulltext parser //////////////////////////////////////////
// Plugin declaration for the built-in basic english ("beng") fulltext parser.
OB_DECLARE_PLUGIN(beng_parser)
{
oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type
"beng", // name
OB_PLUGIN_AUTHOR_OCEANBASE, // author
"This is a basic english parser plugin.", // brief specification
0x00001, // version
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
&oceanbase::storage::beng_parser, // basic english parser plugin instance
};
OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInBEngFTParser, beng_parser);
#endif // OB_FTS_BUILD_IN_PARSER_REGISTER_H_

View File

@ -145,7 +145,7 @@ ObFTParseHelper::ObFTParseHelper()
allocator_(nullptr),
parser_desc_(nullptr),
parser_name_(),
filter_stopword_(false),
add_word_flag_(),
is_inited_(false)
{
}
@ -178,7 +178,9 @@ int ObFTParseHelper::init(
LOG_WARN("fail to get fulltext parser descriptor", K(ret), KPC(parse_handler));
} else {
plugin_param_.desc_ = parser_desc_;
filter_stopword_ = need_stopword_list(parser_name_);
if (need_min_max_word(parser_name_)) { add_word_flag_.set_min_max_word(); }
if (need_castdn(parser_name_)) { add_word_flag_.set_casedown(); }
if (need_stopword_list(parser_name_)) { add_word_flag_.set_stop_word(); }
allocator_ = allocator;
is_inited_ = true;
}
@ -193,7 +195,7 @@ void ObFTParseHelper::reset()
parser_desc_ = nullptr;
plugin_param_.reset();
allocator_ = nullptr;
filter_stopword_ = false;
add_word_flag_.clear();
is_inited_ = false;
}
@ -220,17 +222,14 @@ int ObFTParseHelper::segment(
LOG_WARN("unexpected error, charset info is nullptr", K(ret), K(type));
} else {
words.reuse();
lib::ObFTParserParam::ObIAddWord *add_word = nullptr;
if (OB_FAIL(alloc_add_word(type, words, add_word))) {
LOG_WARN("fail to allocate add word", K(ret), K(type));
} else if (OB_FAIL(segment(parser_name_.get_parser_version(), parser_desc_, cs, fulltext, fulltext_len, *allocator_,
*add_word))) {
ObAddWord add_word(type, add_word_flag_, *allocator_, words);
if (OB_FAIL(segment(parser_name_.get_parser_version(), parser_desc_, cs, fulltext, fulltext_len, *allocator_,
add_word))) {
LOG_WARN("fail to segment fulltext", K(ret), K(parser_name_), KP(parser_desc_), KP(cs), KP(fulltext),
K(fulltext_len), KP(allocator_));
} else {
doc_length = add_word->get_add_word_count();
doc_length = add_word.get_add_word_count();
}
free_add_word(add_word);
}
LOG_DEBUG("ft parse segment", K(ret), K(type), K(ObString(fulltext_len, fulltext)), K(words));
return ret;
@ -238,45 +237,23 @@ int ObFTParseHelper::segment(
bool ObFTParseHelper::need_stopword_list(const ObFTParser &parser)
{
share::ObPluginName name("space");
return parser.get_parser_name() == name;
share::ObPluginName space("space");
share::ObPluginName beng("beng");
return parser.get_parser_name() == space || parser.get_parser_name() == beng;
}
int ObFTParseHelper::alloc_add_word(
const ObCollationType &type,
common::ObIArray<ObFTWord> &words,
lib::ObFTParserParam::ObIAddWord *&add_word) const
bool ObFTParseHelper::need_min_max_word(const ObFTParser &parser)
{
int ret = OB_SUCCESS;
common::ObMemAttr mem_attr(MTL_ID(), "FTAddWord");
void *buf = nullptr;
const int64_t buf_size = filter_stopword_ ? sizeof(ObStopWordAddWord) : sizeof(ObNoStopWordAddWord);
if (OB_NOT_NULL(add_word)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("add word isn't nullptr", K(ret), KPC(add_word));
} else if (OB_ISNULL(buf = ob_malloc(buf_size, mem_attr))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("fail to allocate memory", K(ret), K(buf_size));
} else if (filter_stopword_) {
add_word = new (buf) ObStopWordAddWord(type, *allocator_, words);
} else {
add_word = new (buf) ObNoStopWordAddWord(type, *allocator_, words);
}
if (OB_FAIL(ret) && OB_NOT_NULL(buf)) {
ob_free(buf);
buf = nullptr;
add_word = nullptr;
}
return ret;
share::ObPluginName space("space");
share::ObPluginName beng("beng");
return parser.get_parser_name() == space || parser.get_parser_name() == beng;
}
void ObFTParseHelper::free_add_word(lib::ObFTParserParam::ObIAddWord *&add_word) const
bool ObFTParseHelper::need_castdn(const ObFTParser &parser)
{
if (OB_NOT_NULL(add_word)) {
add_word->~ObIAddWord();
ob_free(static_cast<void *>(add_word));
add_word = nullptr;
}
share::ObPluginName space("space");
share::ObPluginName ngram("ngram");
return parser.get_parser_name() == space || parser.get_parser_name() == ngram;
}
} // end namespace storage

View File

@ -107,6 +107,8 @@ private:
common::ObIAllocator &allocator,
lib::ObFTParserParam::ObIAddWord &add_word);
static bool need_stopword_list(const ObFTParser &parser);
static bool need_castdn(const ObFTParser &parser);
static bool need_min_max_word(const ObFTParser &parser);
int alloc_add_word(
const ObCollationType &type,
@ -119,7 +121,7 @@ private:
common::ObIAllocator *allocator_;
lib::ObIFTParserDesc *parser_desc_;
ObFTParser parser_name_;
bool filter_stopword_;
ObAddWordFlag add_word_flag_;
bool is_inited_;
private:

View File

@ -56,6 +56,8 @@ int ObTenantFTPluginMgr::register_plugins()
LOG_WARN("fail to register default fulltext parser", K(ret));
} else if (OB_FAIL(register_plugin<ObBuildInNgramFTParser>())) {
LOG_WARN("fail to register ngram fulltext parser", K(ret));
} else if (OB_FAIL(register_plugin<ObBuildInBEngFTParser>())) {
LOG_WARN("fail to register basic english fulltext parser", K(ret));
}
return ret;
}
@ -67,6 +69,8 @@ void ObTenantFTPluginMgr::unregister_plugins()
LOG_ERROR("fail to unregister default fulltext parser", K(ret));
} else if (OB_FAIL(unregister_plugin<ObBuildInNgramFTParser>())) {
LOG_ERROR("fail to unregister ngram fulltext parser", K(ret));
} else if (OB_FAIL(unregister_plugin<ObBuildInBEngFTParser>())) {
LOG_ERROR("fail to unregister basic english fulltext parser", K(ret));
}
}
@ -133,6 +137,8 @@ int ObTenantFTPluginMgr::init_plugin_handler()
LOG_WARN("fail to set default fulltext parser", K(ret));
} else if (OB_FAIL(set_plugin_handler<ObBuildInNgramFTParser>())) {
LOG_WARN("fail to set ngram fulltext parser", K(ret));
} else if (OB_FAIL(set_plugin_handler<ObBuildInBEngFTParser>())) {
LOG_WARN("fail to set basic english fulltext parser", K(ret));
}
return ret;
}

View File

@ -20,99 +20,87 @@ namespace oceanbase
namespace storage
{
ObNoStopWordAddWord::ObNoStopWordAddWord(
ObAddWord::ObAddWord(
const ObCollationType &type,
const ObAddWordFlag &flag,
common::ObIAllocator &allocator,
common::ObIArray<ObFTWord> &word)
: collation_type_(type),
allocator_(allocator),
words_(word),
word_count_(0)
min_max_word_cnt_(0),
non_stopword_cnt_(0),
stopword_cnt_(0),
flag_(flag)
{
}
int ObNoStopWordAddWord::operator()(
int ObAddWord::operator()(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len)
{
int ret = OB_SUCCESS;
char *w_buf = nullptr;
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
} else if (OB_ISNULL(w_buf = static_cast<char *>(allocator_.alloc(word_len)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("fail to allocate memory for fulltext word", K(ret), K(word_len));
} else {
MEMCPY(w_buf, word, word_len);
ObFTWord ft_word(word_len, w_buf, collation_type_);
if (OB_FAIL(words_.push_back(ft_word))) {
LOG_WARN("fail to push word into words array", K(ret), K(ft_word));
} else {
++word_count_;
}
}
if (OB_FAIL(ret)) {
if (OB_NOT_NULL(w_buf)) {
allocator_.free(w_buf);
w_buf = nullptr;
}
}
LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len));
return ret;
}
ObStopWordAddWord::ObStopWordAddWord(
const ObCollationType &type,
common::ObIAllocator &allocator,
common::ObIArray<ObFTWord> &word)
: collation_type_(type),
allocator_(allocator),
words_(word),
non_stopword_count_(0),
stopword_count_(0)
{
}
int ObStopWordAddWord::operator()(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len)
const int64_t word_len,
const int64_t char_cnt)
{
int ret = OB_SUCCESS;
bool is_stopword = false;
ObFTWord ft_word(word_len, word, collation_type_);
ObFTWord src_word(word_len, word, collation_type_);
ObFTWord dst_word;
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
} else if (OB_FAIL(OB_FT_PLUGIN_MGR.check_stopword(ft_word, is_stopword))) {
LOG_WARN("fail to check stopword", K(ret));
} else if (is_stopword) {
// the word is stop word, just skip it.
++stopword_count_;
} else if (is_min_max_word(char_cnt)) {
++min_max_word_cnt_;
LOG_DEBUG("skip too small or large word", K(ret), K(src_word), K(char_cnt));
} else if (OB_FAIL(casedown_word(src_word, dst_word))) {
LOG_WARN("fail to casedown word", K(ret), K(src_word));
} else if (check_stopword(dst_word, is_stopword)) {
LOG_WARN("fail to check stopword", K(ret), K(dst_word));
} else if (OB_UNLIKELY(is_stopword)) {
++stopword_cnt_;
LOG_DEBUG("skip stopword", K(ret), K(dst_word));
} else if (OB_FAIL(words_.push_back(dst_word))) {
LOG_WARN("fail to push word into words array", K(ret), K(dst_word));
} else {
char *w_buf = nullptr;
if (OB_ISNULL(w_buf = static_cast<char *>(allocator_.alloc(word_len)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("fail to allocate memory for fulltext word", K(ret), K(word_len));
} else {
MEMCPY(w_buf, word, word_len);
ObFTWord non_stopword_ft_word(word_len, w_buf, collation_type_);
if (OB_FAIL(words_.push_back(non_stopword_ft_word))) {
LOG_WARN("fail to push word into words array", K(ret), K(non_stopword_ft_word));
} else {
++non_stopword_count_;
}
}
if (OB_FAIL(ret)) {
if (OB_NOT_NULL(w_buf)) {
allocator_.free(w_buf);
w_buf = nullptr;
}
}
++non_stopword_cnt_;
LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len), K(char_cnt), K(src_word), K(dst_word));
}
return ret;
}
// Tell whether a word of `c_len` characters must be dropped by the min/max
// length filter. Always false when the min/max flag is not set.
bool ObAddWord::is_min_max_word(const int64_t c_len) const
{
  bool out_of_range = false;
  if (flag_.min_max_word()) {
    out_of_range = (c_len < FT_MIN_WORD_LEN) || (c_len > FT_MAX_WORD_LEN);
  }
  return out_of_range;
}
/**
 * Produce the word that is stored / checked against the stopword table.
 *
 * With the casedown flag set, `dst` receives the lower-cased form of `src`
 * (built by ObCharset::tolower with allocator_); without it `dst` is simply
 * assigned from `src`.
 * NOTE(review): in the pass-through case dst presumably still refers to the
 * caller's buffer - confirm that buffer outlives the words array.
 *
 * @return OB_SUCCESS, OB_INVALID_ARGUMENT for an empty `src`, or the error
 *         returned by ObCharset::tolower.
 */
int ObAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(src.empty())) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid src ft word", K(ret), K(src));
  } else if (!flag_.casedown()) {
    // No conversion requested: pass the word through unchanged.
    dst = src;
  } else {
    ObString dst_str;
    if (OB_FAIL(ObCharset::tolower(collation_type_, src.get_word(), dst_str, allocator_))) {
      LOG_WARN("fail to tolower", K(ret), K(src), K(collation_type_));
    } else {
      ObFTWord tmp(dst_str.length(), dst_str.ptr(), collation_type_);
      dst = tmp;
    }
  }
  return ret;
}
/**
 * Look `ft_word` up in the stopword table when the stopword flag is set.
 *
 * @param ft_word      word to check; must not be empty.
 * @param is_stopword  [out] written by the plugin manager's lookup; left
 *                     untouched when the stopword flag is not set.
 * @return OB_SUCCESS, OB_INVALID_ARGUMENT for an empty word, or the error
 *         returned by OB_FT_PLUGIN_MGR.check_stopword.
 */
int ObAddWord::check_stopword(const ObFTWord &ft_word, bool &is_stopword)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(ft_word.empty())) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid arguments", K(ret), K(ft_word));
  } else if (flag_.stopword() && OB_FAIL(OB_FT_PLUGIN_MGR.check_stopword(ft_word, is_stopword))) {
    LOG_WARN("fail to check stopword", K(ret));
  }
  return ret;
}

View File

@ -63,49 +63,38 @@ static const char ob_stop_word_list[][FTS_STOP_WORD_MAX_LENGTH] = {
"www"
};
class ObNoStopWordAddWord final : public lib::ObFTParserParam::ObIAddWord
class ObAddWord final : public lib::ObFTParserParam::ObIAddWord
{
public:
ObNoStopWordAddWord(
ObAddWord(
const ObCollationType &type,
const ObAddWordFlag &flag,
common::ObIAllocator &allocator,
common::ObIArray<ObFTWord> &word);
virtual ~ObNoStopWordAddWord() = default;
virtual ~ObAddWord() = default;
virtual int operator()(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len) override;
virtual int64_t get_add_word_count() const override { return word_count_; }
VIRTUAL_TO_STRING_KV(K_(collation_type), K_(word_count), K_(words));
const int64_t word_len,
const int64_t char_cnt) override;
virtual int64_t get_add_word_count() const override { return non_stopword_cnt_; }
VIRTUAL_TO_STRING_KV(K_(collation_type), K_(min_max_word_cnt), K_(non_stopword_cnt), K_(stopword_cnt),
K_(words));
public:
static const int64_t FT_MIN_WORD_LEN = 3;
static const int64_t FT_MAX_WORD_LEN = 84;
private:
OB_INLINE common::ObIArray<ObFTWord> &get_words() { return words_; }
bool is_min_max_word(const int64_t c_len) const;
int casedown_word(const ObFTWord &src, ObFTWord &dst);
int check_stopword(const ObFTWord &word, bool &is_stopword);
private:
ObCollationType collation_type_;
common::ObIAllocator &allocator_;
common::ObIArray<ObFTWord> &words_;
int64_t word_count_;
};
class ObStopWordAddWord final : public lib::ObFTParserParam::ObIAddWord
{
public:
ObStopWordAddWord(
const ObCollationType &type,
common::ObIAllocator &allocator,
common::ObIArray<ObFTWord> &word);
virtual ~ObStopWordAddWord() = default;
virtual int operator()(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len) override;
virtual int64_t get_add_word_count() const { return non_stopword_count_; }
VIRTUAL_TO_STRING_KV(K_(collation_type), K_(non_stopword_count), K_(stopword_count), K_(words));
private:
ObCollationType collation_type_;
common::ObIAllocator &allocator_;
common::ObIArray<ObFTWord> &words_;
int64_t non_stopword_count_;
int64_t stopword_count_;
int64_t min_max_word_cnt_;
int64_t non_stopword_cnt_;
int64_t stopword_cnt_;
ObAddWordFlag flag_;
};
} // end namespace storage

View File

@ -76,6 +76,36 @@ public:
int64_t word_cnt_;
};
/**
 * Small bit set selecting the optional steps applied to a word before it is
 * added: min/max length filtering, stopword filtering and case-down
 * conversion. A default-constructed object has no step enabled.
 */
class ObAddWordFlag final
{
private:
  static const uint64_t AWF_NONE = 0;
  static const uint64_t AWF_MIN_MAX_WORD = 1ULL << 0; // filter words that are less than a minimum or greater
                                                      // than a maximum word length.
  static const uint64_t AWF_STOPWORD = 1ULL << 1;     // filter by stop word table.
  static const uint64_t AWF_CASEDOWN = 1ULL << 2;     // convert characters from uppercase to lowercase.
public:
  ObAddWordFlag() : flag_(AWF_NONE) {}
  ~ObAddWordFlag() = default;
private:
  // Bit helpers; `flag` is one of (or a combination of) the AWF_* masks.
  void set_flag(const uint64_t flag) { flag_ |= flag; }
  void clear_flag(const uint64_t flag) { flag_ &= ~flag; }
  // True only when every bit of `flag` is set.
  bool has_flag(const uint64_t flag) const { return (flag_ & flag) == flag; }
public:
  void set_min_max_word() { set_flag(AWF_MIN_MAX_WORD); }
  void set_stop_word() { set_flag(AWF_STOPWORD); }
  void set_casedown() { set_flag(AWF_CASEDOWN); }
  // Disable every step at once.
  void clear() { flag_ = AWF_NONE; }
  void clear_min_max_word() { clear_flag(AWF_MIN_MAX_WORD); }
  void clear_stop_word() { clear_flag(AWF_STOPWORD); }
  void clear_casedown() { clear_flag(AWF_CASEDOWN); }
  bool min_max_word() const { return has_flag(AWF_MIN_MAX_WORD); }
  bool stopword() const { return has_flag(AWF_STOPWORD); }
  bool casedown() const { return has_flag(AWF_CASEDOWN); }
private:
  uint64_t flag_;
};
} // end namespace storage
} // end namespace oceanbase

View File

@ -56,8 +56,8 @@ namespace storage
++c_nums;
}
if (NGRAM_TOKEN_SIZE == c_nums) {
if (OB_FAIL(add_word(param, start, next - start))) {
LOG_WARN("fail to add word", K(ret), KP(param), KP(start), KP(next));
if (OB_FAIL(add_word(param, start, next - start, c_nums))) {
LOG_WARN("fail to add word", K(ret), KP(param), KP(start), KP(next), K(c_nums));
} else {
start += ob_mbcharlen_ptr(cs, start, end);
c_nums = NGRAM_TOKEN_SIZE - 1;
@ -71,7 +71,8 @@ namespace storage
/*static*/ int ObNgramFTParser::add_word(
lib::ObFTParserParam *param,
const char *word,
int64_t word_len)
const int64_t word_len,
const int64_t char_cnt)
{
int ret = OB_SUCCESS;
if (OB_ISNULL(param)
@ -79,8 +80,8 @@ namespace storage
|| OB_UNLIKELY(0 >= word_len)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
} else if (OB_FAIL(param->add_word(param, word, word_len))) {
LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, word)));
} else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) {
LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word)));
}
return ret;
}

View File

@ -37,7 +37,8 @@ private:
static int add_word(
lib::ObFTParserParam *param,
const char *word,
int64_t word_len);
const int64_t word_len,
const int64_t char_cnt);
private:
DISABLE_COPY_ASSIGN(ObNgramFTParser);
};

View File

@ -22,37 +22,51 @@ namespace oceanbase
namespace storage
{
/*static*/ int ObSpaceFTParser::segment(
#define true_word_char(ctype, character) ((ctype) & (_MY_U | _MY_L | _MY_NMR) || (character) == '_')
int ObSpaceFTParser::segment(
lib::ObFTParserParam *param,
const char *ft,
const int64_t ft_len)
{
int ret = OB_SUCCESS;
ObDatum doc;
doc.set_string(ft, ft_len);
ObSpaceFTParser parser;
share::ObITokenStream *token_stream = nullptr;
const char *start = ft;
const char *next = start;
const char *end = start + ft_len;
int mbl = 0;
if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len));
} else if (OB_FAIL(parser.init(param))) {
LOG_WARN("fail to initialize space parser", K(ret), KPC(param));
} else if (FALSE_IT(doc.set_string(ft, ft_len))) {
} else if (OB_FAIL(parser.segment(doc, token_stream))) {
LOG_WARN("fail to segment fulltext by parser", K(ret), KP(ft), K(ft_len));
} else if (OB_ISNULL(token_stream)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("token stream is nullptr", K(ret), KP(token_stream));
} else {
ObDatum token;
int64_t token_freq = 0;
while (OB_SUCC(ret)) {
if (OB_FAIL(token_stream->get_next(token, token_freq))) {
if (OB_ITER_END != ret) {
LOG_WARN("fail to get next token", K(ret), KPC(token_stream));
const ObCharsetInfo *cs = param->cs_;
while (OB_SUCC(ret) && next < end) {
while (next < end) {
int ctype;
mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end);
if (true_word_char(ctype, *next)) {
break;
}
next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
}
if (next >= end) {
ret = OB_ITER_END;
} else {
int64_t c_nums = 0;
start = next;
while (next < end) {
int ctype;
mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end);
if (!true_word_char(ctype, *next)) {
break;
}
++c_nums;
next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
}
if (0 < c_nums && OB_FAIL(add_word(param, start, next - start, c_nums))) {
LOG_WARN("fail to add word", K(ret), KPC(param), KP(start), K(next));
} else {
start = next;
}
} else if (OB_FAIL(add_word(param, param->allocator_, token.ptr_, token.len_))) {
LOG_WARN("fail to add word", K(ret), K(token), KPC(param));
}
}
if (OB_ITER_END == ret) {
@ -62,86 +76,24 @@ namespace storage
return ret;
}
/*static*/ int ObSpaceFTParser::add_word(
int ObSpaceFTParser::add_word(
lib::ObFTParserParam *param,
common::ObIAllocator *allocator,
const char *word,
int64_t word_len)
const int64_t word_len,
const int64_t char_cnt)
{
int ret = OB_SUCCESS;
char *buf = nullptr;
if (OB_ISNULL(param)
|| OB_ISNULL(allocator)
|| OB_ISNULL(word)
|| OB_UNLIKELY(0 >= word_len)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(allocator), KP(word), K(word_len));
} else if (word_len < FT_MIN_WORD_LEN || word_len > FT_MAX_WORD_LEN) {
LOG_DEBUG("skip too small or large word", K(ret), K(word_len));
} else if (OB_ISNULL(buf = static_cast<char *>(allocator->alloc(word_len)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("fail to allocate word memory", K(ret), K(word_len));
} else if (FALSE_IT(MEMCPY(buf, word, word_len))) {
} else if (OB_FAIL(param->add_word(param, buf, word_len))) {
LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, buf)), K(ObString(word_len, word)));
} else {
LOG_DEBUG("succeed to add word", K(ObString(word_len, word)));
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
} else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) {
LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word)));
}
return ret;
}
int ObSpaceFTParser::init(lib::ObFTParserParam *param)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(is_inited_)) {
ret = OB_INIT_TWICE;
LOG_WARN("init twice", K(ret), K(is_inited_));
} else if (OB_ISNULL(param) || OB_UNLIKELY(!param->is_valid())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("param is nullptr", K(ret), KPC(param));
} else if (OB_UNLIKELY(UINT32_MAX < param->ft_length_)) {
ret = OB_NOT_SUPPORTED;
LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_));
} else {
analysis_ctx_.cs_ = param->cs_;
analysis_ctx_.filter_stopword_ = false;
analysis_ctx_.need_grouping_ = false;
if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) {
LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_));
} else {
is_inited_ = true;
}
}
if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) {
reset();
}
return ret;
}
int ObSpaceFTParser::segment(
const common::ObDatum &doc,
share::ObITokenStream *&token_stream)
{
int ret = OB_SUCCESS;
if (OB_ISNULL(doc.ptr_) || OB_UNLIKELY(0 >= doc.len_)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KP(doc.ptr_), K(doc.len_));
} else if (OB_UNLIKELY(UINT32_MAX < doc.len_)) {
ret = OB_NOT_SUPPORTED;
LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(doc.len_));
} else if (OB_FAIL(english_analyzer_.analyze(doc, token_stream))) {
LOG_WARN("fail to analyze document", K(ret), K(english_analyzer_), KP(doc.ptr_), K(doc.len_));
}
return ret;
}
void ObSpaceFTParser::reset()
{
analysis_ctx_.reset();
english_analyzer_.reset();
is_inited_ = false;
}
ObWhiteSpaceFTParserDesc::ObWhiteSpaceFTParserDesc()
: is_inited_(false)
{

View File

@ -10,8 +10,8 @@
* See the Mulan PubL v2 for more details.
*/
#ifndef OB_DEFAULT_FT_PARSER_H_
#define OB_DEFAULT_FT_PARSER_H_
#ifndef OB_WHITESPACE_FT_PARSER_H_
#define OB_WHITESPACE_FT_PARSER_H_
#include "lib/ob_plugin.h"
#include "lib/utility/ob_macro_utils.h"
@ -26,40 +26,18 @@ namespace storage
class ObSpaceFTParser final
{
public:
static const int64_t FT_MIN_WORD_LEN = 3;
static const int64_t FT_MAX_WORD_LEN = 84;
public:
ObSpaceFTParser() = default;
~ObSpaceFTParser() = default;
static int segment(
lib::ObFTParserParam *param,
const char *fulltext,
const int64_t ft_len);
private:
ObSpaceFTParser()
: analysis_ctx_(),
english_analyzer_(),
is_inited_(false)
{}
~ObSpaceFTParser() = default;
static int add_word(
lib::ObFTParserParam *param,
common::ObIAllocator *allocator,
const char *word,
int64_t word_len);
int init(lib::ObFTParserParam *param);
void reset();
int segment(
const common::ObDatum &doc,
share::ObITokenStream *&token_stream);
TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), K_(is_inited));
private:
share::ObTextAnalysisCtx analysis_ctx_;
share::ObEnglishTextAnalyzer english_analyzer_;
bool is_inited_;
DISALLOW_COPY_AND_ASSIGN(ObSpaceFTParser);
const int64_t word_len,
const int64_t char_cnt);
};
class ObWhiteSpaceFTParserDesc final : public lib::ObIFTParserDesc
@ -76,8 +54,7 @@ private:
};
static ObWhiteSpaceFTParserDesc whitespace_parser;
} // end namespace storage
} // end namespace oceanbase
#endif // OB_DEFAULT_FT_PARSER_H_
#endif // OB_WHITESPACE_FT_PARSER_H_

View File

@ -81,16 +81,23 @@ public:
static const char *TEST_FULLTEXT;
static const int64_t TEST_WORD_COUNT = 5;
static const int64_t TEST_WORD_COUNT_WITHOUT_STOPWORD = 4;
static const int64_t FT_MIN_WORD_LEN = 3;
static const int64_t FT_MAX_WORD_LEN = 84;
public:
ObTestAddWord();
ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator);
virtual ~ObTestAddWord() = default;
virtual int operator()(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len) override;
const int64_t word_len,
const int64_t char_cnt) override;
virtual int64_t get_add_word_count() const override { return ith_word_; }
VIRTUAL_TO_STRING_KV(K_(ith_word));
private:
bool is_min_max_word(const int64_t c_len) const;
int casedown_word(const ObFTWord &src, ObFTWord &dst);
ObCollationType collation_type_;
common::ObIAllocator &allocator_;
const char *words_[TEST_WORD_COUNT];
const char *words_without_stopword_[TEST_WORD_COUNT_WITHOUT_STOPWORD];
int64_t ith_word_;
@ -98,26 +105,57 @@ private:
const char *ObTestAddWord::TEST_FULLTEXT = "OceanBase fulltext search is No.1 in the world.";
ObTestAddWord::ObTestAddWord()
: words_{"oceanbase", "fulltext", "search", "the", "world"},
// Builds the test word sink.
//
// @param type       collation stored in collation_type_ (used by casedown_word()).
// @param allocator  stored by reference in allocator_ (passed to ObCharset::tolower()),
//                   so it must outlive this object.
//
// words_ and words_without_stopword_ hold the lower-cased tokens the tests compare
// against, and ith_word_ starts at zero.
ObTestAddWord::ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator)
: collation_type_(type),
allocator_(allocator),
words_{"oceanbase", "fulltext", "search", "the", "world"},
words_without_stopword_{"oceanbase", "fulltext", "search", "world"},
ith_word_(0)
{
}
// Tells whether a word of `c_len` characters is too short or too long.
//
// @param c_len  character count of the word.
// @return true when c_len lies outside [FT_MIN_WORD_LEN, FT_MAX_WORD_LEN], false otherwise.
bool ObTestAddWord::is_min_max_word(const int64_t c_len) const
{
  const bool within_bounds = (FT_MIN_WORD_LEN <= c_len) && (c_len <= FT_MAX_WORD_LEN);
  return !within_bounds;
}
int ObTestAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(src.empty())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid src ft word", K(ret), K(src));
} else {
ObString dst_str;
if (OB_FAIL(ObCharset::tolower(collation_type_, src.get_word(), dst_str, allocator_))) {
LOG_WARN("fail to tolower", K(ret), K(src), K(collation_type_));
} else {
ObFTWord tmp(dst_str.length(), dst_str.ptr(), collation_type_);
dst = tmp;
}
}
return ret;
}
int ObTestAddWord::operator()(
lib::ObFTParserParam *param,
const char *word,
const int64_t word_len)
const int64_t word_len,
const int64_t char_cnt)
{
int ret = OB_SUCCESS;
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
ObFTWord src_word(word_len, word, collation_type_);
ObFTWord dst_word;
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len));
} else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], word, word_len))) {
LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len), K(char_cnt));
} else if (is_min_max_word(char_cnt)) {
// skip min/max word
} else if (OB_FAIL(casedown_word(src_word, dst_word))) {
LOG_WARN("fail to casedown word", K(ret), K(src_word));
} else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], dst_word.get_word().ptr(), dst_word.get_word().length()))) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]),
KCSTRING(word), K(word_len));
LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]), K(dst_word));
} else {
++ith_word_;
}
@ -136,17 +174,17 @@ public:
private:
lib::ObPluginParam plugin_param_;
lib::ObFTParserParam ft_parser_param_;
ObTestAddWord add_word_;
ObWhiteSpaceFTParserDesc desc_;
common::ObArenaAllocator allocator_;
ObTestAddWord add_word_;
};
TestDefaultFTParser::TestDefaultFTParser()
: plugin_param_(),
ft_parser_param_(),
add_word_(),
desc_(),
allocator_()
allocator_(),
add_word_(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_)
{
plugin_param_.desc_ = &desc_;
}
@ -190,7 +228,8 @@ TEST_F(TestDefaultFTParser, test_space_ft_parser_segment)
TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268)
{
common::ObArray<ObFTWord> words;
ObNoStopWordAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, allocator_, words);
ObAddWordFlag flag;
ObAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, flag, allocator_, words);
const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 ";
const int64_t ft_len = strlen(fulltext);
@ -291,7 +330,7 @@ void ObTestFTPluginHelper::TearDown()
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc));
// ASSERT_TRUE(nullptr != desc);
//
// ObTestAddWord test_add_word;
// ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_);
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_version*/, desc, cs_, TEST_FULLTEXT,
// strlen(TEST_FULLTEXT), allocator_, test_add_word));
//}
@ -326,7 +365,7 @@ void ObTestFTPluginHelper::TearDown()
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc));
// ASSERT_TRUE(nullptr != desc);
//
// ObTestAddWord test_add_word;
// ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_);
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_version*/, desc, cs_, TEST_FULLTEXT,
// strlen(TEST_FULLTEXT), allocator_, test_add_word));
//
@ -408,7 +447,7 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext)
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
ObTestAddWord test_add_word;
ObTestAddWord test_add_word(cs_type_, allocator_);
for (int64_t i = 0; i < words.count(); ++i) {
ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length()));
}
@ -514,7 +553,7 @@ const char *ObTestNgramFTParseHelper::name_ = "ngram.1";
ObTestNgramFTParseHelper::ObTestNgramFTParseHelper()
: plugin_name_(STRLEN(name_), name_),
ngram_words_{"Oc", "ce", "ea", "an", "nB", "Ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "No", "in", "th", "he", "wo", "or", "rl", "ld"},
ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"},
cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN),
allocator_()
{