[FTS.BUGFIX] fix compatibility of MySQL default fulltext parser
This commit is contained in:
parent
1442b06e77
commit
c4ed3f10af
10
deps/oblib/src/lib/ob_plugin.h
vendored
10
deps/oblib/src/lib/ob_plugin.h
vendored
@ -223,7 +223,11 @@ public:
|
||||
public:
|
||||
ObIAddWord() = default;
|
||||
virtual ~ObIAddWord() = default;
|
||||
virtual int operator()(ObFTParserParam *param, const char *word, const int64_t word_len) = 0;
|
||||
virtual int operator()(
|
||||
ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt) = 0;
|
||||
virtual int64_t get_add_word_count() const = 0;
|
||||
DECLARE_PURE_VIRTUAL_TO_STRING;
|
||||
};
|
||||
@ -247,9 +251,9 @@ public:
|
||||
&& 0 < ft_length_
|
||||
&& 0 <= parser_version_;
|
||||
}
|
||||
inline int add_word(ObFTParserParam *param, const char *word, int64_t word_len)
|
||||
inline int add_word(ObFTParserParam *param, const char *word, const int64_t word_len, const int64_t char_cnt)
|
||||
{
|
||||
return (*add_word_)(param, word, word_len);
|
||||
return (*add_word_)(param, word, word_len, char_cnt);
|
||||
}
|
||||
inline void reset()
|
||||
{
|
||||
|
@ -170,6 +170,7 @@ ob_set_subtarget(ob_storage ckpt
|
||||
)
|
||||
|
||||
ob_set_subtarget(ob_storage fts
|
||||
fts/ob_beng_ft_parser.cpp
|
||||
fts/ob_fts_plugin_mgr.cpp
|
||||
fts/ob_fts_plugin_helper.cpp
|
||||
fts/ob_fts_stop_word.cpp
|
||||
|
179
src/storage/fts/ob_beng_ft_parser.cpp
Normal file
179
src/storage/fts/ob_beng_ft_parser.cpp
Normal file
@ -0,0 +1,179 @@
|
||||
/**
|
||||
* Copyright (c) 2023 OceanBase
|
||||
* OceanBase is licensed under Mulan PubL v2.
|
||||
* You can use this software according to the terms and conditions of the Mulan PubL v2.
|
||||
* You may obtain a copy of Mulan PubL v2 at:
|
||||
* http://license.coscl.org.cn/MulanPubL-2.0
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
||||
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
||||
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
||||
* See the Mulan PubL v2 for more details.
|
||||
*/
|
||||
|
||||
#define USING_LOG_PREFIX STORAGE_FTS
|
||||
|
||||
#include "lib/string/ob_string.h"
|
||||
#include "storage/fts/ob_beng_ft_parser.h"
|
||||
|
||||
using namespace oceanbase::common;
|
||||
|
||||
namespace oceanbase
|
||||
{
|
||||
namespace storage
|
||||
{
|
||||
|
||||
/**
 * Tokenizes a fulltext buffer with a temporary basic english parser and hands every token to
 * ObBEngFTParser::add_word(), which forwards it to the add-word callback carried by `param`.
 *
 * @param param   parser parameter; must be non-null and carry a charset (cs_). Its allocator_
 *                is passed on to add_word() for the per-token copies.
 * @param ft      start of the fulltext buffer; must be non-null.
 * @param ft_len  length of the buffer in bytes; must be in (0, UINT32_MAX].
 * @return OB_SUCCESS once the token stream is exhausted (OB_ITER_END is mapped to success);
 *         OB_INVALID_ARGUMENT or OB_NOT_SUPPORTED for rejected input; OB_ERR_UNEXPECTED when
 *         the analyzer yields no token stream; otherwise the first error reported by the
 *         parser, the token stream or add_word().
 */
/*static*/ int ObBEngFTParser::segment(
    lib::ObFTParserParam *param,
    const char *ft,
    const int64_t ft_len)
{
  int ret = OB_SUCCESS;
  ObDatum doc;
  ObBEngFTParser parser;
  share::ObITokenStream *token_stream = nullptr;
  if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len));
  } else if (OB_UNLIKELY(UINT32_MAX < ft_len)) {
    // Same limit that init() applies to param->ft_length_; enforce it on the length that is
    // actually stored into the datum so the two can never disagree.
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(ft_len));
  } else if (OB_FAIL(parser.init(param))) {
    LOG_WARN("fail to initialize basic english parser", K(ret), KPC(param));
  } else if (FALSE_IT(doc.set_string(ft, ft_len))) {
    // The datum is only filled once the input has been validated.
  } else if (OB_FAIL(parser.segment(doc, token_stream))) {
    LOG_WARN("fail to segment fulltext by parser", K(ret), KP(ft), K(ft_len));
  } else if (OB_ISNULL(token_stream)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("token stream is nullptr", K(ret), KP(token_stream));
  } else {
    ObDatum token;
    int64_t token_freq = 0;
    while (OB_SUCC(ret)) {
      if (OB_FAIL(token_stream->get_next(token, token_freq))) {
        if (OB_ITER_END != ret) {
          LOG_WARN("fail to get next token", K(ret), KPC(token_stream));
        }
      } else if (OB_FAIL(add_word(param, param->allocator_, token.ptr_, token.len_))) {
        LOG_WARN("fail to add word", K(ret), K(token), KPC(param));
      }
    }
    // Running out of tokens is the normal way to leave the loop.
    if (OB_ITER_END == ret) {
      ret = OB_SUCCESS;
    }
  }
  return ret;
}
|
||||
|
||||
/**
 * Copies one token into memory obtained from `allocator` and reports it through the add-word
 * callback of `param`.
 *
 * Words whose byte length lies outside [FT_MIN_WORD_LEN, FT_MAX_WORD_LEN] are skipped and the
 * call still returns OB_SUCCESS.
 *
 * @param param      parser parameter providing the add-word callback; must be non-null.
 * @param allocator  allocator for the word copy; must be non-null.
 * @param word       start of the token; must be non-null.
 * @param word_len   token length in bytes; must be positive.
 * @return OB_SUCCESS, OB_INVALID_ARGUMENT, OB_ALLOCATE_MEMORY_FAILED, or the callback's error.
 */
/*static*/ int ObBEngFTParser::add_word(
    lib::ObFTParserParam *param,
    common::ObIAllocator *allocator,
    const char *word,
    int64_t word_len)
{
  int ret = OB_SUCCESS;
  char *buf = nullptr;
  if (OB_ISNULL(param)
      || OB_ISNULL(allocator)
      || OB_ISNULL(word)
      || OB_UNLIKELY(0 >= word_len)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid arguments", K(ret), KPC(param), KP(allocator), KP(word), K(word_len));
  } else if (word_len < FT_MIN_WORD_LEN || word_len > FT_MAX_WORD_LEN) {
    LOG_DEBUG("skip too small or large word", K(ret), K(word_len));
  } else if (OB_ISNULL(buf = static_cast<char *>(allocator->alloc(word_len)))) {
    ret = OB_ALLOCATE_MEMORY_FAILED;
    LOG_WARN("fail to allocate word memory", K(ret), K(word_len));
  } else if (FALSE_IT(MEMCPY(buf, word, word_len))) {
  // NOTE(review): the byte length is also passed as the character count; this is only exact
  // for single-byte characters - confirm the tokens of this parser are always single-byte.
  } else if (OB_FAIL(param->add_word(param, buf, word_len, word_len))) {
    LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, buf)), K(ObString(word_len, word)));
  } else {
    LOG_DEBUG("succeed to add word", K(ObString(word_len, word)));
  }
  // Give the copy back when the callback rejected it, so a failed add does not keep the buffer.
  // NOTE(review): assumes the callback does not retain `buf` when it returns an error.
  if (OB_FAIL(ret) && OB_NOT_NULL(buf)) {
    allocator->free(buf);
    buf = nullptr;
  }
  return ret;
}
|
||||
|
||||
/**
 * Prepares the parser for one document: validates `param`, fills the analysis context from it
 * and initializes the english analyzer with the parameter's allocator.
 *
 * @param param  parser parameter; must be non-null, valid, and describe a document of at most
 *               UINT32_MAX bytes.
 * @return OB_SUCCESS, OB_INIT_TWICE, OB_INVALID_ARGUMENT, OB_NOT_SUPPORTED, or the analyzer's
 *         initialization error. On any failure of a not-yet-initialized parser the object is
 *         reset.
 */
int ObBEngFTParser::init(lib::ObFTParserParam *param)
{
  int ret = OB_SUCCESS;
  // Stage 1: reject bad state / bad input.
  if (OB_UNLIKELY(is_inited_)) {
    ret = OB_INIT_TWICE;
    LOG_WARN("init twice", K(ret), K(is_inited_));
  } else if (OB_ISNULL(param) || OB_UNLIKELY(!param->is_valid())) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("param is nullptr", K(ret), KPC(param));
  } else if (OB_UNLIKELY(UINT32_MAX < param->ft_length_)) {
    ret = OB_NOT_SUPPORTED;
    LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_));
  }
  // Stage 2: configure the context and bring up the analyzer.
  if (OB_SUCC(ret)) {
    analysis_ctx_.cs_ = param->cs_;
    analysis_ctx_.filter_stopword_ = false;
    analysis_ctx_.need_grouping_ = false;
    if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) {
      LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_));
    }
  }
  // Stage 3: commit, or roll back a half-initialized parser. An already initialized parser
  // (OB_INIT_TWICE) is left as it is.
  if (OB_SUCC(ret)) {
    is_inited_ = true;
  } else if (!is_inited_) {
    reset();
  }
  return ret;
}
|
||||
|
||||
int ObBEngFTParser::segment(
|
||||
const common::ObDatum &doc,
|
||||
share::ObITokenStream *&token_stream)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_ISNULL(doc.ptr_) || OB_UNLIKELY(0 >= doc.len_)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KP(doc.ptr_), K(doc.len_));
|
||||
} else if (OB_UNLIKELY(UINT32_MAX < doc.len_)) {
|
||||
ret = OB_NOT_SUPPORTED;
|
||||
LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(doc.len_));
|
||||
} else if (OB_FAIL(english_analyzer_.analyze(doc, token_stream))) {
|
||||
LOG_WARN("fail to analyze document", K(ret), K(english_analyzer_), KP(doc.ptr_), K(doc.len_));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Returns the parser to its un-initialized state: resets the analysis context and the english
// analyzer, then clears the init flag so init() may be called again.
void ObBEngFTParser::reset()
{
  analysis_ctx_.reset();
  english_analyzer_.reset();
  is_inited_ = false;
}
|
||||
|
||||
// Creates the descriptor in the un-initialized state; init() must run before segment().
ObBasicEnglishFTParserDesc::ObBasicEnglishFTParserDesc()
  : is_inited_(false)
{
}
|
||||
|
||||
// Marks the descriptor as initialized. `param` is not used; the call always returns OB_SUCCESS.
int ObBasicEnglishFTParserDesc::init(lib::ObPluginParam *param)
{
  is_inited_ = true;
  return OB_SUCCESS;
}
|
||||
|
||||
// Clears the init flag via reset(). `param` is not used; the call always returns OB_SUCCESS.
int ObBasicEnglishFTParserDesc::deinit(lib::ObPluginParam *param)
{
  reset();
  return OB_SUCCESS;
}
|
||||
|
||||
/**
 * Plugin entry point: segments the fulltext described by `param` with the basic english parser.
 *
 * @param param  parser parameter; must be non-null, valid, and carry a non-null fulltext_.
 * @return OB_NOT_INIT when init() has not run, OB_INVALID_ARGUMENT for a bad parameter,
 *         otherwise the result of ObBEngFTParser::segment().
 */
int ObBasicEnglishFTParserDesc::segment(lib::ObFTParserParam *param) const
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(!is_inited_)) {
    ret = OB_NOT_INIT;
    LOG_WARN("default ft parser desc hasn't be initialized", K(ret), K(is_inited_));
  } else {
    // Short-circuit order matters: `param` is dereferenced only after the null check.
    const bool usable_param = (nullptr != param) && (nullptr != param->fulltext_) && param->is_valid();
    if (OB_UNLIKELY(!usable_param)) {
      ret = OB_INVALID_ARGUMENT;
      LOG_WARN("invalid argument", K(ret), KPC(param));
    } else if (OB_FAIL(ObBEngFTParser::segment(param, param->fulltext_, param->ft_length_))) {
      LOG_WARN("fail to segment words for fulltext by beng", K(ret), KPC(param),
          K(param->fulltext_), K(param->ft_length_));
    }
  }
  return ret;
}
|
||||
|
||||
} // end namespace storage
|
||||
} // end namespace oceanbase
|
83
src/storage/fts/ob_beng_ft_parser.h
Normal file
83
src/storage/fts/ob_beng_ft_parser.h
Normal file
@ -0,0 +1,83 @@
|
||||
/**
|
||||
* Copyright (c) 2023 OceanBase
|
||||
* OceanBase is licensed under Mulan PubL v2.
|
||||
* You can use this software according to the terms and conditions of the Mulan PubL v2.
|
||||
* You may obtain a copy of Mulan PubL v2 at:
|
||||
* http://license.coscl.org.cn/MulanPubL-2.0
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
||||
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
||||
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
||||
* See the Mulan PubL v2 for more details.
|
||||
*/
|
||||
|
||||
#ifndef OB_BENG_FT_PARSER_H_
|
||||
#define OB_BENG_FT_PARSER_H_
|
||||
|
||||
#include "lib/ob_plugin.h"
|
||||
#include "lib/utility/ob_macro_utils.h"
|
||||
#include "lib/utility/ob_print_utils.h"
|
||||
#include "share/text_analysis/ob_text_analyzer.h"
|
||||
|
||||
namespace oceanbase
|
||||
{
|
||||
namespace storage
|
||||
{
|
||||
|
||||
// Basic english ("beng") fulltext parser. The only public entry point is the static segment();
// construction, init() and the per-document segment() are private and driven from there.
class ObBEngFTParser final
{
public:
  // Byte-length bounds applied by add_word(); words outside this range are skipped.
  static const int64_t FT_MIN_WORD_LEN = 3;
  static const int64_t FT_MAX_WORD_LEN = 84;
public:
  // Tokenizes `fulltext` (`ft_len` bytes) and reports each token through the add-word callback
  // carried by `param`.
  static int segment(
      lib::ObFTParserParam *param,
      const char *fulltext,
      const int64_t ft_len);

private:
  ObBEngFTParser()
    : analysis_ctx_(),
      english_analyzer_(),
      is_inited_(false)
  {}
  ~ObBEngFTParser() = default;

  // Copies `word` with `allocator` and forwards the copy to param's add-word callback.
  static int add_word(
      lib::ObFTParserParam *param,
      common::ObIAllocator *allocator,
      const char *word,
      int64_t word_len);
  // Validates `param` and initializes the analysis context and the english analyzer.
  int init(lib::ObFTParserParam *param);
  // Returns the parser to the un-initialized state.
  void reset();
  // Analyzes `doc` and returns the produced token stream through `token_stream`.
  int segment(
      const common::ObDatum &doc,
      share::ObITokenStream *&token_stream);
  TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), K_(is_inited));

private:
  share::ObTextAnalysisCtx analysis_ctx_;         // configuration handed to the analyzer
  share::ObEnglishTextAnalyzer english_analyzer_; // produces the token stream
  bool is_inited_;                                // true after a successful init()

  DISALLOW_COPY_AND_ASSIGN(ObBEngFTParser);
};
|
||||
|
||||
// Plugin descriptor that exposes the basic english parser through the lib::ObIFTParserDesc
// interface (init / deinit / segment).
class ObBasicEnglishFTParserDesc final : public lib::ObIFTParserDesc
{
public:
  ObBasicEnglishFTParserDesc();
  virtual ~ObBasicEnglishFTParserDesc() = default;
  // Marks the descriptor usable; must be called before segment().
  virtual int init(lib::ObPluginParam *param) override;
  // Returns the descriptor to the un-initialized state.
  virtual int deinit(lib::ObPluginParam *param) override;
  // Segments the fulltext described by `param`; fails with OB_NOT_INIT before init().
  virtual int segment(lib::ObFTParserParam *param) const override;
  OB_INLINE void reset() { is_inited_ = false; }
private:
  bool is_inited_; // set by init(), cleared by reset()/deinit()
};
|
||||
|
||||
// Descriptor instance referenced by the built-in plugin registration.
// NOTE(review): a namespace-scope `static` object defined in a header gives every translation
// unit that includes it a separate instance (with its own is_inited_) - confirm this header is
// only included where the plugin is registered.
static ObBasicEnglishFTParserDesc beng_parser;
|
||||
|
||||
} // end namespace storage
|
||||
} // end namespace oceanbase
|
||||
|
||||
#endif // OB_BENG_FT_PARSER_H_
|
@ -15,6 +15,7 @@
|
||||
|
||||
#include "storage/fts/ob_whitespace_ft_parser.h"
|
||||
#include "storage/fts/ob_ngram_ft_parser.h"
|
||||
#include "storage/fts/ob_beng_ft_parser.h"
|
||||
|
||||
///////////////////////////////////// Default fulltext parser //////////////////////////////////////////
|
||||
|
||||
@ -23,10 +24,10 @@ OB_DECLARE_PLUGIN(whitespace_parser)
|
||||
oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN, // fulltext parser type
|
||||
"space", // name
|
||||
OB_PLUGIN_AUTHOR_OCEANBASE, // author
|
||||
"This is a default space parser plugin.", // brief specification
|
||||
"This is a default whitespace parser plugin.", // brief specification
|
||||
0x00001, // version
|
||||
oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
|
||||
&oceanbase::storage::whitespace_parser, // default space parser plugin instance
|
||||
&oceanbase::storage::whitespace_parser, // default space parser plugin instance
|
||||
};
|
||||
|
||||
OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInWhitespaceFTParser, whitespace_parser);
|
||||
@ -46,4 +47,19 @@ OB_DECLARE_PLUGIN(ngram_parser)
|
||||
|
||||
OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInNgramFTParser, ngram_parser);
|
||||
|
||||
///////////////////////////////////// Basic english fulltext parser //////////////////////////////////////////
|
||||
|
||||
// Built-in plugin record for the basic english ("beng") fulltext parser.
OB_DECLARE_PLUGIN(beng_parser)
{
  oceanbase::lib::ObPluginType::OB_FT_PARSER_PLUGIN,        // fulltext parser type
  "beng",                                                   // name
  OB_PLUGIN_AUTHOR_OCEANBASE,                               // author
  "This is a basic english parser plugin.",                 // brief specification
  0x00001,                                                  // version
  oceanbase::lib::ObPluginLicenseType::OB_MULAN_V2_LICENSE, // Mulan PubL v2 license
  &oceanbase::storage::beng_parser,                         // basic english parser plugin instance
};
|
||||
|
||||
OB_DECLARE_BUILDIN_PLUGIN_HANDLER(ObBuildInBEngFTParser, beng_parser);
|
||||
|
||||
#endif // OB_FTS_BUILD_IN_PARSER_REGISTER_H_
|
||||
|
@ -145,7 +145,7 @@ ObFTParseHelper::ObFTParseHelper()
|
||||
allocator_(nullptr),
|
||||
parser_desc_(nullptr),
|
||||
parser_name_(),
|
||||
filter_stopword_(false),
|
||||
add_word_flag_(),
|
||||
is_inited_(false)
|
||||
{
|
||||
}
|
||||
@ -178,7 +178,9 @@ int ObFTParseHelper::init(
|
||||
LOG_WARN("fail to get fulltext parser descriptor", K(ret), KPC(parse_handler));
|
||||
} else {
|
||||
plugin_param_.desc_ = parser_desc_;
|
||||
filter_stopword_ = need_stopword_list(parser_name_);
|
||||
if (need_min_max_word(parser_name_)) { add_word_flag_.set_min_max_word(); }
|
||||
if (need_castdn(parser_name_)) { add_word_flag_.set_casedown(); }
|
||||
if (need_stopword_list(parser_name_)) { add_word_flag_.set_stop_word(); }
|
||||
allocator_ = allocator;
|
||||
is_inited_ = true;
|
||||
}
|
||||
@ -193,7 +195,7 @@ void ObFTParseHelper::reset()
|
||||
parser_desc_ = nullptr;
|
||||
plugin_param_.reset();
|
||||
allocator_ = nullptr;
|
||||
filter_stopword_ = false;
|
||||
add_word_flag_.clear();
|
||||
is_inited_ = false;
|
||||
}
|
||||
|
||||
@ -220,17 +222,14 @@ int ObFTParseHelper::segment(
|
||||
LOG_WARN("unexpected error, charset info is nullptr", K(ret), K(type));
|
||||
} else {
|
||||
words.reuse();
|
||||
lib::ObFTParserParam::ObIAddWord *add_word = nullptr;
|
||||
if (OB_FAIL(alloc_add_word(type, words, add_word))) {
|
||||
LOG_WARN("fail to allocate add word", K(ret), K(type));
|
||||
} else if (OB_FAIL(segment(parser_name_.get_parser_version(), parser_desc_, cs, fulltext, fulltext_len, *allocator_,
|
||||
*add_word))) {
|
||||
ObAddWord add_word(type, add_word_flag_, *allocator_, words);
|
||||
if (OB_FAIL(segment(parser_name_.get_parser_version(), parser_desc_, cs, fulltext, fulltext_len, *allocator_,
|
||||
add_word))) {
|
||||
LOG_WARN("fail to segment fulltext", K(ret), K(parser_name_), KP(parser_desc_), KP(cs), KP(fulltext),
|
||||
K(fulltext_len), KP(allocator_));
|
||||
} else {
|
||||
doc_length = add_word->get_add_word_count();
|
||||
doc_length = add_word.get_add_word_count();
|
||||
}
|
||||
free_add_word(add_word);
|
||||
}
|
||||
LOG_DEBUG("ft parse segment", K(ret), K(type), K(ObString(fulltext_len, fulltext)), K(words));
|
||||
return ret;
|
||||
@ -238,45 +237,23 @@ int ObFTParseHelper::segment(
|
||||
|
||||
bool ObFTParseHelper::need_stopword_list(const ObFTParser &parser)
|
||||
{
|
||||
share::ObPluginName name("space");
|
||||
return parser.get_parser_name() == name;
|
||||
share::ObPluginName space("space");
|
||||
share::ObPluginName beng("beng");
|
||||
return parser.get_parser_name() == space || parser.get_parser_name() == beng;
|
||||
}
|
||||
|
||||
int ObFTParseHelper::alloc_add_word(
|
||||
const ObCollationType &type,
|
||||
common::ObIArray<ObFTWord> &words,
|
||||
lib::ObFTParserParam::ObIAddWord *&add_word) const
|
||||
bool ObFTParseHelper::need_min_max_word(const ObFTParser &parser)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
common::ObMemAttr mem_attr(MTL_ID(), "FTAddWord");
|
||||
void *buf = nullptr;
|
||||
const int64_t buf_size = filter_stopword_ ? sizeof(ObStopWordAddWord) : sizeof(ObNoStopWordAddWord);
|
||||
if (OB_NOT_NULL(add_word)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("add word isn't nullptr", K(ret), KPC(add_word));
|
||||
} else if (OB_ISNULL(buf = ob_malloc(buf_size, mem_attr))) {
|
||||
ret = OB_ALLOCATE_MEMORY_FAILED;
|
||||
LOG_WARN("fail to allocate memory", K(ret), K(buf_size));
|
||||
} else if (filter_stopword_) {
|
||||
add_word = new (buf) ObStopWordAddWord(type, *allocator_, words);
|
||||
} else {
|
||||
add_word = new (buf) ObNoStopWordAddWord(type, *allocator_, words);
|
||||
}
|
||||
if (OB_FAIL(ret) && OB_NOT_NULL(buf)) {
|
||||
ob_free(buf);
|
||||
buf = nullptr;
|
||||
add_word = nullptr;
|
||||
}
|
||||
return ret;
|
||||
share::ObPluginName space("space");
|
||||
share::ObPluginName beng("beng");
|
||||
return parser.get_parser_name() == space || parser.get_parser_name() == beng;
|
||||
}
|
||||
|
||||
void ObFTParseHelper::free_add_word(lib::ObFTParserParam::ObIAddWord *&add_word) const
|
||||
bool ObFTParseHelper::need_castdn(const ObFTParser &parser)
|
||||
{
|
||||
if (OB_NOT_NULL(add_word)) {
|
||||
add_word->~ObIAddWord();
|
||||
ob_free(static_cast<void *>(add_word));
|
||||
add_word = nullptr;
|
||||
}
|
||||
share::ObPluginName space("space");
|
||||
share::ObPluginName ngram("ngram");
|
||||
return parser.get_parser_name() == space || parser.get_parser_name() == ngram;
|
||||
}
|
||||
|
||||
} // end namespace storage
|
||||
|
@ -107,6 +107,8 @@ private:
|
||||
common::ObIAllocator &allocator,
|
||||
lib::ObFTParserParam::ObIAddWord &add_word);
|
||||
static bool need_stopword_list(const ObFTParser &parser);
|
||||
static bool need_castdn(const ObFTParser &parser);
|
||||
static bool need_min_max_word(const ObFTParser &parser);
|
||||
|
||||
int alloc_add_word(
|
||||
const ObCollationType &type,
|
||||
@ -119,7 +121,7 @@ private:
|
||||
common::ObIAllocator *allocator_;
|
||||
lib::ObIFTParserDesc *parser_desc_;
|
||||
ObFTParser parser_name_;
|
||||
bool filter_stopword_;
|
||||
ObAddWordFlag add_word_flag_;
|
||||
bool is_inited_;
|
||||
|
||||
private:
|
||||
|
@ -56,6 +56,8 @@ int ObTenantFTPluginMgr::register_plugins()
|
||||
LOG_WARN("fail to register default fulltext parser", K(ret));
|
||||
} else if (OB_FAIL(register_plugin<ObBuildInNgramFTParser>())) {
|
||||
LOG_WARN("fail to register ngram fulltext parser", K(ret));
|
||||
} else if (OB_FAIL(register_plugin<ObBuildInBEngFTParser>())) {
|
||||
LOG_WARN("fail to register basic english fulltext parser", K(ret));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@ -67,6 +69,8 @@ void ObTenantFTPluginMgr::unregister_plugins()
|
||||
LOG_ERROR("fail to unregister default fulltext parser", K(ret));
|
||||
} else if (OB_FAIL(unregister_plugin<ObBuildInNgramFTParser>())) {
|
||||
LOG_ERROR("fail to unregister ngram fulltext parser", K(ret));
|
||||
} else if (OB_FAIL(unregister_plugin<ObBuildInBEngFTParser>())) {
|
||||
LOG_ERROR("fail to unregister basic english fulltext parser", K(ret));
|
||||
}
|
||||
}
|
||||
|
||||
@ -133,6 +137,8 @@ int ObTenantFTPluginMgr::init_plugin_handler()
|
||||
LOG_WARN("fail to set default fulltext parser", K(ret));
|
||||
} else if (OB_FAIL(set_plugin_handler<ObBuildInNgramFTParser>())) {
|
||||
LOG_WARN("fail to set ngram fulltext parser", K(ret));
|
||||
} else if (OB_FAIL(set_plugin_handler<ObBuildInBEngFTParser>())) {
|
||||
LOG_WARN("fail to set basic english fulltext parser", K(ret));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -20,99 +20,87 @@ namespace oceanbase
|
||||
namespace storage
|
||||
{
|
||||
|
||||
ObNoStopWordAddWord::ObNoStopWordAddWord(
|
||||
ObAddWord::ObAddWord(
|
||||
const ObCollationType &type,
|
||||
const ObAddWordFlag &flag,
|
||||
common::ObIAllocator &allocator,
|
||||
common::ObIArray<ObFTWord> &word)
|
||||
: collation_type_(type),
|
||||
allocator_(allocator),
|
||||
words_(word),
|
||||
word_count_(0)
|
||||
min_max_word_cnt_(0),
|
||||
non_stopword_cnt_(0),
|
||||
stopword_cnt_(0),
|
||||
flag_(flag)
|
||||
{
|
||||
}
|
||||
|
||||
int ObNoStopWordAddWord::operator()(
|
||||
int ObAddWord::operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
char *w_buf = nullptr;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
|
||||
} else if (OB_ISNULL(w_buf = static_cast<char *>(allocator_.alloc(word_len)))) {
|
||||
ret = OB_ALLOCATE_MEMORY_FAILED;
|
||||
LOG_WARN("fail to allocate memory for fulltext word", K(ret), K(word_len));
|
||||
} else {
|
||||
MEMCPY(w_buf, word, word_len);
|
||||
ObFTWord ft_word(word_len, w_buf, collation_type_);
|
||||
if (OB_FAIL(words_.push_back(ft_word))) {
|
||||
LOG_WARN("fail to push word into words array", K(ret), K(ft_word));
|
||||
} else {
|
||||
++word_count_;
|
||||
}
|
||||
}
|
||||
if (OB_FAIL(ret)) {
|
||||
if (OB_NOT_NULL(w_buf)) {
|
||||
allocator_.free(w_buf);
|
||||
w_buf = nullptr;
|
||||
}
|
||||
}
|
||||
LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len));
|
||||
return ret;
|
||||
}
|
||||
|
||||
ObStopWordAddWord::ObStopWordAddWord(
|
||||
const ObCollationType &type,
|
||||
common::ObIAllocator &allocator,
|
||||
common::ObIArray<ObFTWord> &word)
|
||||
: collation_type_(type),
|
||||
allocator_(allocator),
|
||||
words_(word),
|
||||
non_stopword_count_(0),
|
||||
stopword_count_(0)
|
||||
{
|
||||
}
|
||||
|
||||
int ObStopWordAddWord::operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len)
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
bool is_stopword = false;
|
||||
ObFTWord ft_word(word_len, word, collation_type_);
|
||||
ObFTWord src_word(word_len, word, collation_type_);
|
||||
ObFTWord dst_word;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
|
||||
} else if (OB_FAIL(OB_FT_PLUGIN_MGR.check_stopword(ft_word, is_stopword))) {
|
||||
LOG_WARN("fail to check stopword", K(ret));
|
||||
} else if (is_stopword) {
|
||||
// the word is stop word, just skip it.
|
||||
++stopword_count_;
|
||||
} else if (is_min_max_word(char_cnt)) {
|
||||
++min_max_word_cnt_;
|
||||
LOG_DEBUG("skip too small or large word", K(ret), K(src_word), K(char_cnt));
|
||||
} else if (OB_FAIL(casedown_word(src_word, dst_word))) {
|
||||
LOG_WARN("fail to casedown word", K(ret), K(src_word));
|
||||
} else if (check_stopword(dst_word, is_stopword)) {
|
||||
LOG_WARN("fail to check stopword", K(ret), K(dst_word));
|
||||
} else if (OB_UNLIKELY(is_stopword)) {
|
||||
++stopword_cnt_;
|
||||
LOG_DEBUG("skip stopword", K(ret), K(dst_word));
|
||||
} else if (OB_FAIL(words_.push_back(dst_word))) {
|
||||
LOG_WARN("fail to push word into words array", K(ret), K(dst_word));
|
||||
} else {
|
||||
char *w_buf = nullptr;
|
||||
if (OB_ISNULL(w_buf = static_cast<char *>(allocator_.alloc(word_len)))) {
|
||||
ret = OB_ALLOCATE_MEMORY_FAILED;
|
||||
LOG_WARN("fail to allocate memory for fulltext word", K(ret), K(word_len));
|
||||
} else {
|
||||
MEMCPY(w_buf, word, word_len);
|
||||
ObFTWord non_stopword_ft_word(word_len, w_buf, collation_type_);
|
||||
if (OB_FAIL(words_.push_back(non_stopword_ft_word))) {
|
||||
LOG_WARN("fail to push word into words array", K(ret), K(non_stopword_ft_word));
|
||||
} else {
|
||||
++non_stopword_count_;
|
||||
}
|
||||
}
|
||||
if (OB_FAIL(ret)) {
|
||||
if (OB_NOT_NULL(w_buf)) {
|
||||
allocator_.free(w_buf);
|
||||
w_buf = nullptr;
|
||||
}
|
||||
}
|
||||
++non_stopword_cnt_;
|
||||
LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len), K(char_cnt), K(src_word), K(dst_word));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool ObAddWord::is_min_max_word(const int64_t c_len) const
|
||||
{
|
||||
return flag_.min_max_word() && (c_len < FT_MIN_WORD_LEN || c_len > FT_MAX_WORD_LEN);
|
||||
}
|
||||
|
||||
int ObAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_UNLIKELY(src.empty())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid src ft word", K(ret), K(src));
|
||||
} else if (flag_.casedown()) {
|
||||
ObString dst_str;
|
||||
if (OB_FAIL(ObCharset::tolower(collation_type_, src.get_word(), dst_str, allocator_))) {
|
||||
LOG_WARN("fail to tolower", K(ret), K(src), K(collation_type_));
|
||||
} else {
|
||||
ObFTWord tmp(dst_str.length(), dst_str.ptr(), collation_type_);
|
||||
dst = tmp;
|
||||
}
|
||||
} else {
|
||||
dst = src;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObAddWord::check_stopword(const ObFTWord &ft_word, bool &is_stopword)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_UNLIKELY(ft_word.empty())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), K(ft_word));
|
||||
} else if (flag_.stopword() && OB_FAIL(OB_FT_PLUGIN_MGR.check_stopword(ft_word, is_stopword))) {
|
||||
LOG_WARN("fail to check stopword", K(ret));
|
||||
}
|
||||
LOG_DEBUG("add word", K(ret), KPC(param), KP(word), K(word_len), K(is_stopword));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -63,49 +63,38 @@ static const char ob_stop_word_list[][FTS_STOP_WORD_MAX_LENGTH] = {
|
||||
"www"
|
||||
};
|
||||
|
||||
class ObNoStopWordAddWord final : public lib::ObFTParserParam::ObIAddWord
|
||||
class ObAddWord final : public lib::ObFTParserParam::ObIAddWord
|
||||
{
|
||||
public:
|
||||
ObNoStopWordAddWord(
|
||||
ObAddWord(
|
||||
const ObCollationType &type,
|
||||
const ObAddWordFlag &flag,
|
||||
common::ObIAllocator &allocator,
|
||||
common::ObIArray<ObFTWord> &word);
|
||||
virtual ~ObNoStopWordAddWord() = default;
|
||||
virtual ~ObAddWord() = default;
|
||||
virtual int operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len) override;
|
||||
virtual int64_t get_add_word_count() const override { return word_count_; }
|
||||
VIRTUAL_TO_STRING_KV(K_(collation_type), K_(word_count), K_(words));
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt) override;
|
||||
virtual int64_t get_add_word_count() const override { return non_stopword_cnt_; }
|
||||
VIRTUAL_TO_STRING_KV(K_(collation_type), K_(min_max_word_cnt), K_(non_stopword_cnt), K_(stopword_cnt),
|
||||
K_(words));
|
||||
public:
|
||||
static const int64_t FT_MIN_WORD_LEN = 3;
|
||||
static const int64_t FT_MAX_WORD_LEN = 84;
|
||||
private:
|
||||
OB_INLINE common::ObIArray<ObFTWord> &get_words() { return words_; }
|
||||
bool is_min_max_word(const int64_t c_len) const;
|
||||
int casedown_word(const ObFTWord &src, ObFTWord &dst);
|
||||
int check_stopword(const ObFTWord &word, bool &is_stopword);
|
||||
private:
|
||||
ObCollationType collation_type_;
|
||||
common::ObIAllocator &allocator_;
|
||||
common::ObIArray<ObFTWord> &words_;
|
||||
int64_t word_count_;
|
||||
};
|
||||
|
||||
class ObStopWordAddWord final : public lib::ObFTParserParam::ObIAddWord
|
||||
{
|
||||
public:
|
||||
ObStopWordAddWord(
|
||||
const ObCollationType &type,
|
||||
common::ObIAllocator &allocator,
|
||||
common::ObIArray<ObFTWord> &word);
|
||||
virtual ~ObStopWordAddWord() = default;
|
||||
virtual int operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len) override;
|
||||
virtual int64_t get_add_word_count() const { return non_stopword_count_; }
|
||||
VIRTUAL_TO_STRING_KV(K_(collation_type), K_(non_stopword_count), K_(stopword_count), K_(words));
|
||||
private:
|
||||
ObCollationType collation_type_;
|
||||
common::ObIAllocator &allocator_;
|
||||
common::ObIArray<ObFTWord> &words_;
|
||||
int64_t non_stopword_count_;
|
||||
int64_t stopword_count_;
|
||||
int64_t min_max_word_cnt_;
|
||||
int64_t non_stopword_cnt_;
|
||||
int64_t stopword_cnt_;
|
||||
ObAddWordFlag flag_;
|
||||
};
|
||||
|
||||
} // end namespace storage
|
||||
|
@ -76,6 +76,36 @@ public:
|
||||
int64_t word_cnt_;
|
||||
};
|
||||
|
||||
// Bit set selecting which filters / conversions are applied to a word before it is added.
// Each option has a set_*, a clear_* and a query method; clear() drops every option.
class ObAddWordFlag final
{
private:
  static const uint64_t AWF_NONE = 0;
  static const uint64_t AWF_MIN_MAX_WORD = 1ULL << 0; // filter words that are less than a minimum or greater
                                                      // than a maximum word length.
  static const uint64_t AWF_STOPWORD = 1ULL << 1; // filter by stop word table.
  static const uint64_t AWF_CASEDOWN = 1ULL << 2; // convert characters from uppercase to lowercase.
public:
  ObAddWordFlag() : flag_(AWF_NONE) {}
  ~ObAddWordFlag() = default;
private:
  void set_flag(const uint64_t flag) { flag_ |= flag; }
  void clear_flag(const uint64_t flag) { flag_ &= ~flag; }
  // True only when every bit of `flag` is set.
  bool has_flag(const uint64_t flag) const { return (flag_ & flag) == flag; }
public:
  void set_min_max_word() { set_flag(AWF_MIN_MAX_WORD); }
  void set_stop_word() { set_flag(AWF_STOPWORD); }
  void set_casedown() { set_flag(AWF_CASEDOWN); }
  void clear() { flag_ = AWF_NONE; }
  void clear_min_max_word() { clear_flag(AWF_MIN_MAX_WORD); }
  void clear_stop_word() { clear_flag(AWF_STOPWORD); }
  void clear_casedown() { clear_flag(AWF_CASEDOWN); }
  bool min_max_word() const { return has_flag(AWF_MIN_MAX_WORD); }
  bool stopword() const { return has_flag(AWF_STOPWORD); }
  bool casedown() const { return has_flag(AWF_CASEDOWN); }
private:
  uint64_t flag_;
};
|
||||
|
||||
} // end namespace storage
|
||||
} // end namespace oceanbase
|
||||
|
||||
|
@ -56,8 +56,8 @@ namespace storage
|
||||
++c_nums;
|
||||
}
|
||||
if (NGRAM_TOKEN_SIZE == c_nums) {
|
||||
if (OB_FAIL(add_word(param, start, next - start))) {
|
||||
LOG_WARN("fail to add word", K(ret), KP(param), KP(start), KP(next));
|
||||
if (OB_FAIL(add_word(param, start, next - start, c_nums))) {
|
||||
LOG_WARN("fail to add word", K(ret), KP(param), KP(start), KP(next), K(c_nums));
|
||||
} else {
|
||||
start += ob_mbcharlen_ptr(cs, start, end);
|
||||
c_nums = NGRAM_TOKEN_SIZE - 1;
|
||||
@ -71,7 +71,8 @@ namespace storage
|
||||
/*static*/ int ObNgramFTParser::add_word(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
int64_t word_len)
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_ISNULL(param)
|
||||
@ -79,8 +80,8 @@ namespace storage
|
||||
|| OB_UNLIKELY(0 >= word_len)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
|
||||
} else if (OB_FAIL(param->add_word(param, word, word_len))) {
|
||||
LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, word)));
|
||||
} else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) {
|
||||
LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word)));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -37,7 +37,8 @@ private:
|
||||
static int add_word(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
int64_t word_len);
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt);
|
||||
private:
|
||||
DISABLE_COPY_ASSIGN(ObNgramFTParser);
|
||||
};
|
||||
|
@ -22,37 +22,51 @@ namespace oceanbase
|
||||
namespace storage
|
||||
{
|
||||
|
||||
/*static*/ int ObSpaceFTParser::segment(
|
||||
// A character is part of a word when its ctype has any of the _MY_U, _MY_L or _MY_NMR bits
// set, or when the character itself is an underscore.
#define true_word_char(ctype, character) \
  ((((ctype) & (_MY_U | _MY_L | _MY_NMR)) != 0) || ((character) == '_'))
|
||||
|
||||
int ObSpaceFTParser::segment(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *ft,
|
||||
const int64_t ft_len)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
ObDatum doc;
|
||||
doc.set_string(ft, ft_len);
|
||||
ObSpaceFTParser parser;
|
||||
share::ObITokenStream *token_stream = nullptr;
|
||||
const char *start = ft;
|
||||
const char *next = start;
|
||||
const char *end = start + ft_len;
|
||||
int mbl = 0;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(param->cs_) || OB_ISNULL(ft) || OB_UNLIKELY(0 >= ft_len)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(ft), K(ft_len));
|
||||
} else if (OB_FAIL(parser.init(param))) {
|
||||
LOG_WARN("fail to initialize space parser", K(ret), KPC(param));
|
||||
} else if (FALSE_IT(doc.set_string(ft, ft_len))) {
|
||||
} else if (OB_FAIL(parser.segment(doc, token_stream))) {
|
||||
LOG_WARN("fail to segment fulltext by parser", K(ret), KP(ft), K(ft_len));
|
||||
} else if (OB_ISNULL(token_stream)) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("token stream is nullptr", K(ret), KP(token_stream));
|
||||
} else {
|
||||
ObDatum token;
|
||||
int64_t token_freq = 0;
|
||||
while (OB_SUCC(ret)) {
|
||||
if (OB_FAIL(token_stream->get_next(token, token_freq))) {
|
||||
if (OB_ITER_END != ret) {
|
||||
LOG_WARN("fail to get next token", K(ret), KPC(token_stream));
|
||||
const ObCharsetInfo *cs = param->cs_;
|
||||
while (OB_SUCC(ret) && next < end) {
|
||||
while (next < end) {
|
||||
int ctype;
|
||||
mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end);
|
||||
if (true_word_char(ctype, *next)) {
|
||||
break;
|
||||
}
|
||||
next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
|
||||
}
|
||||
if (next >= end) {
|
||||
ret = OB_ITER_END;
|
||||
} else {
|
||||
int64_t c_nums = 0;
|
||||
start = next;
|
||||
while (next < end) {
|
||||
int ctype;
|
||||
mbl = cs->cset->ctype(cs, &ctype, (uchar *)next, (uchar *)end);
|
||||
if (!true_word_char(ctype, *next)) {
|
||||
break;
|
||||
}
|
||||
++c_nums;
|
||||
next += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
|
||||
}
|
||||
if (0 < c_nums && OB_FAIL(add_word(param, start, next - start, c_nums))) {
|
||||
LOG_WARN("fail to add word", K(ret), KPC(param), KP(start), K(next));
|
||||
} else {
|
||||
start = next;
|
||||
}
|
||||
} else if (OB_FAIL(add_word(param, param->allocator_, token.ptr_, token.len_))) {
|
||||
LOG_WARN("fail to add word", K(ret), K(token), KPC(param));
|
||||
}
|
||||
}
|
||||
if (OB_ITER_END == ret) {
|
||||
@ -62,86 +76,24 @@ namespace storage
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*static*/ int ObSpaceFTParser::add_word(
|
||||
int ObSpaceFTParser::add_word(
|
||||
lib::ObFTParserParam *param,
|
||||
common::ObIAllocator *allocator,
|
||||
const char *word,
|
||||
int64_t word_len)
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
char *buf = nullptr;
|
||||
if (OB_ISNULL(param)
|
||||
|| OB_ISNULL(allocator)
|
||||
|| OB_ISNULL(word)
|
||||
|| OB_UNLIKELY(0 >= word_len)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(allocator), KP(word), K(word_len));
|
||||
} else if (word_len < FT_MIN_WORD_LEN || word_len > FT_MAX_WORD_LEN) {
|
||||
LOG_DEBUG("skip too small or large word", K(ret), K(word_len));
|
||||
} else if (OB_ISNULL(buf = static_cast<char *>(allocator->alloc(word_len)))) {
|
||||
ret = OB_ALLOCATE_MEMORY_FAILED;
|
||||
LOG_WARN("fail to allocate word memory", K(ret), K(word_len));
|
||||
} else if (FALSE_IT(MEMCPY(buf, word, word_len))) {
|
||||
} else if (OB_FAIL(param->add_word(param, buf, word_len))) {
|
||||
LOG_WARN("fail to add word", K(ret), KPC(param), K(ObString(word_len, buf)), K(ObString(word_len, word)));
|
||||
} else {
|
||||
LOG_DEBUG("succeed to add word", K(ObString(word_len, word)));
|
||||
LOG_WARN("invalid arguments", K(ret), KPC(param), KP(word), K(word_len));
|
||||
} else if (OB_FAIL(param->add_word(param, word, word_len, char_cnt))) {
|
||||
LOG_WARN("fail to add word", K(ret), KPC(param), K(char_cnt), K(ObString(word_len, word)));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObSpaceFTParser::init(lib::ObFTParserParam *param)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_UNLIKELY(is_inited_)) {
|
||||
ret = OB_INIT_TWICE;
|
||||
LOG_WARN("init twice", K(ret), K(is_inited_));
|
||||
} else if (OB_ISNULL(param) || OB_UNLIKELY(!param->is_valid())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("param is nullptr", K(ret), KPC(param));
|
||||
} else if (OB_UNLIKELY(UINT32_MAX < param->ft_length_)) {
|
||||
ret = OB_NOT_SUPPORTED;
|
||||
LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(param->ft_length_));
|
||||
} else {
|
||||
analysis_ctx_.cs_ = param->cs_;
|
||||
analysis_ctx_.filter_stopword_ = false;
|
||||
analysis_ctx_.need_grouping_ = false;
|
||||
if (OB_FAIL(english_analyzer_.init(analysis_ctx_, *param->allocator_))) {
|
||||
LOG_WARN("fail to init english analyzer", K(ret), KPC(param), K(analysis_ctx_));
|
||||
} else {
|
||||
is_inited_ = true;
|
||||
}
|
||||
}
|
||||
if (OB_FAIL(ret) && OB_UNLIKELY(!is_inited_)) {
|
||||
reset();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObSpaceFTParser::segment(
|
||||
const common::ObDatum &doc,
|
||||
share::ObITokenStream *&token_stream)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_ISNULL(doc.ptr_) || OB_UNLIKELY(0 >= doc.len_)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KP(doc.ptr_), K(doc.len_));
|
||||
} else if (OB_UNLIKELY(UINT32_MAX < doc.len_)) {
|
||||
ret = OB_NOT_SUPPORTED;
|
||||
LOG_WARN("too large document, english analyzer hasn't be supported", K(ret), K(doc.len_));
|
||||
} else if (OB_FAIL(english_analyzer_.analyze(doc, token_stream))) {
|
||||
LOG_WARN("fail to analyze document", K(ret), K(english_analyzer_), KP(doc.ptr_), K(doc.len_));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void ObSpaceFTParser::reset()
|
||||
{
|
||||
analysis_ctx_.reset();
|
||||
english_analyzer_.reset();
|
||||
is_inited_ = false;
|
||||
}
|
||||
|
||||
ObWhiteSpaceFTParserDesc::ObWhiteSpaceFTParserDesc()
|
||||
: is_inited_(false)
|
||||
{
|
||||
|
@ -10,8 +10,8 @@
|
||||
* See the Mulan PubL v2 for more details.
|
||||
*/
|
||||
|
||||
#ifndef OB_DEFAULT_FT_PARSER_H_
|
||||
#define OB_DEFAULT_FT_PARSER_H_
|
||||
#ifndef OB_WHITESPACE_FT_PARSER_H_
|
||||
#define OB_WHITESPACE_FT_PARSER_H_
|
||||
|
||||
#include "lib/ob_plugin.h"
|
||||
#include "lib/utility/ob_macro_utils.h"
|
||||
@ -26,40 +26,18 @@ namespace storage
|
||||
class ObSpaceFTParser final
|
||||
{
|
||||
public:
|
||||
static const int64_t FT_MIN_WORD_LEN = 3;
|
||||
static const int64_t FT_MAX_WORD_LEN = 84;
|
||||
public:
|
||||
ObSpaceFTParser() = default;
|
||||
~ObSpaceFTParser() = default;
|
||||
static int segment(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *fulltext,
|
||||
const int64_t ft_len);
|
||||
|
||||
private:
|
||||
ObSpaceFTParser()
|
||||
: analysis_ctx_(),
|
||||
english_analyzer_(),
|
||||
is_inited_(false)
|
||||
{}
|
||||
~ObSpaceFTParser() = default;
|
||||
|
||||
static int add_word(
|
||||
lib::ObFTParserParam *param,
|
||||
common::ObIAllocator *allocator,
|
||||
const char *word,
|
||||
int64_t word_len);
|
||||
int init(lib::ObFTParserParam *param);
|
||||
void reset();
|
||||
int segment(
|
||||
const common::ObDatum &doc,
|
||||
share::ObITokenStream *&token_stream);
|
||||
TO_STRING_KV(K_(analysis_ctx), K_(english_analyzer), K_(is_inited));
|
||||
|
||||
private:
|
||||
share::ObTextAnalysisCtx analysis_ctx_;
|
||||
share::ObEnglishTextAnalyzer english_analyzer_;
|
||||
bool is_inited_;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(ObSpaceFTParser);
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt);
|
||||
};
|
||||
|
||||
class ObWhiteSpaceFTParserDesc final : public lib::ObIFTParserDesc
|
||||
@ -76,8 +54,7 @@ private:
|
||||
};
|
||||
|
||||
static ObWhiteSpaceFTParserDesc whitespace_parser;
|
||||
|
||||
} // end namespace storage
|
||||
} // end namespace oceanbase
|
||||
|
||||
#endif // OB_DEFAULT_FT_PARSER_H_
|
||||
#endif // OB_WHITESPACE_FT_PARSER_H_
|
||||
|
@ -81,16 +81,23 @@ public:
|
||||
static const char *TEST_FULLTEXT;
|
||||
static const int64_t TEST_WORD_COUNT = 5;
|
||||
static const int64_t TEST_WORD_COUNT_WITHOUT_STOPWORD = 4;
|
||||
static const int64_t FT_MIN_WORD_LEN = 3;
|
||||
static const int64_t FT_MAX_WORD_LEN = 84;
|
||||
public:
|
||||
ObTestAddWord();
|
||||
ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator);
|
||||
virtual ~ObTestAddWord() = default;
|
||||
virtual int operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len) override;
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt) override;
|
||||
virtual int64_t get_add_word_count() const override { return ith_word_; }
|
||||
VIRTUAL_TO_STRING_KV(K_(ith_word));
|
||||
private:
|
||||
bool is_min_max_word(const int64_t c_len) const;
|
||||
int casedown_word(const ObFTWord &src, ObFTWord &dst);
|
||||
ObCollationType collation_type_;
|
||||
common::ObIAllocator &allocator_;
|
||||
const char *words_[TEST_WORD_COUNT];
|
||||
const char *words_without_stopword_[TEST_WORD_COUNT_WITHOUT_STOPWORD];
|
||||
int64_t ith_word_;
|
||||
@ -98,26 +105,57 @@ private:
|
||||
|
||||
const char *ObTestAddWord::TEST_FULLTEXT = "OceanBase fulltext search is No.1 in the world.";
|
||||
|
||||
ObTestAddWord::ObTestAddWord()
|
||||
: words_{"oceanbase", "fulltext", "search", "the", "world"},
|
||||
ObTestAddWord::ObTestAddWord(const ObCollationType &type, common::ObIAllocator &allocator)
|
||||
: collation_type_(type),
|
||||
allocator_(allocator),
|
||||
words_{"oceanbase", "fulltext", "search", "the", "world"},
|
||||
words_without_stopword_{"oceanbase", "fulltext", "search", "world"},
|
||||
ith_word_(0)
|
||||
{
|
||||
}
|
||||
|
||||
bool ObTestAddWord::is_min_max_word(const int64_t c_len) const
|
||||
{
|
||||
return c_len < FT_MIN_WORD_LEN || c_len > FT_MAX_WORD_LEN;
|
||||
}
|
||||
|
||||
int ObTestAddWord::casedown_word(const ObFTWord &src, ObFTWord &dst)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_UNLIKELY(src.empty())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid src ft word", K(ret), K(src));
|
||||
} else {
|
||||
ObString dst_str;
|
||||
if (OB_FAIL(ObCharset::tolower(collation_type_, src.get_word(), dst_str, allocator_))) {
|
||||
LOG_WARN("fail to tolower", K(ret), K(src), K(collation_type_));
|
||||
} else {
|
||||
ObFTWord tmp(dst_str.length(), dst_str.ptr(), collation_type_);
|
||||
dst = tmp;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObTestAddWord::operator()(
|
||||
lib::ObFTParserParam *param,
|
||||
const char *word,
|
||||
const int64_t word_len)
|
||||
const int64_t word_len,
|
||||
const int64_t char_cnt)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len)) {
|
||||
ObFTWord src_word(word_len, word, collation_type_);
|
||||
ObFTWord dst_word;
|
||||
if (OB_ISNULL(param) || OB_ISNULL(word) || OB_UNLIKELY(0 >= word_len || 0 >= char_cnt)) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len));
|
||||
} else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], word, word_len))) {
|
||||
LOG_WARN("invalid arguments", K(ret), KP(word), KP(param), K(word_len), K(char_cnt));
|
||||
} else if (is_min_max_word(char_cnt)) {
|
||||
// skip min/max word
|
||||
} else if (OB_FAIL(casedown_word(src_word, dst_word))) {
|
||||
LOG_WARN("fail to casedown word", K(ret), K(src_word));
|
||||
} else if (OB_UNLIKELY(0 != strncmp(words_[ith_word_], dst_word.get_word().ptr(), dst_word.get_word().length()))) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]),
|
||||
KCSTRING(word), K(word_len));
|
||||
LOG_WARN("the ith word isn't default word", K(ret), K(ith_word_), KCSTRING(words_[ith_word_]), K(dst_word));
|
||||
} else {
|
||||
++ith_word_;
|
||||
}
|
||||
@ -136,17 +174,17 @@ public:
|
||||
private:
|
||||
lib::ObPluginParam plugin_param_;
|
||||
lib::ObFTParserParam ft_parser_param_;
|
||||
ObTestAddWord add_word_;
|
||||
ObWhiteSpaceFTParserDesc desc_;
|
||||
common::ObArenaAllocator allocator_;
|
||||
ObTestAddWord add_word_;
|
||||
};
|
||||
|
||||
TestDefaultFTParser::TestDefaultFTParser()
|
||||
: plugin_param_(),
|
||||
ft_parser_param_(),
|
||||
add_word_(),
|
||||
desc_(),
|
||||
allocator_()
|
||||
allocator_(),
|
||||
add_word_(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_)
|
||||
{
|
||||
plugin_param_.desc_ = &desc_;
|
||||
}
|
||||
@ -190,7 +228,8 @@ TEST_F(TestDefaultFTParser, test_space_ft_parser_segment)
|
||||
TEST_F(TestDefaultFTParser, test_space_ft_parser_segment_bug_56324268)
|
||||
{
|
||||
common::ObArray<ObFTWord> words;
|
||||
ObNoStopWordAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, allocator_, words);
|
||||
ObAddWordFlag flag;
|
||||
ObAddWord add_word(ObCollationType::CS_TYPE_LATIN1_SWEDISH_CI, flag, allocator_, words);
|
||||
const char *fulltext = "\201 想 将 数据 添加 到 数据库\f\026 ";
|
||||
const int64_t ft_len = strlen(fulltext);
|
||||
|
||||
@ -291,7 +330,7 @@ void ObTestFTPluginHelper::TearDown()
|
||||
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc));
|
||||
// ASSERT_TRUE(nullptr != desc);
|
||||
//
|
||||
// ObTestAddWord test_add_word;
|
||||
// ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_);
|
||||
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_vserion*/, desc, cs_, TEST_FULLTEXT,
|
||||
// strlen(TEST_FULLTEXT), allocator_, test_add_word));
|
||||
//}
|
||||
@ -326,7 +365,7 @@ void ObTestFTPluginHelper::TearDown()
|
||||
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::get_fulltext_parser_desc(handler_, desc));
|
||||
// ASSERT_TRUE(nullptr != desc);
|
||||
//
|
||||
// ObTestAddWord test_add_word;
|
||||
// ObTestAddWord test_add_word(ObCollationType::CS_TYPE_UTF8MB4_BIN, allocator_);
|
||||
// ASSERT_EQ(OB_SUCCESS, ObFTParseHelper::segment(1/*plugin_vserion*/, desc, cs_, TEST_FULLTEXT,
|
||||
// strlen(TEST_FULLTEXT), allocator_, test_add_word));
|
||||
//
|
||||
@ -408,7 +447,7 @@ TEST_F(ObTestFTParseHelper, test_parse_fulltext)
|
||||
ASSERT_EQ(OB_SUCCESS, parse_helper_.segment(cs_type_, ObTestAddWord::TEST_FULLTEXT,
|
||||
std::strlen(ObTestAddWord::TEST_FULLTEXT), doc_length, words));
|
||||
|
||||
ObTestAddWord test_add_word;
|
||||
ObTestAddWord test_add_word(cs_type_, allocator_);
|
||||
for (int64_t i = 0; i < words.count(); ++i) {
|
||||
ASSERT_TRUE(0 == strncmp(test_add_word.words_without_stopword_[i], words[i].word_.ptr(), words[i].word_.length()));
|
||||
}
|
||||
@ -514,7 +553,7 @@ const char *ObTestNgramFTParseHelper::name_ = "ngram.1";
|
||||
|
||||
ObTestNgramFTParseHelper::ObTestNgramFTParseHelper()
|
||||
: plugin_name_(STRLEN(name_), name_),
|
||||
ngram_words_{"Oc", "ce", "ea", "an", "nB", "Ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "No", "in", "th", "he", "wo", "or", "rl", "ld"},
|
||||
ngram_words_{"oc", "ce", "ea", "an", "nb", "ba", "as", "se", "fu", "ul", "ll", "lt", "te", "ex", "xt", "se", "ea", "ar", "rc", "ch", "is", "no", "in", "th", "he", "wo", "or", "rl", "ld"},
|
||||
cs_type_(ObCollationType::CS_TYPE_UTF8MB4_BIN),
|
||||
allocator_()
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user