From fb6f777b86a61dde93223a4224c93c7ea9dbc1ae Mon Sep 17 00:00:00 2001 From: saltonz Date: Fri, 26 Apr 2024 12:46:07 +0000 Subject: [PATCH] [bugfix] fix invalid character in tokenization --- src/share/text_analysis/ob_token_stream.cpp | 5 ++++- .../text_analysis/test_text_analyzer.cpp | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/share/text_analysis/ob_token_stream.cpp b/src/share/text_analysis/ob_token_stream.cpp index 40a0d5c5b..32791cf0d 100644 --- a/src/share/text_analysis/ob_token_stream.cpp +++ b/src/share/text_analysis/ob_token_stream.cpp @@ -116,8 +116,11 @@ int ObTextWhitespaceTokenizer::get_next(ObDatum &next_token, int64_t &token_freq const int64_t c_len = ob_mbcharlen_ptr(cs_, doc + trav_pos_, doc + doc_len); trav_pos_ += c_len; token_len += c_len; - if (trav_pos_ >= doc_len) { + if (trav_pos_ >= doc_len || 0 == c_len) { iter_end_ = true; + if (0 == token_len) { + ret = OB_ITER_END; + } break; } } diff --git a/unittest/share/text_analysis/test_text_analyzer.cpp b/unittest/share/text_analysis/test_text_analyzer.cpp index aa60cdec5..06ee5dbcf 100644 --- a/unittest/share/text_analysis/test_text_analyzer.cpp +++ b/unittest/share/text_analysis/test_text_analyzer.cpp @@ -192,6 +192,25 @@ TEST_F(TestTextAnalyzer, test_basic_english_analyzer) const int64_t tokens_len_6[token_cnt_6] = {9, 3, 5, 3}; const int64_t tokens_freq_6[token_cnt_6] = {3, 2, 1, 1}; analyze_test(analyzer, doc_6, doc_len_6, tokens_6, tokens_len_6, tokens_freq_6, token_cnt_6); + + // test invalid character + const int64_t doc_len_7 = 128; + const char doc_7[doc_len_7] = {(char)0xFF}; + const int64_t token_cnt_7 = 0; + const char *tokens_7[token_cnt_7] = {}; + const int64_t tokens_len_7[token_cnt_7] = {}; + const int64_t tokens_freq_7[token_cnt_7] = {}; + analyze_test(analyzer, doc_7, doc_len_7, tokens_7, tokens_len_7, tokens_freq_7, token_cnt_7); + + // test invalid character in string + const int64_t doc_len_8 = 128; + const char doc_8[doc_len_8] = {"test invalid character here"}; + ((char *)doc_8)[16] = char(0xFF); + const int64_t token_cnt_8 = 3; + const char *tokens_8[token_cnt_8] = {"test", "invalid", "cha"}; + const int64_t tokens_len_8[token_cnt_8] = {4, 7, 3}; + const int64_t tokens_freq_8[token_cnt_8] = {1, 1, 1}; + analyze_test(analyzer, doc_8, doc_len_8, tokens_8, tokens_len_8, tokens_freq_8, token_cnt_8); } }; // namespace share