[bugfix] fix handling of invalid characters in tokenization

This commit is contained in:
saltonz 2024-04-26 12:46:07 +00:00 committed by ob-robot
parent af51225977
commit fb6f777b86
2 changed files with 23 additions and 1 deletions

View File

@ -116,8 +116,11 @@ int ObTextWhitespaceTokenizer::get_next(ObDatum &next_token, int64_t &token_freq
const int64_t c_len = ob_mbcharlen_ptr(cs_, doc + trav_pos_, doc + doc_len);
trav_pos_ += c_len;
token_len += c_len;
if (trav_pos_ >= doc_len) {
if (trav_pos_ >= doc_len || 0 == c_len) {
iter_end_ = true;
if (0 == token_len) {
ret = OB_ITER_END;
}
break;
}
}

View File

@ -192,6 +192,25 @@ TEST_F(TestTextAnalyzer, test_basic_english_analyzer)
const int64_t tokens_len_6[token_cnt_6] = {9, 3, 5, 3};
const int64_t tokens_freq_6[token_cnt_6] = {3, 2, 1, 1};
analyze_test(analyzer, doc_6, doc_len_6, tokens_6, tokens_len_6, tokens_freq_6, token_cnt_6);
// test invalid character
const int64_t doc_len_7 = 128;
const char doc_7[doc_len_7] = {(char)0xFF};
const int64_t token_cnt_7 = 0;
const char *tokens_7[token_cnt_7] = {};
const int64_t tokens_len_7[token_cnt_7] = {};
const int64_t tokens_freq_7[token_cnt_7] = {};
analyze_test(analyzer, doc_7, doc_len_7, tokens_7, tokens_len_7, tokens_freq_7, token_cnt_7);
// test invalid character in string
const int64_t doc_len_8 = 128;
const char doc_8[doc_len_8] = {"test invalid character here"};
((char *)doc_8)[16] = char(0xFF);
const int64_t token_cnt_8 = 3;
const char *tokens_8[token_cnt_8] = {"test", "invalid", "cha"};
const int64_t tokens_len_8[token_cnt_8] = {4, 7, 3};
const int64_t tokens_freq_8[token_cnt_8] = {1, 1, 1};
analyze_test(analyzer, doc_8, doc_len_8, tokens_8, tokens_len_8, tokens_freq_8, token_cnt_8);
}
}; // namespace share