[bugfix] fix handling of invalid characters in tokenization
This commit is contained in:
@ -116,8 +116,11 @@ int ObTextWhitespaceTokenizer::get_next(ObDatum &next_token, int64_t &token_freq
|
|||||||
const int64_t c_len = ob_mbcharlen_ptr(cs_, doc + trav_pos_, doc + doc_len);
|
const int64_t c_len = ob_mbcharlen_ptr(cs_, doc + trav_pos_, doc + doc_len);
|
||||||
trav_pos_ += c_len;
|
trav_pos_ += c_len;
|
||||||
token_len += c_len;
|
token_len += c_len;
|
||||||
if (trav_pos_ >= doc_len) {
|
if (trav_pos_ >= doc_len || 0 == c_len) {
|
||||||
iter_end_ = true;
|
iter_end_ = true;
|
||||||
|
if (0 == token_len) {
|
||||||
|
ret = OB_ITER_END;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -192,6 +192,25 @@ TEST_F(TestTextAnalyzer, test_basic_english_analyzer)
|
|||||||
const int64_t tokens_len_6[token_cnt_6] = {9, 3, 5, 3};
|
const int64_t tokens_len_6[token_cnt_6] = {9, 3, 5, 3};
|
||||||
const int64_t tokens_freq_6[token_cnt_6] = {3, 2, 1, 1};
|
const int64_t tokens_freq_6[token_cnt_6] = {3, 2, 1, 1};
|
||||||
analyze_test(analyzer, doc_6, doc_len_6, tokens_6, tokens_len_6, tokens_freq_6, token_cnt_6);
|
analyze_test(analyzer, doc_6, doc_len_6, tokens_6, tokens_len_6, tokens_freq_6, token_cnt_6);
|
||||||
|
|
||||||
|
// test invalid character
|
||||||
|
const int64_t doc_len_7 = 128;
|
||||||
|
const char doc_7[doc_len_7] = {(char)0xFF};
|
||||||
|
const int64_t token_cnt_7 = 0;
|
||||||
|
const char *tokens_7[token_cnt_7] = {};
|
||||||
|
const int64_t tokens_len_7[token_cnt_7] = {};
|
||||||
|
const int64_t tokens_freq_7[token_cnt_7] = {};
|
||||||
|
analyze_test(analyzer, doc_7, doc_len_7, tokens_7, tokens_len_7, tokens_freq_7, token_cnt_7);
|
||||||
|
|
||||||
|
// test invalid character in string
|
||||||
|
const int64_t doc_len_8 = 128;
|
||||||
|
const char doc_8[doc_len_8] = {"test invalid character here"};
|
||||||
|
((char *)doc_8)[16] = char(0xFF);
|
||||||
|
const int64_t token_cnt_8 = 3;
|
||||||
|
const char *tokens_8[token_cnt_8] = {"test", "invalid", "cha"};
|
||||||
|
const int64_t tokens_len_8[token_cnt_8] = {4, 7, 3};
|
||||||
|
const int64_t tokens_freq_8[token_cnt_8] = {1, 1, 1};
|
||||||
|
analyze_test(analyzer, doc_8, doc_len_8, tokens_8, tokens_len_8, tokens_freq_8, token_cnt_8);
|
||||||
}
|
}
|
||||||
|
|
||||||
}; // namespace share
|
}; // namespace share
|
||||||
|
|||||||
Reference in New Issue
Block a user