[bugfix] fix invalid character in tokenization
This commit is contained in:
		| @ -116,8 +116,11 @@ int ObTextWhitespaceTokenizer::get_next(ObDatum &next_token, int64_t &token_freq | |||||||
|       const int64_t c_len = ob_mbcharlen_ptr(cs_, doc + trav_pos_, doc + doc_len); |       const int64_t c_len = ob_mbcharlen_ptr(cs_, doc + trav_pos_, doc + doc_len); | ||||||
|       trav_pos_ += c_len; |       trav_pos_ += c_len; | ||||||
|       token_len += c_len; |       token_len += c_len; | ||||||
|       if (trav_pos_ >= doc_len) { |       if (trav_pos_ >= doc_len || 0 == c_len) { | ||||||
|         iter_end_ = true; |         iter_end_ = true; | ||||||
|  |         if (0 == token_len) { | ||||||
|  |           ret = OB_ITER_END; | ||||||
|  |         } | ||||||
|         break; |         break; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  | |||||||
| @ -192,6 +192,25 @@ TEST_F(TestTextAnalyzer, test_basic_english_analyzer) | |||||||
|   const int64_t tokens_len_6[token_cnt_6] = {9, 3, 5, 3}; |   const int64_t tokens_len_6[token_cnt_6] = {9, 3, 5, 3}; | ||||||
|   const int64_t tokens_freq_6[token_cnt_6] = {3, 2, 1, 1}; |   const int64_t tokens_freq_6[token_cnt_6] = {3, 2, 1, 1}; | ||||||
|   analyze_test(analyzer, doc_6, doc_len_6, tokens_6, tokens_len_6, tokens_freq_6, token_cnt_6); |   analyze_test(analyzer, doc_6, doc_len_6, tokens_6, tokens_len_6, tokens_freq_6, token_cnt_6); | ||||||
|  |  | ||||||
|  |   // test invalid character | ||||||
|  |   const int64_t doc_len_7 = 128; | ||||||
|  |   const char doc_7[doc_len_7] = {(char)0xFF}; | ||||||
|  |   const int64_t token_cnt_7 = 0; | ||||||
|  |   const char *tokens_7[token_cnt_7] = {}; | ||||||
|  |   const int64_t tokens_len_7[token_cnt_7] = {}; | ||||||
|  |   const int64_t tokens_freq_7[token_cnt_7] = {}; | ||||||
|  |   analyze_test(analyzer, doc_7, doc_len_7, tokens_7, tokens_len_7, tokens_freq_7, token_cnt_7); | ||||||
|  |  | ||||||
|  |   // test invalid character in string | ||||||
|  |   const int64_t doc_len_8 = 128; | ||||||
|  |   const char doc_8[doc_len_8] = {"test invalid character here"}; | ||||||
|  |   ((char *)doc_8)[16] = char(0xFF); | ||||||
|  |   const int64_t token_cnt_8 = 3; | ||||||
|  |   const char *tokens_8[token_cnt_8] = {"test", "invalid", "cha"}; | ||||||
|  |   const int64_t tokens_len_8[token_cnt_8] = {4, 7, 3}; | ||||||
|  |   const int64_t tokens_freq_8[token_cnt_8] = {1, 1, 1}; | ||||||
|  |   analyze_test(analyzer, doc_8, doc_len_8, tokens_8, tokens_len_8, tokens_freq_8, token_cnt_8); | ||||||
| } | } | ||||||
|  |  | ||||||
| }; // namespace share | }; // namespace share | ||||||
|  | |||||||
		Reference in New Issue
	
	Block a user
	 saltonz
					saltonz