[bugfix] fix invalid character in tokenization

2024-04-26 12:46:07 +00:00
parent af51225977
commit fb6f777b86
2 changed files with 23 additions and 1 deletion
--- a/src/share/text_analysis/ob_token_stream.cpp
+++ b/src/share/text_analysis/ob_token_stream.cpp
@@ -116,8 +116,11 @@ int ObTextWhitespaceTokenizer::get_next(ObDatum &next_token, int64_t &token_freq
      const int64_t c_len = ob_mbcharlen_ptr(cs_, doc + trav_pos_, doc + doc_len);
      trav_pos_ += c_len;
      token_len += c_len;
-      if (trav_pos_ >= doc_len) {
+      if (trav_pos_ >= doc_len || 0 == c_len) {
        iter_end_ = true;
+        if (0 == token_len) {
+          ret = OB_ITER_END;
+        }
        break;
      }
    }