statement_digest、statement_digest_text等处理latin1字符有乱码，报错1064

2023-12-29 03:18:10 +00:00
parent 0483e0f943
commit b6e1b48a33
3 changed files with 25 additions and 6 deletions
--- a/src/sql/parser/ob_fast_parser.cpp
+++ b/src/sql/parser/ob_fast_parser.cpp
@ -449,6 +449,8 @@ inline int64_t ObFastParserBase::is_identifier_flags(const int64_t pos)
    // Most of the time, if it is not an identifier character, it maybe a space,
    // comma, opening parenthesis, or closing parenthesis. This judgment logic is
    // added here to avoid the next judgment whether it is utf8 char or gbk char
  } else if (!is_oracle_mode_) {
    idf_pos = notascii_gb_char(pos);
  } else if (CHARSET_UTF8MB4 == charset_type_ || CHARSET_UTF16 == charset_type_) {
    idf_pos = is_utf8_char(pos);
  } else if (ObCharset::is_gb_charset(charset_type_)) {
@ -921,6 +923,17 @@ int ObFastParserBase::get_one_insert_row_str(ObRawSql &raw_sql,
  return ret;
 }
 inline int64_t ObFastParserBase::notascii_gb_char(const int64_t pos)
 {
  // A byte accepted by notascii() is treated as a one-byte identifier
  // character, so the position right after it (pos + 1) is returned.
  // Every other byte is handed to is_gbk_char(), whose result is returned
  // unchanged.
  // NOTE(review): is_gbk_char() is not visible here; if it requires a lead
  // byte >= 0x81, this fallback can only fire when notascii() rejects such a
  // byte through is_valid_char() -- confirm whether it is ever reached.
  return notascii(raw_sql_.char_at(pos)) ? (pos + 1) : is_gbk_char(pos);
 }
 inline int64_t ObFastParserBase::is_latin1_char(const int64_t pos)
 {
  int64_t idf_pos = -1;
@ -1672,6 +1685,8 @@ inline int64_t ObFastParserBase::is_first_identifier_flags(const int64_t pos)
    // Most of the time, if it is not an identifier character, it maybe a space,
    // comma, opening parenthesis, or closing parenthesis. This judgment logic is
    // added here to avoid the next judgment whether it is utf8 char or gbk char
  } else if (!is_oracle_mode_) {
    idf_pos = notascii_gb_char(pos);
  } else if (CHARSET_UTF8MB4 == charset_type_ || CHARSET_UTF16 == charset_type_) {
    idf_pos = is_utf8_char(pos);
  } else if (ObCharset::is_gb_charset(charset_type_)) {
--- a/src/sql/parser/ob_fast_parser.h
+++ b/src/sql/parser/ob_fast_parser.h
@ -381,6 +381,7 @@ protected:
 		return is_valid_char(ch) && USER_VAR_CHAR[static_cast<uint8_t>(ch)];
 	}
 	void reset_parser_node(ParseNode *node);
 	// Returns pos + 1 when notascii() accepts the byte at pos; otherwise
 	// returns the result of is_gbk_char(pos).
 	int64_t notascii_gb_char(const int64_t pos);
 	//{U}
 	int64_t is_latin1_char(const int64_t pos);
 	// ({U_2}{U}|{U_3}{U}{U}|{U_4}{U}{U}{U}
@ -440,6 +441,11 @@ protected:
 		return is_valid_char(ch) &&
 		static_cast<uint8_t>(ch) >= 0x40 && static_cast<uint8_t>(ch) <= 0xfe;
 	}
 	// Reports whether ch passes is_valid_char() and its unsigned byte value
 	// lies in 0x80..0xFF, i.e. outside the 7-bit ASCII range.
 	inline bool notascii(char ch)
 	{
 		const uint8_t byte = static_cast<uint8_t>(ch);
 		// A uint8_t can never exceed 0xFF, so only the lower bound needs testing.
 		return is_valid_char(ch) && byte >= 0x80;
 	}
    inline bool is_latin1(char ch)
 	{
 		return is_valid_char(ch) &&
--- a/src/sql/parser/sql_parser_mysql_mode.l
+++ b/src/sql/parser/sql_parser_mysql_mode.l
@ -38,14 +38,12 @@ extern void obsql_mysql_parser_fatal_error(int32_t errcoyyde, yyscan_t yyscanner
 /* the adq is used to process dq in ANSI_QUOTES sql_mode*/
 %x adq
-U  [\x80-\xbf]
+NOTASCII [\x80-\xFF]
 U_2  [\xc2-\xdf]
 U_3  [\xe0-\xef]
 U_4  [\xf0-\xf4]
 GB_1 [\x81-\xfe]
 GB_2 [\x40-\xfe]
 GB_3 [\x30-\x39]
-UTF8_GB_CHAR ({U_2}{U}|{U_3}{U}{U}|{U_4}{U}{U}{U}|{GB_1}{GB_2}|{GB_1}{GB_3}{GB_1}{GB_3})
+NOTASCII_GB_CHAR ({NOTASCII}|{GB_1}{GB_2}|{GB_1}{GB_3}{GB_1}{GB_3})
 space            [ \t\n\r\f]
 non_newline      [^\n\r]
 sql_comment      ("--"[ \t]+{non_newline}*)|(#{non_newline}*|"--"[\n\r])
@ -62,7 +60,7 @@ common_hint_begin (\/\*\+({space}*hint{space}+)?)
 c_cmt_begin      \/\*
 c_cmt_end        \*+\/
 comment          ({sql_comment})
-identifier        (([A-Za-z0-9$_]|{UTF8_GB_CHAR})+)
+identifier        (([A-Za-z0-9$_]|{NOTASCII_GB_CHAR})+)
 system_variable  (@@[A-Za-z_][A-Za-z0-9_]*)|(@@[`][`A-Za-z_][`A-Za-z_]*)
 user_variable    (@[A-Za-z0-9_\.$]*)|(@[`'\"][`'\"A-Za-z0-9_\.$/%]*)
 version_num      ([0-9]+\.+[0-9]*)