# Review annotations for this patch. Everything in this preamble sits ahead of
# the first "diff --git" header and belongs to no hunk.
#
# Purpose: in the non-Oracle path of the fast parser and in the MySQL-mode
# lexer, treat any byte matched by the new NOTASCII / notascii() rule as an
# identifier character, replacing the UTF-8 sequence rules in the lexer.
#
# src/sql/parser/ob_fast_parser.cpp
#   * is_identifier_flags() and is_first_identifier_flags(): a new
#     `else if (!is_oracle_mode_)` branch is inserted ahead of the existing
#     charset_type_ branches. When it is taken, idf_pos is assigned from
#     notascii_gb_char(pos) and the UTF-8 / GB charset branches that follow in
#     the same else-if chain are skipped.
#   * New ObFastParserBase::notascii_gb_char(pos): returns pos + 1 when
#     notascii(raw_sql_.char_at(pos)) is true, otherwise returns whatever
#     is_gbk_char(pos) returns.
#
# src/sql/parser/ob_fast_parser.h
#   * Declares notascii_gb_char(const int64_t pos).
#   * Adds inline notascii(char ch): true when is_valid_char(ch) holds and the
#     cast value of ch satisfies `>= 0x80 && <= 0xFF`.
#
# src/sql/parser/sql_parser_mysql_mode.l
#   * Removes the U, U_2, U_3, U_4 and UTF8_GB_CHAR definitions.
#   * Adds NOTASCII [\x80-\xFF] and
#     NOTASCII_GB_CHAR ({NOTASCII}|{GB_1}{GB_2}|{GB_1}{GB_3}{GB_1}{GB_3}).
#   * `identifier` now references NOTASCII_GB_CHAR instead of UTF8_GB_CHAR.
#
# NOTE(review): the hunks below are collapsed onto three physical lines and
#   `static_cast(ch)` appears without a template argument; presumably line
#   breaks and angle-bracketed text were lost in extraction. Regenerate the
#   patch from the original commit before trying to apply it.
# NOTE(review): if the cast in notascii() targets an unsigned 8-bit type, the
#   `<= 0xFF` bound is always true and only `>= 0x80` matters. TODO confirm.
# NOTE(review): notascii_gb_char() reaches is_gbk_char(pos) only when the byte
#   at pos fails notascii(). is_gbk_char() is not visible here; if it requires
#   a lead byte in the GB_1 range [\x81-\xfe], that else branch can never
#   return a match. Verify against its definition.
# NOTE(review): the lexer alternative {GB_1}{GB_2} accepts a second byte in
#   0x40-0x7f, whereas notascii_gb_char() advances a single byte whenever the
#   lead byte is >= 0x80. Presumably the two tokenizers can disagree on such
#   byte pairs; confirm with a GBK identifier whose trail byte is e.g. 0x5c.
diff --git a/src/sql/parser/ob_fast_parser.cpp b/src/sql/parser/ob_fast_parser.cpp index 8217c7593d..b080bb6c97 100644 --- a/src/sql/parser/ob_fast_parser.cpp +++ b/src/sql/parser/ob_fast_parser.cpp @@ -449,6 +449,8 @@ inline int64_t ObFastParserBase::is_identifier_flags(const int64_t pos) // Most of the time, if it is not an identifier character, it maybe a space, // comma, opening parenthesis, or closing parenthesis. This judgment logic is // added here to avoid the next judgment whether it is utf8 char or gbk char + } else if (!is_oracle_mode_) { + idf_pos = notascii_gb_char(pos); } else if (CHARSET_UTF8MB4 == charset_type_ || CHARSET_UTF16 == charset_type_) { idf_pos = is_utf8_char(pos); } else if (ObCharset::is_gb_charset(charset_type_)) { @@ -921,6 +923,17 @@ int ObFastParserBase::get_one_insert_row_str(ObRawSql &raw_sql, return ret; } +inline int64_t ObFastParserBase::notascii_gb_char(const int64_t pos) +{ + int64_t idf_pos = -1; + if (notascii(raw_sql_.char_at(pos))) { + idf_pos = pos + 1; + } else { + idf_pos = is_gbk_char(pos); + } + return idf_pos; +} + inline int64_t ObFastParserBase::is_latin1_char(const int64_t pos) { int64_t idf_pos = -1; @@ -1672,6 +1685,8 @@ inline int64_t ObFastParserBase::is_first_identifier_flags(const int64_t pos) // Most of the time, if it is not an identifier character, it maybe a space, // comma, opening parenthesis, or closing parenthesis. 
This judgment logic is // added here to avoid the next judgment whether it is utf8 char or gbk char + } else if (!is_oracle_mode_) { + idf_pos = notascii_gb_char(pos); } else if (CHARSET_UTF8MB4 == charset_type_ || CHARSET_UTF16 == charset_type_) { idf_pos = is_utf8_char(pos); } else if (ObCharset::is_gb_charset(charset_type_)) { diff --git a/src/sql/parser/ob_fast_parser.h b/src/sql/parser/ob_fast_parser.h index 10c4da67ab..e89a6399ab 100644 --- a/src/sql/parser/ob_fast_parser.h +++ b/src/sql/parser/ob_fast_parser.h @@ -381,6 +381,7 @@ protected: return is_valid_char(ch) && USER_VAR_CHAR[static_cast(ch)]; } void reset_parser_node(ParseNode *node); + int64_t notascii_gb_char(const int64_t pos); //{U} int64_t is_latin1_char(const int64_t pos); // ({U_2}{U}|{U_3}{U}{U}|{U_4}{U}{U}{U} @@ -440,6 +441,11 @@ protected: return is_valid_char(ch) && static_cast(ch) >= 0x40 && static_cast(ch) <= 0xfe; } + inline bool notascii(char ch) + { + return is_valid_char(ch) && + (static_cast(ch) >= 0x80 && static_cast(ch) <= 0xFF); + } inline bool is_latin1(char ch) { return is_valid_char(ch) && diff --git a/src/sql/parser/sql_parser_mysql_mode.l b/src/sql/parser/sql_parser_mysql_mode.l index 4898d325d5..ae764543d5 100644 --- a/src/sql/parser/sql_parser_mysql_mode.l +++ b/src/sql/parser/sql_parser_mysql_mode.l @@ -38,14 +38,12 @@ extern void obsql_mysql_parser_fatal_error(int32_t errcoyyde, yyscan_t yyscanner /* the adq is used to process dq in ANSI_QUOTES sql_mode*/ %x adq -U [\x80-\xbf] -U_2 [\xc2-\xdf] -U_3 [\xe0-\xef] -U_4 [\xf0-\xf4] +NOTASCII [\x80-\xFF] GB_1 [\x81-\xfe] GB_2 [\x40-\xfe] GB_3 [\x30-\x39] -UTF8_GB_CHAR ({U_2}{U}|{U_3}{U}{U}|{U_4}{U}{U}{U}|{GB_1}{GB_2}|{GB_1}{GB_3}{GB_1}{GB_3}) +NOTASCII_GB_CHAR ({NOTASCII}|{GB_1}{GB_2}|{GB_1}{GB_3}{GB_1}{GB_3}) + space [ \t\n\r\f] non_newline [^\n\r] sql_comment ("--"[ \t]+{non_newline}*)|(#{non_newline}*|"--"[\n\r]) @@ -62,7 +60,7 @@ common_hint_begin (\/\*\+({space}*hint{space}+)?) 
c_cmt_begin \/\* c_cmt_end \*+\/ comment ({sql_comment}) -identifier (([A-Za-z0-9$_]|{UTF8_GB_CHAR})+) +identifier (([A-Za-z0-9$_]|{NOTASCII_GB_CHAR})+) system_variable (@@[A-Za-z_][A-Za-z0-9_]*)|(@@[`][`A-Za-z_][`A-Za-z_]*) user_variable (@[A-Za-z0-9_\.$]*)|(@[`'\"][`'\"A-Za-z0-9_\.$/%]*) version_num ([0-9]+\.+[0-9]*)