/** * Copyright (c) 2021 OceanBase * OceanBase CE is licensed under Mulan PubL v2. * You can use this software according to the terms and conditions of the Mulan PubL v2. * You may obtain a copy of Mulan PubL v2 at: * http://license.coscl.org.cn/MulanPubL-2.0 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. * See the Mulan PubL v2 for more details. */ #ifndef OCEANBASE_SQL_PARSER_FAST_PARSER_ #define OCEANBASE_SQL_PARSER_FAST_PARSER_ #include "lib/allocator/ob_allocator.h" #include "lib/string/ob_string.h" #include "lib/charset/ob_charset.h" #include "sql/parser/ob_parser_utils.h" #include "sql/parser/ob_char_type.h" #include "sql/parser/parse_malloc.h" namespace oceanbase { namespace sql { struct ObFastParser final { public: static int parse(const common::ObString &stmt, const bool enable_batched_multi_stmt, char *&no_param_sql, int64_t &no_param_sql_len, ParamList *¶m_list, int64_t ¶m_num, common::ObCollationType connection_collation, common::ObIAllocator &allocator, ObSQLMode sql_mode = 0); }; class ObFastParserBase { public: // For performance reasons, virtual functions are not used // So the callback function below must be implemented in a derived class typedef int (ObFastParserBase::*ParseNextTokenFunc) (); typedef int (ObFastParserBase::*ProcessIdfFunc) (bool is_number_begin); explicit ObFastParserBase(common::ObIAllocator &allocator, const common::ObCollationType connection_collation, const bool enable_batched_multi_stmt); ~ObFastParserBase() {} int parse(const common::ObString &stmt, char *&no_param_sql, int64_t &no_param_sql_len, ParamList *¶m_list, int64_t ¶m_num); protected: enum TokenType { INVALID_TOKEN, NORMAL_TOKEN, // token that needs to be kept as it is PARAM_TOKEN, // token that needs to be parameterized IGNORE_TOKEN // token that need to be ignored such as comments }; // In the process of judging the identifer, we need to continuously scan next char // in order to prevent the problem of memory access out of bounds, so we need to write // a lot of judgment logic like the following // if (cur_pos_ < raw_sql_len_ - 1) { // scan(); // do something // } // There are too many such branches, making our code look very ugly. therefore, we use a special // value of -1. when the allowed length is exceeded, peek, scan, reverse_scan, char_at return -1 // this will not affect any correctness issues, and will make the code look better static const char INVALID_CHAR = -1; static const int64_t PARSER_NODE_SIZE = sizeof(ParseNode); static const int64_t FIEXED_PARAM_NODE_SIZE = PARSER_NODE_SIZE + sizeof(ParamList); struct ObRawSql { explicit ObRawSql() : raw_sql_(nullptr), raw_sql_len_(0), cur_pos_(0), search_end_(false) {} inline void init(const char *raw_sql, const int64_t len) { cur_pos_ = 0; raw_sql_ = raw_sql; raw_sql_len_ = len; } inline bool is_search_end() { return search_end_ || cur_pos_ > raw_sql_len_ - 1; } inline char peek() { if (cur_pos_ >= raw_sql_len_ - 1) { return INVALID_CHAR; } return raw_sql_[cur_pos_ + 1]; } inline char scan(const int64_t offset) { if (cur_pos_ + offset >= raw_sql_len_) { search_end_ = true; cur_pos_ = raw_sql_len_; return INVALID_CHAR; } cur_pos_ += offset; return raw_sql_[cur_pos_]; } inline char scan() { return scan(1); } inline char reverse_scan() { if (cur_pos_ <= 0 || cur_pos_ >= raw_sql_len_ + 1) { search_end_ = true; return INVALID_CHAR; } return raw_sql_[--cur_pos_]; } inline char char_at(int64_t idx) { if (idx < 0 || idx >= raw_sql_len_) { return INVALID_CHAR; } return raw_sql_[idx]; } inline const char *ptr(const int64_t pos) { if (pos < 0 || pos >= raw_sql_len_) { return nullptr; } return &(raw_sql_[pos]); } int64_t strncasecmp(int64_t pos, const char *str, const int64_t size); inline int64_t strncasecmp(const char *str, const int64_t size) { return strncasecmp(cur_pos_, str, size); } // Access a character (no bounds check) // char operator[] (const int64_t idx) const { return raw_sql_[idx]; } // For debug common::ObString to_string() const { if (OB_UNLIKELY(nullptr == raw_sql_ || 0 == raw_sql_len_)) { return common::ObString(0, nullptr); } return common::ObString(raw_sql_len_, raw_sql_); } const char *raw_sql_; int64_t raw_sql_len_; int64_t cur_pos_; bool search_end_; }; protected: /** * Check whether it is a specify character and whether it is a multi byte specify * character corresponding to the character set * @param [in] : str the string being retrieved * @param [in] : len the length of the string being retrieved * @param [in] : pos the location to start the retrieved * @param [out] : byte_len byte size of a specify character */ #define DEF_MULTI_BYTE_CHARACTER_CHECK_FUNCS(CHARACTER_NAME) \ inline bool is_multi_byte_##CHARACTER_NAME(const char *str, const int64_t len, \ const int64_t pos, int64_t &byte_len) \ { \ bool bool_ret = false; \ if (pos >= len) { \ return bool_ret; \ } \ if (is_##CHARACTER_NAME(str[pos])) { \ bool_ret = true; \ byte_len = 1; \ } else if (is_oracle_mode_ \ && (CHARSET_UTF8MB4 == charset_type_ || CHARSET_UTF16 == charset_type_)) { \ if (pos + 3 < len && -1 != is_utf8_multi_byte_##CHARACTER_NAME(str, pos)) { \ bool_ret = true; \ byte_len = 3; \ } \ } else if (is_oracle_mode_ \ && (CHARSET_GBK == charset_type_ || CHARSET_GB18030 == charset_type_)) { \ if (pos + 2 < len && -1 != is_gbk_multi_byte_##CHARACTER_NAME(str, pos)) { \ bool_ret = true; \ byte_len = 2; \ } \ } \ return bool_ret; \ } #define DEF_CHARACTER_CHECK_FUNCS(CHARACTER_NAME, SPECIFY_CHARACTER) \ inline bool is_##CHARACTER_NAME(char ch) \ { \ return SPECIFY_CHARACTER == ch; \ } #define DEF_RAW_SQL_MULTI_BYTE_CHARACTER_CHECK(CHARACTER_NAME, pos, byte_len) \ is_multi_byte_##CHARACTER_NAME(raw_sql_.raw_sql_, raw_sql_.raw_sql_len_, pos, byte_len) #define IS_MULTI_SPACE(pos, byte_len) \ DEF_RAW_SQL_MULTI_BYTE_CHARACTER_CHECK(space, pos, byte_len) #define IS_MULTI_COMMA(pos, byte_len) \ DEF_RAW_SQL_MULTI_BYTE_CHARACTER_CHECK(comma, pos, byte_len) #define IS_MULTI_LEFT_PARENTHESIS(pos, byte_len) \ DEF_RAW_SQL_MULTI_BYTE_CHARACTER_CHECK(left_parenthesis, pos, byte_len) #define IS_MULTI_RIGHT_PARENTHESIS(pos, byte_len) \ DEF_RAW_SQL_MULTI_BYTE_CHARACTER_CHECK(right_parenthesis, pos, byte_len) #define CHECK_EQ_STRNCASECMP(str, size) (0 == raw_sql_.strncasecmp(str, size)) DEF_CHARACTER_CHECK_FUNCS(comma, ','); DEF_CHARACTER_CHECK_FUNCS(left_parenthesis, '('); DEF_CHARACTER_CHECK_FUNCS(right_parenthesis, ')'); DEF_MULTI_BYTE_CHARACTER_CHECK_FUNCS(space); DEF_MULTI_BYTE_CHARACTER_CHECK_FUNCS(comma); DEF_MULTI_BYTE_CHARACTER_CHECK_FUNCS(left_parenthesis); DEF_MULTI_BYTE_CHARACTER_CHECK_FUNCS(right_parenthesis); void process_leading_space(); void remove_multi_stmt_end_space(); inline void set_callback_func(ParseNextTokenFunc func1, ProcessIdfFunc func2) { parse_next_token_func_ = func1; process_idf_func_ = func2; } inline bool is_valid_token() { return cur_token_type_ != INVALID_TOKEN; } inline bool is_valid_char(char ch) { return ch != INVALID_CHAR; } // [ \t\n\r\f\v] inline bool is_space(char ch) { return is_valid_char(ch) && SPACE_FLAGS[static_cast(ch)]; } char *parse_strdup_with_replace_multi_byte_char( const char *str, const size_t dup_len, char *out_str, int64_t &out_len); inline bool is_digit(char ch) { return is_valid_char(ch) && DIGIT_FLAGS[static_cast(ch)]; } // [^\n\r] inline bool is_non_newline(char ch) { return is_valid_char(ch) && '\n' != ch && '\r' != ch; } // [0-9a-fA-F] inline bool is_hex(char ch) { return is_valid_char(ch) && HEX_FLAGS[static_cast(ch)]; } // [0-1] inline bool is_binary(char ch) { return '0' == ch || '1' == ch; } // [A-Za-z0-9$-] inline bool is_identifier_char(char ch) { uint8_t ind = static_cast(ch); return is_valid_char(ch) && (is_oracle_mode_ ? ORACLE_IDENTIFIER_FALGS[ind] : MYSQL_IDENTIFIER_FALGS[ind]); } // [A-Za-z_] inline bool is_sys_var_first_char(char ch) { return is_valid_char(ch) && SYS_VAR_FIRST_CHAR[static_cast(ch)]; } // [A-Za-z0-9_] inline bool is_sys_var_char(char ch) { return is_valid_char(ch) && SYS_VAR_CHAR[static_cast(ch)]; } // [`'\"A-Za-z0-9_\.$/%] inline bool is_user_var_char(char ch) { bool is_oracle_user_var = USER_VAR_CHAR[static_cast(ch)] || '/' == ch || '%' == ch || '\"' == ch || '\'' == ch; return is_valid_char(ch) && (is_oracle_mode_ ? is_oracle_user_var : (is_oracle_user_var || '`' == ch)); } // [A-Za-z0-9_\.$] inline bool is_user_var_char_without_quota(char ch) { return is_valid_char(ch) && USER_VAR_CHAR[static_cast(ch)]; } void reset_parser_node(ParseNode *node); // ({U_2}{U}|{U_3}{U}{U}|{U_4}{U}{U}{U} int64_t is_utf8_char(const int64_t pos); // NOTES: No boundary check, the caller guarantees safety!!! // ([\\\xe3\][\\\x80\][\\\x80]) int64_t is_utf8_multi_byte_space(const char *str, const int64_t start_pos); // ([\\\xef\][\\\xbc\][\\\x8c]) int64_t is_utf8_multi_byte_comma(const char *str, const int64_t start_pos); // ([\\\xef\][\\\xbc\][\\\x88]) int64_t is_utf8_multi_byte_left_parenthesis(const char *str, const int64_t start_pos); // ([\\\xef\][\\\xbc\][\\\x89]) int64_t is_utf8_multi_byte_right_parenthesis(const char *str, const int64_t start_pos); // {GB_1}{GB_2} int64_t is_gbk_char(const int64_t pos); // ([\\\xa1][\\\xa1]) int64_t is_gbk_multi_byte_space(const char *str, const int64_t start_pos); // ([\\\xa3][\\\xac]) int64_t is_gbk_multi_byte_comma(const char *str, const int64_t start_pos); // ([\\\xa3][\\\xa8]) int64_t is_gbk_multi_byte_left_parenthesis(const char *str, const int64_t start_pos); // ([\\\xa3][\\\xa9]) int64_t is_gbk_multi_byte_right_parenthesis(const char *str, const int64_t start_pos); // [\x80-\xbf] inline bool is_u(char ch) { return is_valid_char(ch) && static_cast(ch) >= 0x80 && static_cast(ch) <= 0xbf; } // [\xc2-\xdf] inline bool is_u2(char ch) { return is_valid_char(ch) && static_cast(ch) >= 0xc2 && static_cast(ch) <= 0xdf; } // [\xe0-\xef] inline bool is_u3(char ch) { return is_valid_char(ch) && static_cast(ch) >= 0xe0 && static_cast(ch) <= 0xef; } // [\xf0-\xf4] inline bool is_u4(char ch) { return is_valid_char(ch) && static_cast(ch) >= 0xf0 && static_cast(ch) <= 0xf4; } // [\x81-\xfe] inline bool is_gb1(char ch) { return is_valid_char(ch) && static_cast(ch) >= 0x81 && static_cast(ch) <= 0xfe; } // [\x40-\xfe] inline bool is_gb2(char ch) { return is_valid_char(ch) && static_cast(ch) >= 0x40 && static_cast(ch) <= 0xfe; } // [0-9]{n} inline bool is_n_continuous_digits(const char *str, const int64_t pos, const int64_t len, const int64_t n); inline bool is_normal_char(char ch) { return is_valid_char(ch) && (is_oracle_mode_ ? ORACLE_NORMAL_CHAR_FLAGS[static_cast(ch)] : MYSQL_NORMAL_CHAR_FLAGS[static_cast(ch)]); } // [A-Za-z] inline bool is_first_identifier_char(char ch) { return is_valid_char(ch) && ORACLE_FIRST_IDENTIFIER_FLAGS[static_cast(ch)]; } /** * Used to parse [A-Za-z] or {UTF8_GB_CHAR} * @param [in] : pos the position of the first character * Return the next position of the position that meets the condition * and return -1 if it is not satisfied */ int64_t is_first_identifier_flags(const int64_t pos); // Add parameterized nodes int add_bool_type_node(bool is_true); int add_null_type_node(); int add_nowait_type_node(); void lex_store_param(ParseNode *node, char *buf); void append_no_param_sql(); void process_escape_string(char *str_buf, int64_t &str_buf_len); ParseNode *new_node(char *&buf, ObItemType type); char* parse_strndup(const char *str, size_t nbyte, char *buf); int64_t get_question_mark(ObQuestionMarkCtx *ctx, void *malloc_pool, const char *name, const int64_t name_len, char *buf); /** * The hexadecimal number in mysql mode has the following two representations: * x'([0-9A-F])*' or 0x([0-9A-F])+ * @param [in] : when is_quote is true, it means the first one. when "\`" does not appear * as a pair, only an 'x' is reserved */ int process_hex_number(bool is_quote); /** * The binary in mysql mode has the following two representations: * b'([01])*' or 0b([01])+ * @param [in] : when is_quote is true, it means the first one. when "\`" does not appear * as a pair, only an 'b' is reserved */ int process_binary(bool is_quote); int process_hint(); int process_question_mark(); int process_number(bool has_minus); int process_negative(); int process_identifier_begin_with_l(bool &need_process_ws); int process_identifier_begin_with_t(bool &need_process_ws); int process_date_related_type(const char quote, ObItemType item_type); int process_time_relate_type(bool &need_process_ws, ObItemType type = T_INVALID); void process_token(); void process_system_variable(bool is_contain_quote); void parse_integer(ParseNode *node); void process_user_variable(bool is_contain_quote); // Used to process '`' and keep all characters before the next '`' int process_backtick(); // Used to process '\"' and keep all characters before the next '\"' int process_double_quote(); // Until "*/" appears, all characters before it should be ignored int process_comment_content(); /** * Used to check the escape character encountered in the string * Character sets marked with escape_with_backslash_is_dangerous, such as * big5, cp932, gbk, sjis. the escape character (0x5C) may be part of a multi-byte * character and requires special judgment */ void check_real_escape(bool &is_real_escape); /** * Used to parse whitespace * @param [in] : pos the position of the first character * Return the next position of the position that meets the condition * and return -1 if it is not satisfied */ int64_t is_whitespace(int64_t pos); /** * Used to parse ({space}*(\/\*([^+*]|\*+[^*\/])*\*+\/{space}*)*(\/\*\+({space}*hint{space}+)?)) * @param [in] : pos the position of the first character * Return the next position of the position that meets the condition * and return -1 if it is not satisfied */ int64_t is_hint_begin(int64_t pos); /** * Used to parse [A-Za-z0-9$_#] or {UTF8_GB_CHAR} * @param [in] : pos the position of the first character * Return the next position of the position that meets the condition * and return -1 if it is not satisfied */ int64_t is_identifier_flags(const int64_t pos); // \({space}*{int_num}{space}*,{space}*{int_num}{space}*\) int64_t is_2num_second(int64_t pos); // to{space}+(day|hour|minute|second{interval_pricision}?) int64_t is_interval_ds(int64_t pos); // to{space}+(year|month) int64_t is_interval_ym(int64_t pos); /** * Used to parse ({interval_pricision}{space}*|{space}+)to{space}+ * @param [in] : pos the position of the first character * Return the next position of the position that meets the condition * and return -1 if it is not satisfied */ int64_t is_interval_pricision_with_space(int64_t pos); /** * Used to parse {space}*\({space}*{int_num}{space}*\) * @param [in] : pos the position of the first character * Return the next position of the position that meets the condition * and return -1 if it is not satisfied */ int64_t is_interval_pricision(int64_t pos); /** * Used to parse {space}*{int_num}{space}* * @param [in] : pos the position of the first character * Return the next position of the position that meets the condition * and return -1 if it is not satisfied */ int64_t is_digit_with_space(int64_t pos); /** * Used to parse the following interval-related tokens compatible with oracle * Interval{whitespace}?'[^']*'{space}*(year|month){interval_pricision}? * Interval{whitespace}?'[^']*'{space}*(year|month)({interval_pricision}{space}*| * {space}+)to{space}+(year|month) * Interval{whitespace}?'[^']*'{space}*second{space}*\({space}*{int_num}{space}*, * {space}*{int_num}{space}*\) * Interval{whitespace}?'[^']*'{space}*(day|hour|minute|second){interval_pricision}? * Interval{whitespace}?'[^']*'{space}*(day|hour|minute|second)({interval_pricision}{space}*| * {space}+)to{space}+(day|hour|minute|second{interval_pricision}?) */ int process_interval(); protected: ObRawSql raw_sql_; char *no_param_sql_; int64_t no_param_sql_len_; int param_num_; bool is_oracle_mode_; bool is_batched_multi_stmt_split_on_; bool is_mysql_compatible_comment_; int64_t cur_token_begin_pos_; int64_t copy_begin_pos_; int64_t copy_end_pos_; char *tmp_buf_; int64_t tmp_buf_len_; int64_t last_well_formed_len_; ParamList *param_node_list_; ParamList *tail_param_node_; TokenType cur_token_type_; ObQuestionMarkCtx question_mark_ctx_; common::ObIAllocator &allocator_; common::ObCharsetType charset_type_; const ObCharsetInfo *charset_info_; ParseNextTokenFunc parse_next_token_func_; ProcessIdfFunc process_idf_func_; private: DISALLOW_COPY_AND_ASSIGN(ObFastParserBase); }; class ObFastParserMysql final : public ObFastParserBase { public: explicit ObFastParserMysql( common::ObIAllocator &allocator, const common::ObCollationType connection_collation, const bool enable_batched_multi_stmt, ObSQLMode sql_mode) : ObFastParserBase(allocator, connection_collation, enable_batched_multi_stmt),sql_mode_(sql_mode) { is_oracle_mode_ = false; set_callback_func( static_cast(&ObFastParserMysql::parse_next_token), static_cast(&ObFastParserMysql::process_identifier)); } ~ObFastParserMysql() {} private: ObSQLMode sql_mode_; int parse_next_token(); int process_identifier(bool is_number_begin); /** * In case of two adjacent string literal, such as " 'a' 'b' ", the two string will be * concatenate into 'ab'. However, the string 'a' will used as the column name if it appears * in the select list, which means we must save it rather than just skipping the 'sqnewline'. * so, we remember the first string as a child of the 'T_VARCHAR' node which represents * " 'a' 'b' ", whose str_value_ is 'ab'. This will save us from modifying our grammar and a * a lot of troubles. */ int process_string(const char quote); int process_zero_identifier(); int process_ps_statement(); int process_identifier_begin_with_n(); private: DISALLOW_COPY_AND_ASSIGN(ObFastParserMysql); }; class ObFastParserOracle final : public ObFastParserBase { public: explicit ObFastParserOracle( common::ObIAllocator &allocator, const common::ObCollationType connection_collation, const bool enable_batched_multi_stmt) : ObFastParserBase(allocator, connection_collation, enable_batched_multi_stmt) { is_oracle_mode_ = true; set_callback_func( static_cast(&ObFastParserOracle::parse_next_token), static_cast(&ObFastParserOracle::process_identifier)); } ~ObFastParserOracle() {} private: int parse_next_token(); int process_identifier(bool is_number_begin); /** * @param [in] : if in_q_quote is true, means that the current token * starts with ("N"|"n")?("Q"|"q"){sqbegin} * else, means that the current token starts with ("N"|"n")?{sqbegin } */ int process_string(const bool in_q_quote); int process_ps_statement(); int process_identifier_begin_with_n(); private: DISALLOW_COPY_AND_ASSIGN(ObFastParserOracle); }; } // end namespace sql } // end namespace oceanbase #endif /* OCEANBASE_SQL_PARSER_FAST_PARSER_ */