From ad5b24431387e6fcca38c0c963a2fcdaf6ee9c98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=A4kel=C3=A4?= Date: Mon, 10 Dec 2018 23:53:44 +0200 Subject: [PATCH] Optimize canonicalization code Switched from default character type functions to ones that use lookup tables. Eliminated the internal state and replaced with in-place iteration of the query. Added code to allow single-lookup detection of normal characters. --- server/core/modutil.cc | 300 ++++++++++-------- .../core/test/canonical_tests/alter.expected | 2 +- 2 files changed, 167 insertions(+), 135 deletions(-) diff --git a/server/core/modutil.cc b/server/core/modutil.cc index b1eaecb8c..4a77984b6 100644 --- a/server/core/modutil.cc +++ b/server/core/modutil.cc @@ -18,8 +18,11 @@ #include #include +#include #include #include +#include +#include #include #include @@ -1261,11 +1264,54 @@ static inline bool is_next(mxs::Buffer::iterator it, mxs::Buffer::iterator end, return true; } + +// Class for fast char type lookups +class LUT +{ +public: + LUT(const std::string& values) + { + for (const auto& a : values) + { + m_table[(uint8_t)a] = true; + } + } + + LUT(std::function is_type) + { + for (int i = 0; i <= std::numeric_limits::max(); i++) + { + m_table[i] = is_type(i); + } + } + + inline bool operator()(uint8_t c) const + { + return m_table[c]; + } + +private: + std::array m_table = {}; +}; + +// Optimized versions of standard functions that ignore the locale and use a lookup table +static const LUT is_space(::isspace); +static const LUT is_digit(::isdigit); +static const LUT is_alpha(::isalpha); +static const LUT is_alnum(::isalnum); +static const LUT is_xdigit(::isxdigit); + +// For detection of characters that need special treatment, helps speed up processing of keywords etc. +static const LUT is_special([](uint8_t c) { + return isdigit(c) || isspace(c) || std::string("\"'`#-/\\").find( + c) != std::string::npos; + }); + static std::pair probe_number(mxs::Buffer::iterator it, mxs::Buffer::iterator end) { mxb_assert(it != end); - mxb_assert(isdigit(*it)); + mxb_assert(is_digit(*it)); std::pair rval = std::make_pair(true, it); bool is_hex = *it == '0'; bool allow_hex = false; @@ -1275,7 +1321,7 @@ static std::pair probe_number(mxs::Buffer::iterator while (it != end) { - if (isdigit(*it) || (allow_hex && isxdigit(*it))) + if (is_digit(*it) || (allow_hex && is_xdigit(*it))) { // Digit or hex-digit, skip it } @@ -1295,7 +1341,7 @@ static std::pair probe_number(mxs::Buffer::iterator // Possible scientific notation number auto next_it = std::next(it); - if (next_it == end || (!isdigit(*next_it) && *next_it != '-')) + if (next_it == end || (!is_digit(*next_it) && *next_it != '-')) { rval.first = false; break; @@ -1312,19 +1358,19 @@ static std::pair probe_number(mxs::Buffer::iterator // Possible decimal number auto next_it = std::next(it); - if (next_it != end && !isdigit(*next_it)) + if (next_it != end && !is_digit(*next_it)) { /** No number after the period, not a decimal number. * The fractional part of the number is optional in MariaDB. */ rval.first = false; break; } - mxb_assert(isdigit(*next_it)); + mxb_assert(is_digit(*next_it)); } else { // If we have a non-text character, we treat it as a number - rval.first = !isalpha(*it); + rval.first = !is_alpha(*it); break; } } @@ -1337,18 +1383,18 @@ static std::pair probe_number(mxs::Buffer::iterator return rval; } -bool is_negation(const std::string& str) +static inline bool is_negation(const std::string& str) { bool rval = false; - if (!str.empty() && str[str.size() - 1] == '-') + if (!str.empty() && str.back() == '-') { // Possibly a negative number rval = true; for (auto it = std::next(str.rbegin()); it != str.rend(); it++) { - if (!isspace(*it)) + if (!is_space(*it)) { /** If we find a previously converted value, we know that it * is not a negation but a subtraction. */ @@ -1361,6 +1407,26 @@ bool is_negation(const std::string& str) return rval; } +mxs::Buffer::iterator find_char(mxs::Buffer::iterator it, const mxs::Buffer::iterator& end, char c) +{ + for (; it != end; ++it) + { + if (*it == '\\') + { + if (++it == end) + { + break; + } + } + else if (*it == c) + { + return it; + } + } + + return it; +} + namespace maxscale { @@ -1370,142 +1436,108 @@ std::string get_canonical(GWBUF* querybuf) rval.reserve(gwbuf_length(querybuf) - MYSQL_HEADER_LEN + 1); mxs::Buffer buf(querybuf); - enum state - { - NONE, - SINGLE_QUOTE, - DOUBLE_QUOTE, - BACKTICK, - UNTIL_NEWLINE, - INLINE_COMMENT - } my_state = NONE; - for (auto it = std::next(buf.begin(), MYSQL_HEADER_LEN + 1); // Skip packet header and command it != buf.end(); ++it) { - if (*it == '\\') + if (!is_special(*it)) + { + // Normal character, no special handling required + rval += *it; + } + else if (*it == '\\') { // Jump over any escaped values - rval += *it; - it++; - rval += *it; - continue; - } + rval += *it++; - char prev = rval.empty() ? ' ' : rval[rval.size() - 1]; - - switch (my_state) - { - case BACKTICK: - rval += *it; - if (*it == '`') + if (it != buf.end()) { - my_state = NONE; - } - break; - - case SINGLE_QUOTE: - if (*it == '\'') - { - rval += '?'; - my_state = NONE; - } - break; - - case DOUBLE_QUOTE: - if (*it == '"') - { - rval += '?'; - my_state = NONE; - } - break; - - case INLINE_COMMENT: - if (is_next(it, buf.end(), "*/")) - { - // Comment end marker, return to normal parsing - ++it; - my_state = NONE; - } - break; - - case UNTIL_NEWLINE: - if (is_next(it, buf.end(), "\r\n")) - { - ++it; - my_state = NONE; - } - else if (is_next(it, buf.end(), "\n") || is_next(it, buf.end(), "\r")) - { - my_state = NONE; - } - break; - - default: - if (isspace(*it)) - { - if (isspace(prev)) - { - // Repeating space, skip it - continue; - } - *it = ' '; - } - else if (is_next(it, buf.end(), "/*")) - { - auto comment_start = std::next(it, 2); - if (comment_start != buf.end() - && *comment_start != '!' - && *comment_start != 'M') - { - // Non-executable comment - my_state = INLINE_COMMENT; - continue; - } - } - else if (is_next(it, buf.end(), "# ") || is_next(it, buf.end(), "-- ")) - { - // End-of-line comment, jump to the next line if one exists - my_state = UNTIL_NEWLINE; - continue; - } - else if (isdigit(*it) && !isalpha(prev) && !isdigit(prev) && prev != '_') - { - auto num_end = probe_number(it, buf.end()); - - if (num_end.first) - { - if (is_negation(rval)) - { - // Remove the sign - rval.resize(rval.size() - 1); - } - rval += '?'; - it = num_end.second; - continue; - } - } - - switch (*it) - { - case '\'': - my_state = SINGLE_QUOTE; - break; - - case '"': - my_state = DOUBLE_QUOTE; - break; - - case '`': - my_state = BACKTICK; - - /* falls through */ - default: rval += *it; + } + } + else if (is_space(*it) && (rval.empty() || is_space(rval.back()))) + { + // Repeating space, skip it + } + else if (*it == '/' && is_next(it, buf.end(), "/*")) + { + auto comment_start = std::next(it, 2); + if (comment_start != buf.end() && *comment_start != '!' && *comment_start != 'M') + { + // Non-executable comment + while (it != buf.end()) + { + if (is_next(it, buf.end(), "*/")) + { + // Comment end marker, return to normal parsing + ++it; + break; + } + ++it; + } + } + else + { + // Executable comment, treat it as normal SQL + rval += *it; + } + } + else if ((*it == '#' || *it == '-') + && (is_next(it, buf.end(), "# ") || is_next(it, buf.end(), "-- "))) + { + // End-of-line comment, jump to the next line if one exists + while (it != buf.end()) + { + if (*it == '\n') + { + break; + } + else if (*it == '\r') + { + if ((is_next(it, buf.end(), "\r\n"))) + { + ++it; + } + break; + } + + ++it; + } + if (it == buf.end()) + { break; } + } + else if (is_digit(*it) && (rval.empty() || (!is_alnum(rval.back()) && rval.back() != '_'))) + { + auto num_end = probe_number(it, buf.end()); - break; + if (num_end.first) + { + if (is_negation(rval)) + { + // Remove the sign + rval.resize(rval.size() - 1); + } + rval += '?'; + it = num_end.second; + } + } + else if (*it == '\'' || *it == '"') + { + char c = *it; + it = find_char(std::next(it), buf.end(), c); + rval += '?'; + } + else if (*it == '`') + { + auto start = it; + it = find_char(std::next(it), buf.end(), '`'); + rval.append(start, it); + rval += '`'; + } + else + { + rval += *it; } } diff --git a/server/core/test/canonical_tests/alter.expected b/server/core/test/canonical_tests/alter.expected index 6b8c1a62e..63512437c 100644 --- a/server/core/test/canonical_tests/alter.expected +++ b/server/core/test/canonical_tests/alter.expected @@ -9,7 +9,7 @@ ALTER EVENT e1 DO SELECT ?; ALTER EVENT e1 ON SCHEDULE AT ? ON COMPLETION PRESERVE DISABLE; ALTER TABLE `@0023sql1` RENAME `#sql-1`; ALTER TABLE t1 ADD INDEX (c13) COMMENT ?; -ALTER TABLE t1 ADD PARTITION IF NOT EXISTS(PARTITION `p5` VALUES LESS THAN (?)COMMENT \'?); +ALTER TABLE t1 ADD PARTITION IF NOT EXISTS(PARTITION `p5` VALUES LESS THAN (?)COMMENT ?); ALTER TABLE `t1` ADD PRIMARY KEY (`a`); alter table t1 change a a enum(?,?,?,?,?,?,?,?) character set utf16; alter table t1 change a a int `FKEY1`=?;