MXS-2732 Recognize character set names

The tokenizer now recognizes the character set names of MariaDB and returns a specific token for them. However, where a character set name is not expected, it is automatically treated as an identifier. Note that when a character set name is explicitly specified for a string literal, the name must be prefixed with an underscore. That is, if the character set name is "latin1", a string literal using it is written as "_latin1 'a'". Note that this does not fix the sqlite3 bug causing a leak, but since the statement is now parsed correctly, the leak will no longer manifest itself.
2019-10-31 12:46:49 +02:00
parent 177d95c3bc
commit 6cba7e8201
2 changed files with 104 additions and 2 deletions
--- a/query_classifier/qc_sqlite/sqlite-src-3110100/src/parse.y
+++ b/query_classifier/qc_sqlite/sqlite-src-3110100/src/parse.y
@ -620,7 +620,7 @@ columnid(A) ::= nm(X). {
  // TODO: BINARY is a reserved word and should not automatically convert into an identifer.
  // TODO: However, if not here then rules such as CAST need to be modified.
  BINARY
-  /*CASCADE*/ CAST CLOSE COLUMNKW COLUMNS COMMENT CONCURRENT /*CONFLICT*/ CONNECTION
+  /*CASCADE*/ CAST CHARSET_NAME_KW CLOSE COLUMNKW COLUMNS COMMENT CONCURRENT /*CONFLICT*/ CONNECTION
  DATA DATABASE DEALLOCATE DEFERRED /*DESC*/ /*DETACH*/ DUMPFILE
  /*EACH*/ END ENGINE ENUM EXCLUSIVE /*EXPLAIN*/ EXTENDED
  FIELDS FIRST FLUSH /*FOR*/ FORMAT
@ -1907,6 +1907,7 @@ expr(A) ::= nm(X) DOT nm(Y) DOT nm(Z). {
 }
 term(A) ::= INTEGER|FLOAT|BLOB(X).  {spanExpr(&A, pParse, @X, &X);}
 term(A) ::= STRING(X).              {spanExpr(&A, pParse, @X, &X);}
+term(A) ::= CHARSET_NAME_KW(X) STRING(Y). {spanExpr(&A, pParse, @X, &Y);}
 expr(A) ::= VARIABLE(X).     {
  if( X.n>=2 && X.z[0]=='#' && sqlite3Isdigit(X.z[1]) ){
    /* When doing a nested parse, one can include terms in an expression
@ -1926,7 +1927,7 @@ expr(A) ::= VARIABLE(X).     {
  spanSet(&A, &X, &X);
 }
 %ifdef MAXSCALE
-expr(A) ::= id(X) INTEGER(Y). {
+expr(A) ::= CHARSET_NAME_KW(X) INTEGER(Y). {
  // The sole purpose of this is to interpret something like '_utf8mb4 0xD091D092D093'
  // as a string. It does not matter that any identifier followed by an integer will
  // be interpreted as a string, as invalid usage will be caught by the server.
--- a/query_classifier/qc_sqlite/sqlite-src-3110100/src/tokenize.c
+++ b/query_classifier/qc_sqlite/sqlite-src-3110100/src/tokenize.c
@ -199,6 +199,91 @@ int sqlite3IsIdChar(u8 c){ return IdChar(c); }
 */
 #ifdef MAXSCALE
 extern int maxscaleComment();
+
+// A character set name paired with its length. Carrying the length
+// explicitly lets an entry describe text identified by pointer and
+// length, as is done for the lookup key built from tokenizer input.
+struct mxs_charset_entry
+{
+    const char* name; // Start of the character set name.
+    size_t      len;  // Number of characters of name that belong to the entry.
+};
+
+// Character set names of MariaDB, each stored together with its length.
+//
+// NOTE: MUST be kept in alphabetical order, as the table is searched
+//       with bsearch().
+//
+// The length of each entry is derived from the string literal itself,
+// so the name and its length cannot get out of sync.
+#define MXS_CHARSET_ENTRY(s) { s, sizeof(s) - 1 }
+
+const struct mxs_charset_entry mxs_charset_names[] =
+{
+    MXS_CHARSET_ENTRY("armscii8"),
+    MXS_CHARSET_ENTRY("ascii"),
+    MXS_CHARSET_ENTRY("big5"),
+    MXS_CHARSET_ENTRY("binary"),
+    MXS_CHARSET_ENTRY("cp1250"),
+    MXS_CHARSET_ENTRY("cp1251"),
+    MXS_CHARSET_ENTRY("cp1256"),
+    MXS_CHARSET_ENTRY("cp1257"),
+    MXS_CHARSET_ENTRY("cp850"),
+    MXS_CHARSET_ENTRY("cp852"),
+    MXS_CHARSET_ENTRY("cp866"),
+    MXS_CHARSET_ENTRY("cp932"),
+    MXS_CHARSET_ENTRY("dec8"),
+    MXS_CHARSET_ENTRY("eucjpms"),
+    MXS_CHARSET_ENTRY("euckr"),
+    MXS_CHARSET_ENTRY("gb2312"),
+    MXS_CHARSET_ENTRY("gbk"),
+    MXS_CHARSET_ENTRY("geostd8"),
+    MXS_CHARSET_ENTRY("greek"),
+    MXS_CHARSET_ENTRY("hebrew"),
+    MXS_CHARSET_ENTRY("hp8"),
+    MXS_CHARSET_ENTRY("keybcs2"),
+    MXS_CHARSET_ENTRY("koi8r"),
+    MXS_CHARSET_ENTRY("koi8u"),
+    MXS_CHARSET_ENTRY("latin1"),
+    MXS_CHARSET_ENTRY("latin2"),
+    MXS_CHARSET_ENTRY("latin5"),
+    MXS_CHARSET_ENTRY("latin7"),
+    MXS_CHARSET_ENTRY("macce"),
+    MXS_CHARSET_ENTRY("macroman"),
+    MXS_CHARSET_ENTRY("sjis"),
+    MXS_CHARSET_ENTRY("swe7"),
+    MXS_CHARSET_ENTRY("tis620"),
+    MXS_CHARSET_ENTRY("ucs2"),
+    MXS_CHARSET_ENTRY("ujis"),
+    MXS_CHARSET_ENTRY("utf16"),
+    MXS_CHARSET_ENTRY("utf16le"),
+    MXS_CHARSET_ENTRY("utf32"),
+    MXS_CHARSET_ENTRY("utf8"),
+    MXS_CHARSET_ENTRY("utf8mb4")
+};
+
+#undef MXS_CHARSET_ENTRY
+
+// Number of entries in mxs_charset_names.
+#define N_MXS_CHARSET_NAMES (sizeof(mxs_charset_names)/sizeof(mxs_charset_names[0]))
+
+// Comparison callback with the signature expected by bsearch().
+//
+// Both arguments point to a struct mxs_charset_entry. The names are
+// compared case-insensitively over the length of the shorter one; if
+// that part is equal, the shorter entry orders before the longer one.
+//
+// @param l  The entry on the left hand side (the search key).
+// @param r  The entry on the right hand side (a table element).
+//
+// @return Negative, zero or positive if l orders before, equal to or
+//         after r.
+int mxs_compare_charset_names(const void* l, const void* r)
+{
+    const struct mxs_charset_entry* lhs = l;
+    const struct mxs_charset_entry* rhs = r;
+
+    size_t common = MIN(lhs->len, rhs->len);
+    int cmp = strncasecmp(lhs->name, rhs->name, common);
+
+    // A difference within the common prefix, or equal lengths, settles it.
+    if (cmp != 0 || lhs->len == rhs->len)
+    {
+        return cmp;
+    }
+
+    // One name is a proper prefix of the other; the shorter sorts first.
+    return lhs->len < rhs->len ? -1 : 1;
+}
+
+// Checks whether some text is the name of a known character set.
+//
+// The text is looked up in mxs_charset_names using a binary search
+// with mxs_compare_charset_names, so the match is case-insensitive
+// and must cover exactly n characters.
+//
+// @param p  Start of the text; need not be NUL-terminated.
+// @param n  Number of characters at p to consider.
+//
+// @return 1 if the text is a character set name, 0 otherwise.
+int mxs_is_charset_name(const char* p, size_t n)
+{
+    struct mxs_charset_entry needle;
+    const void* hit;
+
+    needle.name = p;
+    needle.len = n;
+
+    hit = bsearch(&needle,
+                  mxs_charset_names,
+                  N_MXS_CHARSET_NAMES,
+                  sizeof(mxs_charset_names[0]),
+                  mxs_compare_charset_names);
+
+    return hit != NULL ? 1 : 0;
+}
+
+
+
 int sqlite3GetToken(Parse* pParse, const unsigned char *z, int *tokenType){
 #else
 int sqlite3GetToken(const unsigned char *z, int *tokenType){
@ -558,6 +643,22 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){
      /* Not a bit field. It may be a keyword so we flow through */
 #endif
      for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
+#ifdef MAXSCALE
+      if ( z[0]== '_' ) {
+        // This can be a case of [_charset_name], so we need to
+        // accept more. We can eat all characters acceptable for
+        // an identifier.
+        while ( IdChar(z[i]) ) { i++; }
+
+        if (mxs_is_charset_name((char*)z + 1, i - 1)) {
+            *tokenType = TK_CHARSET_NAME_KW;
+            return i;
+        } else {
+            // Token type will be TK_ID.
+            break;
+        }
+      }
+#endif
      if( IdChar(z[i]) ){
        /* This token started out using characters that can appear in keywords,
        ** but z[i] is a character not allowed within keywords, so this must