MXS-2732 Recognize character set names
In the tokenizer we now recognize the character set names of MariaDB and return a specific token for them. However, where a character set name is not expected, it is automatically treated as an identifier. Note that when a character set name is explicitly specified for a literal string, the name must be prefixed with an underscore. For example, the character set "latin1" is written as "_latin1 'a'" when it introduces a literal string. Note that this does not fix the sqlite3 bug causing a leak, but since the statement will now be parsed correctly, the leak will no longer manifest itself.
This commit is contained in:
@ -620,7 +620,7 @@ columnid(A) ::= nm(X). {
|
||||
// TODO: BINARY is a reserved word and should not automatically convert into an identifier.
|
||||
// TODO: However, if not here then rules such as CAST need to be modified.
|
||||
BINARY
|
||||
/*CASCADE*/ CAST CLOSE COLUMNKW COLUMNS COMMENT CONCURRENT /*CONFLICT*/ CONNECTION
|
||||
/*CASCADE*/ CAST CHARSET_NAME_KW CLOSE COLUMNKW COLUMNS COMMENT CONCURRENT /*CONFLICT*/ CONNECTION
|
||||
DATA DATABASE DEALLOCATE DEFERRED /*DESC*/ /*DETACH*/ DUMPFILE
|
||||
/*EACH*/ END ENGINE ENUM EXCLUSIVE /*EXPLAIN*/ EXTENDED
|
||||
FIELDS FIRST FLUSH /*FOR*/ FORMAT
|
||||
@ -1907,6 +1907,7 @@ expr(A) ::= nm(X) DOT nm(Y) DOT nm(Z). {
|
||||
}
|
||||
term(A) ::= INTEGER|FLOAT|BLOB(X). {spanExpr(&A, pParse, @X, &X);}
|
||||
term(A) ::= STRING(X). {spanExpr(&A, pParse, @X, &X);}
|
||||
term(A) ::= CHARSET_NAME_KW(X) STRING(Y). {spanExpr(&A, pParse, @X, &Y);}
|
||||
expr(A) ::= VARIABLE(X). {
|
||||
if( X.n>=2 && X.z[0]=='#' && sqlite3Isdigit(X.z[1]) ){
|
||||
/* When doing a nested parse, one can include terms in an expression
|
||||
@ -1926,7 +1927,7 @@ expr(A) ::= VARIABLE(X). {
|
||||
spanSet(&A, &X, &X);
|
||||
}
|
||||
%ifdef MAXSCALE
|
||||
expr(A) ::= id(X) INTEGER(Y). {
|
||||
expr(A) ::= CHARSET_NAME_KW(X) INTEGER(Y). {
|
||||
// The sole purpose of this is to interpret something like '_utf8mb4 0xD091D092D093'
|
||||
// as a string. It does not matter that any character set name followed by an integer
|
||||
// will be interpreted as a string, as invalid usage will be caught by the server.
|
||||
|
@ -199,6 +199,91 @@ int sqlite3IsIdChar(u8 c){ return IdChar(c); }
|
||||
*/
|
||||
#ifdef MAXSCALE
|
||||
extern int maxscaleComment();
|
||||
|
||||
// A character set name together with its length. Used both for the entries of
// the mxs_charset_names table and for the lookup key built in
// mxs_is_charset_name(). Initialised positionally as { name, len }, so the
// field order must not change.
struct mxs_charset_entry
|
||||
{
|
||||
// Start of the name. For a lookup key this points into the SQL text being
// tokenized, so it is not NUL-terminated at 'len'; always use 'len'.
const char* name;
|
||||
// Number of characters in 'name'.
size_t len;
|
||||
};
|
||||
|
||||
// Character set names of MariaDB.
|
||||
//
|
||||
// NOTE: MUST be kept in alphabetical order.
|
||||
const struct mxs_charset_entry mxs_charset_names[] =
|
||||
{
|
||||
{ "armscii8", 8 },
|
||||
{ "ascii", 5 },
|
||||
{ "big5", 4 },
|
||||
{ "binary", 6 },
|
||||
{ "cp1250", 6 },
|
||||
{ "cp1251", 6 },
|
||||
{ "cp1256", 6 },
|
||||
{ "cp1257", 6 },
|
||||
{ "cp850", 5 },
|
||||
{ "cp852", 5 },
|
||||
{ "cp866", 5 },
|
||||
{ "cp932", 5 },
|
||||
{ "dec8", 4 },
|
||||
{ "eucjpms", 7 },
|
||||
{ "euckr", 5 },
|
||||
{ "gb2312", 6 },
|
||||
{ "gbk", 3 },
|
||||
{ "geostd8", 7 },
|
||||
{ "greek", 5 },
|
||||
{ "hebrew", 6 },
|
||||
{ "hp8", 3 },
|
||||
{ "keybcs2", 7 },
|
||||
{ "koi8r", 5 },
|
||||
{ "koi8u", 5 },
|
||||
{ "latin1", 6 },
|
||||
{ "latin2", 6 },
|
||||
{ "latin5", 6 },
|
||||
{ "latin7", 6 },
|
||||
{ "macce", 5 },
|
||||
{ "macroman", 8 },
|
||||
{ "sjis", 4 },
|
||||
{ "swe7", 4 },
|
||||
{ "tis620", 6 },
|
||||
{ "ucs2", 4 },
|
||||
{ "ujis", 4 },
|
||||
{ "utf16", 5 },
|
||||
{ "utf16le", 7 },
|
||||
{ "utf32", 5 },
|
||||
{ "utf8", 4 },
|
||||
{ "utf8mb4", 7 }
|
||||
};
|
||||
|
||||
#define N_MXS_CHARSET_NAMES (sizeof(mxs_charset_names)/sizeof(mxs_charset_names[0]))
|
||||
|
||||
int mxs_compare_charset_names(const void* l, const void* r)
|
||||
{
|
||||
const struct mxs_charset_entry* key = (const struct mxs_charset_entry*)l;
|
||||
const struct mxs_charset_entry* value = (const struct mxs_charset_entry*)r;
|
||||
|
||||
int rv = strncasecmp(key->name, value->name, MIN(key->len, value->len));
|
||||
|
||||
if (key->len != value->len)
|
||||
{
|
||||
if (rv == 0)
|
||||
{
|
||||
rv = key->len < value->len ? -1 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
int mxs_is_charset_name(const char* p, size_t n)
|
||||
{
|
||||
struct mxs_charset_entry key = { p, n };
|
||||
|
||||
return bsearch(&key,
|
||||
mxs_charset_names, N_MXS_CHARSET_NAMES, sizeof(mxs_charset_names[0]),
|
||||
mxs_compare_charset_names) != 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int sqlite3GetToken(Parse* pParse, const unsigned char *z, int *tokenType){
|
||||
#else
|
||||
int sqlite3GetToken(const unsigned char *z, int *tokenType){
|
||||
@ -558,6 +643,22 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){
|
||||
/* Not a bit field. It may be a keyword so we flow through */
|
||||
#endif
|
||||
for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
|
||||
#ifdef MAXSCALE
|
||||
if ( z[0]== '_' ) {
|
||||
// This can be a case of [_charset_name], so we need to
|
||||
// accept more. We can eat all characters acceptable for
|
||||
// an identifier.
|
||||
while ( IdChar(z[i]) ) { i++; }
|
||||
|
||||
if (mxs_is_charset_name((char*)z + 1, i - 1)) {
|
||||
*tokenType = TK_CHARSET_NAME_KW;
|
||||
return i;
|
||||
} else {
|
||||
// Token type will be TK_ID.
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if( IdChar(z[i]) ){
|
||||
/* This token started out using characters that can appear in keywords,
|
||||
** but z[i] is a character not allowed within keywords, so this must
|
||||
|
Reference in New Issue
Block a user