mirror of
https://git.postgresql.org/git/postgresql.git
synced 2026-02-10 09:37:29 +08:00
Previously, the core scanner's yy_transition[] array had 37045 elements. Since that number is larger than INT16_MAX, Flex generated the array to contain 32-bit integers. By reimplementing some of the bulkier scanner rules, this patch reduces the array to 20495 elements. The much smaller total length, combined with the consequent use of 16-bit integers for the array elements, reduces the binary size by over 200kB. This was accomplished in two ways: 1. Consolidate handling of quote continuations into a new start condition, rather than duplicating that logic for five different string types. 2. Treat Unicode strings and identifiers followed by a UESCAPE sequence as three separate tokens, rather than one. The logic to de-escape Unicode strings is moved to the filter code in parser.c, which already had the ability to provide special processing for token sequences. While we could have implemented the conversion in the grammar, that approach was rejected for performance and maintainability reasons. Performance in microbenchmarks of raw parsing seems equal or slightly faster in most cases, and it's reasonable to expect that in real-world usage (with more competition for the CPU cache) there will be a larger win. The exception is UESCAPE sequences; lexing those is about 10% slower, primarily because the scanner now has to be called three times rather than one. This seems acceptable since that feature is very rarely used. The psql and ecpg lexers are likewise modified, primarily because we want to keep them all in sync. Since those lexers don't use the space-hogging -CF option, the space savings are much smaller, but it's still good for perhaps 10kB apiece. While at it, merge the ecpg lexer's handling of C-style comments used in SQL and in C. Those have different rules regarding nested comments, but since we already have the ability to keep track of the previous start condition, we can use that to handle both cases within a single start condition. 
This matches the core scanner more closely. John Naylor Discussion: https://postgr.es/m/CACPNZCvaoa3EgVWm5yZhcSTX6RAtaLgniCPcBVOCwm8h3xpWkw@mail.gmail.com
135 lines
4.8 KiB
C
135 lines
4.8 KiB
C
/*-------------------------------------------------------------------------
 *
 * scanner.h
 *		API for the core scanner (flex machine)
 *
 * The core scanner is also used by PL/pgSQL, so we provide a public API
 * for it.  However, the rest of the backend is only expected to use the
 * higher-level API provided by parser.h.
 *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/parser/scanner.h
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#ifndef SCANNER_H
|
|
#define SCANNER_H
|
|
|
|
#include "common/keywords.h"
|
|
|
|
/*
 * The scanner returns extra data about scanned tokens in this union type.
 * Note that this is a subset of the fields used in YYSTYPE of the bison
 * parsers built atop the scanner.
 *
 * Being a union, only one member holds a meaningful value at a time; which
 * one applies depends on the kind of token that was scanned.
 */
typedef union core_YYSTYPE
{
	int			ival;			/* for integer literals */
	char	   *str;			/* for identifiers and non-integer literals */
	const char *keyword;		/* canonical spelling of keywords */
} core_YYSTYPE;
|
|
|
|
/*
 * We track token locations in terms of byte offsets from the start of the
 * source string, not the column number/line number representation that
 * bison uses by default.  Also, to minimize overhead we track only one
 * location (usually the first token location) for each construct, not
 * the beginning and ending locations as bison does by default.  It's
 * therefore sufficient to make YYLTYPE an int.
 */
#define YYLTYPE  int
|
|
|
|
/*
 * Another important component of the scanner's API is the token code numbers.
 * However, those are not defined in this file, because bison insists on
 * defining them for itself.  The token codes used by the core scanner are
 * the ASCII characters plus these:
 *	%token <str>	IDENT UIDENT FCONST SCONST USCONST BCONST XCONST Op
 *	%token <ival>	ICONST PARAM
 *	%token			TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER
 *	%token			LESS_EQUALS GREATER_EQUALS NOT_EQUALS
 * The above token definitions *must* be the first ones declared in any
 * bison parser built atop this scanner, so that they will have consistent
 * numbers assigned to them (specifically, IDENT = 258 and so on).
 */
|
|
|
|
/*
|
|
* The YY_EXTRA data that a flex scanner allows us to pass around.
|
|
* Private state needed by the core scanner goes here. Note that the actual
|
|
* yy_extra struct may be larger and have this as its first component, thus
|
|
* allowing the calling parser to keep some fields of its own in YY_EXTRA.
|
|
*/
|
|
typedef struct core_yy_extra_type
|
|
{
|
|
/*
|
|
* The string the scanner is physically scanning. We keep this mainly so
|
|
* that we can cheaply compute the offset of the current token (yytext).
|
|
*/
|
|
char *scanbuf;
|
|
Size scanbuflen;
|
|
|
|
/*
|
|
* The keyword list to use, and the associated grammar token codes.
|
|
*/
|
|
const ScanKeywordList *keywordlist;
|
|
const uint16 *keyword_tokens;
|
|
|
|
/*
|
|
* Scanner settings to use. These are initialized from the corresponding
|
|
* GUC variables by scanner_init(). Callers can modify them after
|
|
* scanner_init() if they don't want the scanner's behavior to follow the
|
|
* prevailing GUC settings.
|
|
*/
|
|
int backslash_quote;
|
|
bool escape_string_warning;
|
|
bool standard_conforming_strings;
|
|
|
|
/*
|
|
* literalbuf is used to accumulate literal values when multiple rules are
|
|
* needed to parse a single literal. Call startlit() to reset buffer to
|
|
* empty, addlit() to add text. NOTE: the string in literalbuf is NOT
|
|
* necessarily null-terminated, but there always IS room to add a trailing
|
|
* null at offset literallen. We store a null only when we need it.
|
|
*/
|
|
char *literalbuf; /* palloc'd expandable buffer */
|
|
int literallen; /* actual current string length */
|
|
int literalalloc; /* current allocated buffer size */
|
|
|
|
int state_before_str_stop; /* start cond. before end quote */
|
|
int xcdepth; /* depth of nesting in slash-star comments */
|
|
char *dolqstart; /* current $foo$ quote start string */
|
|
|
|
/* first part of UTF16 surrogate pair for Unicode escapes */
|
|
int32 utf16_first_part;
|
|
|
|
/* state variables for literal-lexing warnings */
|
|
bool warn_on_first_escape;
|
|
bool saw_non_ascii;
|
|
} core_yy_extra_type;
|
|
|
|
/*
 * The type of yyscanner is opaque outside scan.l.
 *
 * Callers should treat the value purely as a handle: obtain it from
 * scanner_init() and pass it back to the other scanner entry points.
 */
typedef void *core_yyscan_t;
|
|
|
|
|
|
/* Constant data exported from parser/scan.l */
|
|
extern PGDLLIMPORT const uint16 ScanKeywordTokens[];
|
|
|
|
/* Entry points in parser/scan.l */
|
|
extern core_yyscan_t scanner_init(const char *str,
|
|
core_yy_extra_type *yyext,
|
|
const ScanKeywordList *keywordlist,
|
|
const uint16 *keyword_tokens);
|
|
extern void scanner_finish(core_yyscan_t yyscanner);
|
|
extern int core_yylex(core_YYSTYPE *lvalp, YYLTYPE *llocp,
|
|
core_yyscan_t yyscanner);
|
|
extern int scanner_errposition(int location, core_yyscan_t yyscanner);
|
|
extern void scanner_yyerror(const char *message, core_yyscan_t yyscanner) pg_attribute_noreturn();
|
|
|
|
#endif /* SCANNER_H */
|