PostgreSQL Source Code  git master
parser.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * parser.c
4  * Main entry point/driver for PostgreSQL grammar
5  *
6  * Note that the grammar is not allowed to perform any table access
7  * (since we need to be able to do basic parsing even while inside an
8  * aborted transaction). Therefore, the data structures returned by
9  * the grammar are "raw" parsetrees that still need to be analyzed by
10  * analyze.c and related files.
11  *
12  *
13  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
14  * Portions Copyright (c) 1994, Regents of the University of California
15  *
16  * IDENTIFICATION
17  * src/backend/parser/parser.c
18  *
19  *-------------------------------------------------------------------------
20  */
21 
22 #include "postgres.h"
23 
24 #include "mb/pg_wchar.h"
25 #include "parser/gramparse.h"
26 #include "parser/parser.h"
27 #include "parser/scansup.h"
28 
29 static bool check_uescapechar(unsigned char escape);
30 static char *str_udeescape(const char *str, char escape,
31  int position, core_yyscan_t yyscanner);
32 
33 
34 /*
35  * raw_parser
36  * Given a query in string form, do lexical and grammatical analysis.
37  *
38  * Returns a list of raw (un-analyzed) parse trees. The contents of the
39  * list have the form required by the specified RawParseMode.
40  */
41 List *
43 {
45  base_yy_extra_type yyextra;
46  int yyresult;
47 
48  /* initialize the flex scanner */
49  yyscanner = scanner_init(str, &yyextra.core_yy_extra,
51 
52  /* base_yylex() only needs us to initialize the lookahead token, if any */
53  if (mode == RAW_PARSE_DEFAULT)
54  yyextra.have_lookahead = false;
55  else
56  {
57  /* this array is indexed by RawParseMode enum */
58  static const int mode_token[] = {
59  0, /* RAW_PARSE_DEFAULT */
60  MODE_TYPE_NAME, /* RAW_PARSE_TYPE_NAME */
61  MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */
62  MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */
63  MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */
64  MODE_PLPGSQL_ASSIGN3 /* RAW_PARSE_PLPGSQL_ASSIGN3 */
65  };
66 
67  yyextra.have_lookahead = true;
68  yyextra.lookahead_token = mode_token[mode];
69  yyextra.lookahead_yylloc = 0;
70  yyextra.lookahead_end = NULL;
71  }
72 
73  /* initialize the bison parser */
74  parser_init(&yyextra);
75 
76  /* Parse! */
77  yyresult = base_yyparse(yyscanner);
78 
79  /* Clean up (release memory) */
80  scanner_finish(yyscanner);
81 
82  if (yyresult) /* error */
83  return NIL;
84 
85  return yyextra.parsetree;
86 }
87 
88 
89 /*
90  * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91  *
92  * This filter is needed because in some cases the standard SQL grammar
93  * requires more than one token lookahead. We reduce these cases to one-token
94  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95  *
96  * Using a filter is simpler than trying to recognize multiword tokens
97  * directly in scan.l, because we'd have to allow for comments between the
98  * words. Furthermore it's not clear how to do that without re-introducing
99  * scanner backtrack, which would cost more performance than this filter
100  * layer does.
101  *
102  * We also use this filter to convert UIDENT and USCONST sequences into
103  * plain IDENT and SCONST tokens. While that could be handled by additional
104  * productions in the main grammar, it's more efficient to do it like this.
105  *
106  * The filter also provides a convenient place to translate between
107  * the core_YYSTYPE and YYSTYPE representations (which are really the
108  * same thing anyway, but notationally they're different).
109  */
110 int
111 base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112 {
113  base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
114  int cur_token;
115  int next_token;
116  int cur_token_length;
117  YYLTYPE cur_yylloc;
118 
119  /* Get next token --- we might already have it */
120  if (yyextra->have_lookahead)
121  {
122  cur_token = yyextra->lookahead_token;
123  lvalp->core_yystype = yyextra->lookahead_yylval;
124  *llocp = yyextra->lookahead_yylloc;
125  if (yyextra->lookahead_end)
126  *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127  yyextra->have_lookahead = false;
128  }
129  else
130  cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131 
132  /*
133  * If this token isn't one that requires lookahead, just return it. If it
134  * does, determine the token length. (We could get that via strlen(), but
135  * since we have such a small set of possibilities, hardwiring seems
136  * feasible and more efficient --- at least for the fixed-length cases.)
137  */
138  switch (cur_token)
139  {
140  case NOT:
141  cur_token_length = 3;
142  break;
143  case NULLS_P:
144  cur_token_length = 5;
145  break;
146  case WITH:
147  cur_token_length = 4;
148  break;
149  case UIDENT:
150  case USCONST:
151  cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
152  break;
153  default:
154  return cur_token;
155  }
156 
157  /*
158  * Identify end+1 of current token. core_yylex() has temporarily stored a
159  * '\0' here, and will undo that when we call it again. We need to redo
160  * it to fully revert the lookahead call for error reporting purposes.
161  */
162  yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
163  *llocp + cur_token_length;
164  Assert(*(yyextra->lookahead_end) == '\0');
165 
166  /*
167  * Save and restore *llocp around the call. It might look like we could
168  * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
169  * does not work because flex actually holds onto the last-passed pointer
170  * internally, and will use that for error reporting. We need any error
171  * reports to point to the current token, not the next one.
172  */
173  cur_yylloc = *llocp;
174 
175  /* Get next token, saving outputs into lookahead variables */
176  next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
177  yyextra->lookahead_token = next_token;
178  yyextra->lookahead_yylloc = *llocp;
179 
180  *llocp = cur_yylloc;
181 
182  /* Now revert the un-truncation of the current token */
183  yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
184  *(yyextra->lookahead_end) = '\0';
185 
186  yyextra->have_lookahead = true;
187 
188  /* Replace cur_token if needed, based on lookahead */
189  switch (cur_token)
190  {
191  case NOT:
192  /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
193  switch (next_token)
194  {
195  case BETWEEN:
196  case IN_P:
197  case LIKE:
198  case ILIKE:
199  case SIMILAR:
200  cur_token = NOT_LA;
201  break;
202  }
203  break;
204 
205  case NULLS_P:
206  /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
207  switch (next_token)
208  {
209  case FIRST_P:
210  case LAST_P:
211  cur_token = NULLS_LA;
212  break;
213  }
214  break;
215 
216  case WITH:
217  /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
218  switch (next_token)
219  {
220  case TIME:
221  case ORDINALITY:
222  cur_token = WITH_LA;
223  break;
224  }
225  break;
226 
227  case UIDENT:
228  case USCONST:
229  /* Look ahead for UESCAPE */
230  if (next_token == UESCAPE)
231  {
232  /* Yup, so get third token, which had better be SCONST */
233  const char *escstr;
234 
235  /* Again save and restore *llocp */
236  cur_yylloc = *llocp;
237 
238  /* Un-truncate current token so errors point to third token */
239  *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
240 
241  /* Get third token */
242  next_token = core_yylex(&(yyextra->lookahead_yylval),
243  llocp, yyscanner);
244 
245  /* If we throw error here, it will point to third token */
246  if (next_token != SCONST)
247  scanner_yyerror("UESCAPE must be followed by a simple string literal",
248  yyscanner);
249 
250  escstr = yyextra->lookahead_yylval.str;
251  if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
252  scanner_yyerror("invalid Unicode escape character",
253  yyscanner);
254 
255  /* Now restore *llocp; errors will point to first token */
256  *llocp = cur_yylloc;
257 
258  /* Apply Unicode conversion */
259  lvalp->core_yystype.str =
260  str_udeescape(lvalp->core_yystype.str,
261  escstr[0],
262  *llocp,
263  yyscanner);
264 
265  /*
266  * We don't need to revert the un-truncation of UESCAPE. What
267  * we do want to do is clear have_lookahead, thereby consuming
268  * all three tokens.
269  */
270  yyextra->have_lookahead = false;
271  }
272  else
273  {
274  /* No UESCAPE, so convert using default escape character */
275  lvalp->core_yystype.str =
276  str_udeescape(lvalp->core_yystype.str,
277  '\\',
278  *llocp,
279  yyscanner);
280  }
281 
282  if (cur_token == UIDENT)
283  {
284  /* It's an identifier, so truncate as appropriate */
285  truncate_identifier(lvalp->core_yystype.str,
286  strlen(lvalp->core_yystype.str),
287  true);
288  cur_token = IDENT;
289  }
290  else if (cur_token == USCONST)
291  {
292  cur_token = SCONST;
293  }
294  break;
295  }
296 
297  return cur_token;
298 }
299 
300 /* convert hex digit (caller should have verified that) to value */
301 static unsigned int
302 hexval(unsigned char c)
303 {
304  if (c >= '0' && c <= '9')
305  return c - '0';
306  if (c >= 'a' && c <= 'f')
307  return c - 'a' + 0xA;
308  if (c >= 'A' && c <= 'F')
309  return c - 'A' + 0xA;
310  elog(ERROR, "invalid hexadecimal digit");
311  return 0; /* not reached */
312 }
313 
314 /* is Unicode code point acceptable? */
315 static void
317 {
319  ereport(ERROR,
320  (errcode(ERRCODE_SYNTAX_ERROR),
321  errmsg("invalid Unicode escape value")));
322 }
323 
/*
 * check_uescapechar
 *		Is "escape" usable as the escape character in UESCAPE syntax?
 *
 * Hex digits, '+', either quote character, and anything scanner_isspace()
 * treats as whitespace are rejected; every other character is accepted.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape))
		return false;
	if (escape == '+' || escape == '\'' || escape == '"')
		return false;
	if (scanner_isspace(escape))
		return false;

	return true;
}
337 
338 /*
339  * Process Unicode escapes in "str", producing a palloc'd plain string
340  *
341  * escape: the escape character to use
342  * position: start position of U&'' or U&"" string token
343  * yyscanner: context information needed for error reports
344  */
345 static char *
346 str_udeescape(const char *str, char escape,
347  int position, core_yyscan_t yyscanner)
348 {
349  const char *in;
350  char *new,
351  *out;
352  size_t new_len;
353  pg_wchar pair_first = 0;
354  ScannerCallbackState scbstate;
355 
356  /*
357  * Guesstimate that result will be no longer than input, but allow enough
358  * padding for Unicode conversion.
359  */
360  new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
361  new = palloc(new_len);
362 
363  in = str;
364  out = new;
365  while (*in)
366  {
367  /* Enlarge string if needed */
368  size_t out_dist = out - new;
369 
370  if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
371  {
372  new_len *= 2;
373  new = repalloc(new, new_len);
374  out = new + out_dist;
375  }
376 
377  if (in[0] == escape)
378  {
379  /*
380  * Any errors reported while processing this escape sequence will
381  * have an error cursor pointing at the escape.
382  */
383  setup_scanner_errposition_callback(&scbstate, yyscanner,
384  in - str + position + 3); /* 3 for U&" */
385  if (in[1] == escape)
386  {
387  if (pair_first)
388  goto invalid_pair;
389  *out++ = escape;
390  in += 2;
391  }
392  else if (isxdigit((unsigned char) in[1]) &&
393  isxdigit((unsigned char) in[2]) &&
394  isxdigit((unsigned char) in[3]) &&
395  isxdigit((unsigned char) in[4]))
396  {
397  pg_wchar unicode;
398 
399  unicode = (hexval(in[1]) << 12) +
400  (hexval(in[2]) << 8) +
401  (hexval(in[3]) << 4) +
402  hexval(in[4]);
403  check_unicode_value(unicode);
404  if (pair_first)
405  {
406  if (is_utf16_surrogate_second(unicode))
407  {
408  unicode = surrogate_pair_to_codepoint(pair_first, unicode);
409  pair_first = 0;
410  }
411  else
412  goto invalid_pair;
413  }
414  else if (is_utf16_surrogate_second(unicode))
415  goto invalid_pair;
416 
417  if (is_utf16_surrogate_first(unicode))
418  pair_first = unicode;
419  else
420  {
421  pg_unicode_to_server(unicode, (unsigned char *) out);
422  out += strlen(out);
423  }
424  in += 5;
425  }
426  else if (in[1] == '+' &&
427  isxdigit((unsigned char) in[2]) &&
428  isxdigit((unsigned char) in[3]) &&
429  isxdigit((unsigned char) in[4]) &&
430  isxdigit((unsigned char) in[5]) &&
431  isxdigit((unsigned char) in[6]) &&
432  isxdigit((unsigned char) in[7]))
433  {
434  pg_wchar unicode;
435 
436  unicode = (hexval(in[2]) << 20) +
437  (hexval(in[3]) << 16) +
438  (hexval(in[4]) << 12) +
439  (hexval(in[5]) << 8) +
440  (hexval(in[6]) << 4) +
441  hexval(in[7]);
442  check_unicode_value(unicode);
443  if (pair_first)
444  {
445  if (is_utf16_surrogate_second(unicode))
446  {
447  unicode = surrogate_pair_to_codepoint(pair_first, unicode);
448  pair_first = 0;
449  }
450  else
451  goto invalid_pair;
452  }
453  else if (is_utf16_surrogate_second(unicode))
454  goto invalid_pair;
455 
456  if (is_utf16_surrogate_first(unicode))
457  pair_first = unicode;
458  else
459  {
460  pg_unicode_to_server(unicode, (unsigned char *) out);
461  out += strlen(out);
462  }
463  in += 8;
464  }
465  else
466  ereport(ERROR,
467  (errcode(ERRCODE_SYNTAX_ERROR),
468  errmsg("invalid Unicode escape"),
469  errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
470 
472  }
473  else
474  {
475  if (pair_first)
476  goto invalid_pair;
477 
478  *out++ = *in++;
479  }
480  }
481 
482  /* unfinished surrogate pair? */
483  if (pair_first)
484  goto invalid_pair;
485 
486  *out = '\0';
487  return new;
488 
489  /*
490  * We might get here with the error callback active, or not. Call
491  * scanner_errposition to make sure an error cursor appears; if the
492  * callback is active, this is duplicative but harmless.
493  */
494 invalid_pair:
495  ereport(ERROR,
496  (errcode(ERRCODE_SYNTAX_ERROR),
497  errmsg("invalid Unicode surrogate pair"),
498  scanner_errposition(in - str + position + 3, /* 3 for U&" */
499  yyscanner)));
500  return NULL; /* keep compiler quiet */
501 }
#define NIL
Definition: pg_list.h:65
static PgChecksumMode mode
Definition: pg_checksums.c:61
char * lookahead_end
Definition: gramparse.h:49
int errhint(const char *fmt,...)
Definition: elog.c:1156
static unsigned int hexval(unsigned char c)
Definition: parser.c:302
int base_yyparse(core_yyscan_t yyscanner)
void cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
void * core_yyscan_t
Definition: scanner.h:121
core_YYSTYPE lookahead_yylval
Definition: gramparse.h:47
static bool is_valid_unicode_codepoint(pg_wchar c)
Definition: pg_wchar.h:539
PGDLLIMPORT const uint16 ScanKeywordTokens[]
static bool is_utf16_surrogate_second(pg_wchar c)
Definition: pg_wchar.h:551
int errcode(int sqlerrcode)
Definition: elog.c:698
core_yy_extra_type core_yy_extra
Definition: gramparse.h:40
RawParseMode
Definition: parser.h:37
static bool next_token(char **lineptr, char *buf, int bufsz, bool *initial_quote, bool *terminating_comma, int elevel, char **err_msg)
Definition: hba.c:203
char * scanbuf
Definition: scanner.h:72
static bool check_uescapechar(unsigned char escape)
Definition: parser.c:326
PGDLLIMPORT const ScanKeywordList ScanKeywords
void scanner_yyerror(const char *message, core_yyscan_t yyscanner) pg_attribute_noreturn()
void truncate_identifier(char *ident, int len, bool warn)
Definition: scansup.c:93
#define MAX_UNICODE_EQUIVALENT_STRING
Definition: pg_wchar.h:343
#define ERROR
Definition: elog.h:46
void setup_scanner_errposition_callback(ScannerCallbackState *scbstate, core_yyscan_t yyscanner, int location)
static char * str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner)
Definition: parser.c:346
char * c
void parser_init(base_yy_extra_type *yyext)
#define YYLTYPE
Definition: scanner.h:44
unsigned int pg_wchar
Definition: mbprint.c:31
core_yyscan_t scanner_init(const char *str, core_yy_extra_type *yyext, const ScanKeywordList *keywordlist, const uint16 *keyword_tokens)
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
Definition: pg_wchar.h:557
int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
Definition: parser.c:111
int scanner_errposition(int location, core_yyscan_t yyscanner)
void pg_unicode_to_server(pg_wchar c, unsigned char *s)
Definition: mbutils.c:864
bool scanner_isspace(char ch)
Definition: scansup.c:117
int core_yylex(core_YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
#define pg_yyget_extra(yyscanner)
Definition: gramparse.h:64
char * str
Definition: scanner.h:32
#define ereport(elevel,...)
Definition: elog.h:157
#define Assert(condition)
Definition: c.h:804
YYLTYPE lookahead_yylloc
Definition: gramparse.h:48
static void check_unicode_value(pg_wchar c)
Definition: parser.c:316
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1182
char lookahead_hold_char
Definition: gramparse.h:50
static core_yyscan_t yyscanner
Definition: pl_scanner.c:106
void * palloc(Size size)
Definition: mcxt.c:1062
int errmsg(const char *fmt,...)
Definition: elog.c:909
#define elog(elevel,...)
Definition: elog.h:232
void scanner_finish(core_yyscan_t yyscanner)
static bool is_utf16_surrogate_first(pg_wchar c)
Definition: pg_wchar.h:545
List * raw_parser(const char *str, RawParseMode mode)
Definition: parser.c:42
Definition: pg_list.h:50