PostgreSQL Source Code  git master
parser.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * parser.c
4  * Main entry point/driver for PostgreSQL grammar
5  *
6  * Note that the grammar is not allowed to perform any table access
7  * (since we need to be able to do basic parsing even while inside an
8  * aborted transaction). Therefore, the data structures returned by
9  * the grammar are "raw" parsetrees that still need to be analyzed by
10  * analyze.c and related files.
11  *
12  *
13  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
14  * Portions Copyright (c) 1994, Regents of the University of California
15  *
16  * IDENTIFICATION
17  * src/backend/parser/parser.c
18  *
19  *-------------------------------------------------------------------------
20  */
21 
22 #include "postgres.h"
23 
24 #include "gramparse.h"
25 #include "mb/pg_wchar.h"
26 #include "parser/parser.h"
27 #include "parser/scansup.h"
28 
29 static bool check_uescapechar(unsigned char escape);
30 static char *str_udeescape(const char *str, char escape,
31  int position, core_yyscan_t yyscanner);
32 
33 
34 /*
35  * raw_parser
36  * Given a query in string form, do lexical and grammatical analysis.
37  *
38  * Returns a list of raw (un-analyzed) parse trees. The contents of the
39  * list have the form required by the specified RawParseMode.
40  */
41 List *
43 {
45  base_yy_extra_type yyextra;
46  int yyresult;
47 
48  /* initialize the flex scanner */
51 
52  /* base_yylex() only needs us to initialize the lookahead token, if any */
53  if (mode == RAW_PARSE_DEFAULT)
54  yyextra.have_lookahead = false;
55  else
56  {
57  /* this array is indexed by RawParseMode enum */
58  static const int mode_token[] = {
59  [RAW_PARSE_DEFAULT] = 0,
60  [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
61  [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
62  [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
63  [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
64  [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
65  };
66 
67  yyextra.have_lookahead = true;
68  yyextra.lookahead_token = mode_token[mode];
69  yyextra.lookahead_yylloc = 0;
70  yyextra.lookahead_end = NULL;
71  }
72 
73  /* initialize the bison parser */
74  parser_init(&yyextra);
75 
76  /* Parse! */
77  yyresult = base_yyparse(yyscanner);
78 
79  /* Clean up (release memory) */
81 
82  if (yyresult) /* error */
83  return NIL;
84 
85  return yyextra.parsetree;
86 }
87 
88 
89 /*
90  * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91  *
92  * This filter is needed because in some cases the standard SQL grammar
93  * requires more than one token lookahead. We reduce these cases to one-token
94  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95  *
96  * Using a filter is simpler than trying to recognize multiword tokens
97  * directly in scan.l, because we'd have to allow for comments between the
98  * words. Furthermore it's not clear how to do that without re-introducing
99  * scanner backtrack, which would cost more performance than this filter
100  * layer does.
101  *
102  * We also use this filter to convert UIDENT and USCONST sequences into
103  * plain IDENT and SCONST tokens. While that could be handled by additional
104  * productions in the main grammar, it's more efficient to do it like this.
105  *
106  * The filter also provides a convenient place to translate between
107  * the core_YYSTYPE and YYSTYPE representations (which are really the
108  * same thing anyway, but notationally they're different).
109  */
110 int
111 base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112 {
114  int cur_token;
115  int next_token;
116  int cur_token_length;
117  YYLTYPE cur_yylloc;
118 
119  /* Get next token --- we might already have it */
120  if (yyextra->have_lookahead)
121  {
122  cur_token = yyextra->lookahead_token;
123  lvalp->core_yystype = yyextra->lookahead_yylval;
124  *llocp = yyextra->lookahead_yylloc;
125  if (yyextra->lookahead_end)
126  *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127  yyextra->have_lookahead = false;
128  }
129  else
130  cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131 
132  /*
133  * If this token isn't one that requires lookahead, just return it. If it
134  * does, determine the token length. (We could get that via strlen(), but
135  * since we have such a small set of possibilities, hardwiring seems
136  * feasible and more efficient --- at least for the fixed-length cases.)
137  */
138  switch (cur_token)
139  {
140  case FORMAT:
141  cur_token_length = 6;
142  break;
143  case NOT:
144  cur_token_length = 3;
145  break;
146  case NULLS_P:
147  cur_token_length = 5;
148  break;
149  case WITH:
150  cur_token_length = 4;
151  break;
152  case UIDENT:
153  case USCONST:
154  cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
155  break;
156  case WITHOUT:
157  cur_token_length = 7;
158  break;
159  default:
160  return cur_token;
161  }
162 
163  /*
164  * Identify end+1 of current token. core_yylex() has temporarily stored a
165  * '\0' here, and will undo that when we call it again. We need to redo
166  * it to fully revert the lookahead call for error reporting purposes.
167  */
168  yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
169  *llocp + cur_token_length;
170  Assert(*(yyextra->lookahead_end) == '\0');
171 
172  /*
173  * Save and restore *llocp around the call. It might look like we could
174  * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
175  * does not work because flex actually holds onto the last-passed pointer
176  * internally, and will use that for error reporting. We need any error
177  * reports to point to the current token, not the next one.
178  */
179  cur_yylloc = *llocp;
180 
181  /* Get next token, saving outputs into lookahead variables */
182  next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
183  yyextra->lookahead_token = next_token;
184  yyextra->lookahead_yylloc = *llocp;
185 
186  *llocp = cur_yylloc;
187 
188  /* Now revert the un-truncation of the current token */
189  yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
190  *(yyextra->lookahead_end) = '\0';
191 
192  yyextra->have_lookahead = true;
193 
194  /* Replace cur_token if needed, based on lookahead */
195  switch (cur_token)
196  {
197  case FORMAT:
198  /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
199  switch (next_token)
200  {
201  case JSON:
202  cur_token = FORMAT_LA;
203  break;
204  }
205  break;
206 
207  case NOT:
208  /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
209  switch (next_token)
210  {
211  case BETWEEN:
212  case IN_P:
213  case LIKE:
214  case ILIKE:
215  case SIMILAR:
216  cur_token = NOT_LA;
217  break;
218  }
219  break;
220 
221  case NULLS_P:
222  /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
223  switch (next_token)
224  {
225  case FIRST_P:
226  case LAST_P:
227  cur_token = NULLS_LA;
228  break;
229  }
230  break;
231 
232  case WITH:
233  /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
234  switch (next_token)
235  {
236  case TIME:
237  case ORDINALITY:
238  cur_token = WITH_LA;
239  break;
240  }
241  break;
242 
243  case WITHOUT:
244  /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
245  switch (next_token)
246  {
247  case TIME:
248  cur_token = WITHOUT_LA;
249  break;
250  }
251  break;
252 
253  case UIDENT:
254  case USCONST:
255  /* Look ahead for UESCAPE */
256  if (next_token == UESCAPE)
257  {
258  /* Yup, so get third token, which had better be SCONST */
259  const char *escstr;
260 
261  /* Again save and restore *llocp */
262  cur_yylloc = *llocp;
263 
264  /* Un-truncate current token so errors point to third token */
265  *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
266 
267  /* Get third token */
268  next_token = core_yylex(&(yyextra->lookahead_yylval),
269  llocp, yyscanner);
270 
271  /* If we throw error here, it will point to third token */
272  if (next_token != SCONST)
273  scanner_yyerror("UESCAPE must be followed by a simple string literal",
274  yyscanner);
275 
276  escstr = yyextra->lookahead_yylval.str;
277  if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
278  scanner_yyerror("invalid Unicode escape character",
279  yyscanner);
280 
281  /* Now restore *llocp; errors will point to first token */
282  *llocp = cur_yylloc;
283 
284  /* Apply Unicode conversion */
285  lvalp->core_yystype.str =
286  str_udeescape(lvalp->core_yystype.str,
287  escstr[0],
288  *llocp,
289  yyscanner);
290 
291  /*
292  * We don't need to revert the un-truncation of UESCAPE. What
293  * we do want to do is clear have_lookahead, thereby consuming
294  * all three tokens.
295  */
296  yyextra->have_lookahead = false;
297  }
298  else
299  {
300  /* No UESCAPE, so convert using default escape character */
301  lvalp->core_yystype.str =
302  str_udeescape(lvalp->core_yystype.str,
303  '\\',
304  *llocp,
305  yyscanner);
306  }
307 
308  if (cur_token == UIDENT)
309  {
310  /* It's an identifier, so truncate as appropriate */
311  truncate_identifier(lvalp->core_yystype.str,
312  strlen(lvalp->core_yystype.str),
313  true);
314  cur_token = IDENT;
315  }
316  else if (cur_token == USCONST)
317  {
318  cur_token = SCONST;
319  }
320  break;
321  }
322 
323  return cur_token;
324 }
325 
326 /* convert hex digit (caller should have verified that) to value */
327 static unsigned int
328 hexval(unsigned char c)
329 {
330  if (c >= '0' && c <= '9')
331  return c - '0';
332  if (c >= 'a' && c <= 'f')
333  return c - 'a' + 0xA;
334  if (c >= 'A' && c <= 'F')
335  return c - 'A' + 0xA;
336  elog(ERROR, "invalid hexadecimal digit");
337  return 0; /* not reached */
338 }
339 
340 /* is Unicode code point acceptable? */
341 static void
343 {
345  ereport(ERROR,
346  (errcode(ERRCODE_SYNTAX_ERROR),
347  errmsg("invalid Unicode escape value")));
348 }
349 
/*
 * Decide whether 'escape' may serve as the Unicode escape character in
 * UESCAPE syntax.
 *
 * Rejected: hexadecimal digits, '+', single quote, double quote, and any
 * character scanner_isspace() treats as whitespace.  Everything else is
 * accepted.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape))
		return false;

	switch (escape)
	{
		case '+':
		case '\'':
		case '"':
			return false;
		default:
			break;
	}

	return !scanner_isspace(escape);
}
363 
364 /*
365  * Process Unicode escapes in "str", producing a palloc'd plain string
366  *
367  * escape: the escape character to use
368  * position: start position of U&'' or U&"" string token
369  * yyscanner: context information needed for error reports
370  */
371 static char *
372 str_udeescape(const char *str, char escape,
373  int position, core_yyscan_t yyscanner)
374 {
375  const char *in;
376  char *new,
377  *out;
378  size_t new_len;
379  pg_wchar pair_first = 0;
380  ScannerCallbackState scbstate;
381 
382  /*
383  * Guesstimate that result will be no longer than input, but allow enough
384  * padding for Unicode conversion.
385  */
386  new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
387  new = palloc(new_len);
388 
389  in = str;
390  out = new;
391  while (*in)
392  {
393  /* Enlarge string if needed */
394  size_t out_dist = out - new;
395 
396  if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
397  {
398  new_len *= 2;
399  new = repalloc(new, new_len);
400  out = new + out_dist;
401  }
402 
403  if (in[0] == escape)
404  {
405  /*
406  * Any errors reported while processing this escape sequence will
407  * have an error cursor pointing at the escape.
408  */
410  in - str + position + 3); /* 3 for U&" */
411  if (in[1] == escape)
412  {
413  if (pair_first)
414  goto invalid_pair;
415  *out++ = escape;
416  in += 2;
417  }
418  else if (isxdigit((unsigned char) in[1]) &&
419  isxdigit((unsigned char) in[2]) &&
420  isxdigit((unsigned char) in[3]) &&
421  isxdigit((unsigned char) in[4]))
422  {
423  pg_wchar unicode;
424 
425  unicode = (hexval(in[1]) << 12) +
426  (hexval(in[2]) << 8) +
427  (hexval(in[3]) << 4) +
428  hexval(in[4]);
429  check_unicode_value(unicode);
430  if (pair_first)
431  {
432  if (is_utf16_surrogate_second(unicode))
433  {
434  unicode = surrogate_pair_to_codepoint(pair_first, unicode);
435  pair_first = 0;
436  }
437  else
438  goto invalid_pair;
439  }
440  else if (is_utf16_surrogate_second(unicode))
441  goto invalid_pair;
442 
443  if (is_utf16_surrogate_first(unicode))
444  pair_first = unicode;
445  else
446  {
447  pg_unicode_to_server(unicode, (unsigned char *) out);
448  out += strlen(out);
449  }
450  in += 5;
451  }
452  else if (in[1] == '+' &&
453  isxdigit((unsigned char) in[2]) &&
454  isxdigit((unsigned char) in[3]) &&
455  isxdigit((unsigned char) in[4]) &&
456  isxdigit((unsigned char) in[5]) &&
457  isxdigit((unsigned char) in[6]) &&
458  isxdigit((unsigned char) in[7]))
459  {
460  pg_wchar unicode;
461 
462  unicode = (hexval(in[2]) << 20) +
463  (hexval(in[3]) << 16) +
464  (hexval(in[4]) << 12) +
465  (hexval(in[5]) << 8) +
466  (hexval(in[6]) << 4) +
467  hexval(in[7]);
468  check_unicode_value(unicode);
469  if (pair_first)
470  {
471  if (is_utf16_surrogate_second(unicode))
472  {
473  unicode = surrogate_pair_to_codepoint(pair_first, unicode);
474  pair_first = 0;
475  }
476  else
477  goto invalid_pair;
478  }
479  else if (is_utf16_surrogate_second(unicode))
480  goto invalid_pair;
481 
482  if (is_utf16_surrogate_first(unicode))
483  pair_first = unicode;
484  else
485  {
486  pg_unicode_to_server(unicode, (unsigned char *) out);
487  out += strlen(out);
488  }
489  in += 8;
490  }
491  else
492  ereport(ERROR,
493  (errcode(ERRCODE_SYNTAX_ERROR),
494  errmsg("invalid Unicode escape"),
495  errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
496 
498  }
499  else
500  {
501  if (pair_first)
502  goto invalid_pair;
503 
504  *out++ = *in++;
505  }
506  }
507 
508  /* unfinished surrogate pair? */
509  if (pair_first)
510  goto invalid_pair;
511 
512  *out = '\0';
513  return new;
514 
515  /*
516  * We might get here with the error callback active, or not. Call
517  * scanner_errposition to make sure an error cursor appears; if the
518  * callback is active, this is duplicative but harmless.
519  */
520 invalid_pair:
521  ereport(ERROR,
522  (errcode(ERRCODE_SYNTAX_ERROR),
523  errmsg("invalid Unicode surrogate pair"),
524  scanner_errposition(in - str + position + 3, /* 3 for U&" */
525  yyscanner)));
526  return NULL; /* keep compiler quiet */
527 }
List * raw_parser(const char *str, RawParseMode mode)
Definition: parser.c:42
static void check_unicode_value(pg_wchar c)
Definition: parser.c:342
static unsigned int hexval(unsigned char c)
Definition: parser.c:328
static char * str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner)
Definition: parser.c:372
int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
Definition: parser.c:111
static bool check_uescapechar(unsigned char escape)
Definition: parser.c:352
#define Assert(condition)
Definition: c.h:858
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
#define pg_yyget_extra(yyscanner)
Definition: gramparse.h:64
void parser_init(base_yy_extra_type *yyext)
int base_yyparse(core_yyscan_t yyscanner)
const char * str
static bool next_token(char **lineptr, StringInfo buf, bool *initial_quote, bool *terminating_comma)
Definition: hba.c:185
PGDLLIMPORT const ScanKeywordList ScanKeywords
unsigned int pg_wchar
Definition: mbprint.c:31
void pg_unicode_to_server(pg_wchar c, unsigned char *s)
Definition: mbutils.c:864
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void * palloc(Size size)
Definition: mcxt.c:1317
RawParseMode
Definition: parser.h:38
@ RAW_PARSE_PLPGSQL_EXPR
Definition: parser.h:41
@ RAW_PARSE_PLPGSQL_ASSIGN2
Definition: parser.h:43
@ RAW_PARSE_PLPGSQL_ASSIGN1
Definition: parser.h:42
@ RAW_PARSE_TYPE_NAME
Definition: parser.h:40
@ RAW_PARSE_PLPGSQL_ASSIGN3
Definition: parser.h:44
@ RAW_PARSE_DEFAULT
Definition: parser.h:39
static PgChecksumMode mode
Definition: pg_checksums.c:56
#define NIL
Definition: pg_list.h:68
#define MAX_UNICODE_EQUIVALENT_STRING
Definition: pg_wchar.h:329
static bool is_valid_unicode_codepoint(pg_wchar c)
Definition: pg_wchar.h:519
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
Definition: pg_wchar.h:537
static bool is_utf16_surrogate_first(pg_wchar c)
Definition: pg_wchar.h:525
static bool is_utf16_surrogate_second(pg_wchar c)
Definition: pg_wchar.h:531
static core_yyscan_t yyscanner
Definition: pl_scanner.c:106
char * c
#define YYLTYPE
Definition: scanner.h:44
int scanner_errposition(int location, core_yyscan_t yyscanner)
core_yyscan_t scanner_init(const char *str, core_yy_extra_type *yyext, const ScanKeywordList *keywordlist, const uint16 *keyword_tokens)
void setup_scanner_errposition_callback(ScannerCallbackState *scbstate, core_yyscan_t yyscanner, int location)
void scanner_finish(core_yyscan_t yyscanner)
PGDLLIMPORT const uint16 ScanKeywordTokens[]
void cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
void * core_yyscan_t
Definition: scanner.h:121
int core_yylex(core_YYSTYPE *yylval_param, YYLTYPE *yylloc_param, core_yyscan_t yyscanner)
void scanner_yyerror(const char *message, core_yyscan_t yyscanner) pg_attribute_noreturn()
void truncate_identifier(char *ident, int len, bool warn)
Definition: scansup.c:93
bool scanner_isspace(char ch)
Definition: scansup.c:117
Definition: pg_list.h:54
YYLTYPE lookahead_yylloc
Definition: gramparse.h:48
char * lookahead_end
Definition: gramparse.h:49
core_yy_extra_type core_yy_extra
Definition: gramparse.h:40
char lookahead_hold_char
Definition: gramparse.h:50
core_YYSTYPE lookahead_yylval
Definition: gramparse.h:47
char * scanbuf
Definition: scanner.h:72
char * str
Definition: scanner.h:32