PostgreSQL Source Code  git master
parser.c File Reference
#include "postgres.h"
#include "mb/pg_wchar.h"
#include "parser/gramparse.h"
#include "parser/parser.h"
#include "parser/scansup.h"
Include dependency graph for parser.c:

Go to the source code of this file.

Functions

static bool check_uescapechar (unsigned char escape)
 
static char * str_udeescape (const char *str, char escape, int position, core_yyscan_t yyscanner)
 
List * raw_parser (const char *str, RawParseMode mode)
 
int base_yylex (YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 
static unsigned int hexval (unsigned char c)
 
static void check_unicode_value (pg_wchar c)
 

Function Documentation

◆ base_yylex()

int base_yylex ( YYSTYPE *  lvalp,
YYLTYPE *  llocp,
core_yyscan_t  yyscanner 
)

Definition at line 111 of file parser.c.

112 {
114  int cur_token;
115  int next_token;
116  int cur_token_length;
117  YYLTYPE cur_yylloc;
118 
119  /* Get next token --- we might already have it */
120  if (yyextra->have_lookahead)
121  {
122  cur_token = yyextra->lookahead_token;
123  lvalp->core_yystype = yyextra->lookahead_yylval;
124  *llocp = yyextra->lookahead_yylloc;
125  if (yyextra->lookahead_end)
126  *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127  yyextra->have_lookahead = false;
128  }
129  else
130  cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131 
132  /*
133  * If this token isn't one that requires lookahead, just return it. If it
134  * does, determine the token length. (We could get that via strlen(), but
135  * since we have such a small set of possibilities, hardwiring seems
136  * feasible and more efficient --- at least for the fixed-length cases.)
137  */
138  switch (cur_token)
139  {
140  case NOT:
141  cur_token_length = 3;
142  break;
143  case NULLS_P:
144  cur_token_length = 5;
145  break;
146  case WITH:
147  cur_token_length = 4;
148  break;
149  case UIDENT:
150  case USCONST:
151  cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
152  break;
153  case WITHOUT:
154  cur_token_length = 7;
155  break;
156  default:
157  return cur_token;
158  }
159 
160  /*
161  * Identify end+1 of current token. core_yylex() has temporarily stored a
162  * '\0' here, and will undo that when we call it again. We need to redo
163  * it to fully revert the lookahead call for error reporting purposes.
164  */
165  yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
166  *llocp + cur_token_length;
167  Assert(*(yyextra->lookahead_end) == '\0');
168 
169  /*
170  * Save and restore *llocp around the call. It might look like we could
171  * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
172  * does not work because flex actually holds onto the last-passed pointer
173  * internally, and will use that for error reporting. We need any error
174  * reports to point to the current token, not the next one.
175  */
176  cur_yylloc = *llocp;
177 
178  /* Get next token, saving outputs into lookahead variables */
179  next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
180  yyextra->lookahead_token = next_token;
181  yyextra->lookahead_yylloc = *llocp;
182 
183  *llocp = cur_yylloc;
184 
185  /* Now revert the un-truncation of the current token */
186  yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
187  *(yyextra->lookahead_end) = '\0';
188 
189  yyextra->have_lookahead = true;
190 
191  /* Replace cur_token if needed, based on lookahead */
192  switch (cur_token)
193  {
194  case NOT:
195  /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
196  switch (next_token)
197  {
198  case BETWEEN:
199  case IN_P:
200  case LIKE:
201  case ILIKE:
202  case SIMILAR:
203  cur_token = NOT_LA;
204  break;
205  }
206  break;
207 
208  case NULLS_P:
209  /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
210  switch (next_token)
211  {
212  case FIRST_P:
213  case LAST_P:
214  cur_token = NULLS_LA;
215  break;
216  }
217  break;
218 
219  case WITH:
220  /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
221  switch (next_token)
222  {
223  case TIME:
224  case ORDINALITY:
225  cur_token = WITH_LA;
226  break;
227  case UNIQUE:
228  cur_token = WITH_LA_UNIQUE;
229  break;
230  }
231  break;
232 
233  case WITHOUT:
234  /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
235  switch (next_token)
236  {
237  case TIME:
238  cur_token = WITHOUT_LA;
239  break;
240  }
241  break;
242 
243  case UIDENT:
244  case USCONST:
245  /* Look ahead for UESCAPE */
246  if (next_token == UESCAPE)
247  {
248  /* Yup, so get third token, which had better be SCONST */
249  const char *escstr;
250 
251  /* Again save and restore *llocp */
252  cur_yylloc = *llocp;
253 
254  /* Un-truncate current token so errors point to third token */
255  *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
256 
257  /* Get third token */
258  next_token = core_yylex(&(yyextra->lookahead_yylval),
259  llocp, yyscanner);
260 
261  /* If we throw error here, it will point to third token */
262  if (next_token != SCONST)
263  scanner_yyerror("UESCAPE must be followed by a simple string literal",
264  yyscanner);
265 
266  escstr = yyextra->lookahead_yylval.str;
267  if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
268  scanner_yyerror("invalid Unicode escape character",
269  yyscanner);
270 
271  /* Now restore *llocp; errors will point to first token */
272  *llocp = cur_yylloc;
273 
274  /* Apply Unicode conversion */
275  lvalp->core_yystype.str =
276  str_udeescape(lvalp->core_yystype.str,
277  escstr[0],
278  *llocp,
279  yyscanner);
280 
281  /*
282  * We don't need to revert the un-truncation of UESCAPE. What
283  * we do want to do is clear have_lookahead, thereby consuming
284  * all three tokens.
285  */
286  yyextra->have_lookahead = false;
287  }
288  else
289  {
290  /* No UESCAPE, so convert using default escape character */
291  lvalp->core_yystype.str =
292  str_udeescape(lvalp->core_yystype.str,
293  '\\',
294  *llocp,
295  yyscanner);
296  }
297 
298  if (cur_token == UIDENT)
299  {
300  /* It's an identifier, so truncate as appropriate */
301  truncate_identifier(lvalp->core_yystype.str,
302  strlen(lvalp->core_yystype.str),
303  true);
304  cur_token = IDENT;
305  }
306  else if (cur_token == USCONST)
307  {
308  cur_token = SCONST;
309  }
310  break;
311  }
312 
313  return cur_token;
314 }
static char * str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner)
Definition: parser.c:362
static bool check_uescapechar(unsigned char escape)
Definition: parser.c:342
#define pg_yyget_extra(yyscanner)
Definition: gramparse.h:64
static bool next_token(char **lineptr, char *buf, int bufsz, bool *initial_quote, bool *terminating_comma, int elevel, char **err_msg)
Definition: hba.c:169
Assert(fmt[strlen(fmt) - 1] !='\n')
static core_yyscan_t yyscanner
Definition: pl_scanner.c:106
#define YYLTYPE
Definition: scanner.h:44
int core_yylex(core_YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
void scanner_yyerror(const char *message, core_yyscan_t yyscanner) pg_attribute_noreturn()
void truncate_identifier(char *ident, int len, bool warn)
Definition: scansup.c:93
YYLTYPE lookahead_yylloc
Definition: gramparse.h:48
char * lookahead_end
Definition: gramparse.h:49
core_yy_extra_type core_yy_extra
Definition: gramparse.h:40
char lookahead_hold_char
Definition: gramparse.h:50
core_YYSTYPE lookahead_yylval
Definition: gramparse.h:47
char * scanbuf
Definition: scanner.h:72
char * str
Definition: scanner.h:32

References Assert(), check_uescapechar(), base_yy_extra_type::core_yy_extra, core_yylex(), base_yy_extra_type::have_lookahead, base_yy_extra_type::lookahead_end, base_yy_extra_type::lookahead_hold_char, base_yy_extra_type::lookahead_token, base_yy_extra_type::lookahead_yylloc, base_yy_extra_type::lookahead_yylval, next_token(), pg_yyget_extra, core_yy_extra_type::scanbuf, scanner_yyerror(), core_YYSTYPE::str, str_udeescape(), truncate_identifier(), YYLTYPE, and yyscanner.

Referenced by filtered_base_yylex().

◆ check_uescapechar()

static bool check_uescapechar ( unsigned char  escape)
static

Definition at line 342 of file parser.c.

/*
 * check_uescapechar
 *		Decide whether "escape" may be used as a UESCAPE character.
 *
 * Returns false for hexadecimal digits, '+', single quote, double quote and
 * anything scanner_isspace() accepts; returns true for every other byte.
 */
static bool
check_uescapechar(unsigned char escape)
{
	bool		forbidden;

	/* same tests, in the same short-circuit order, as a single predicate */
	forbidden = isxdigit(escape) ||
		escape == '+' ||
		escape == '\'' ||
		escape == '"' ||
		scanner_isspace(escape);

	return !forbidden;
}
bool scanner_isspace(char ch)
Definition: scansup.c:117

References scanner_isspace().

Referenced by base_yylex().

◆ check_unicode_value()

static void check_unicode_value ( pg_wchar  c)
static

Definition at line 332 of file parser.c.

333 {
335  ereport(ERROR,
336  (errcode(ERRCODE_SYNTAX_ERROR),
337  errmsg("invalid Unicode escape value")));
338 }
int errcode(int sqlerrcode)
Definition: elog.c:693
int errmsg(const char *fmt,...)
Definition: elog.c:904
#define ERROR
Definition: elog.h:33
#define ereport(elevel,...)
Definition: elog.h:143
static bool is_valid_unicode_codepoint(pg_wchar c)
Definition: pg_wchar.h:523
char * c

References ereport, errcode(), errmsg(), ERROR, and is_valid_unicode_codepoint().

Referenced by str_udeescape().

◆ hexval()

static unsigned int hexval ( unsigned char  c)
static

Definition at line 318 of file parser.c.

319 {
320  if (c >= '0' && c <= '9')
321  return c - '0';
322  if (c >= 'a' && c <= 'f')
323  return c - 'a' + 0xA;
324  if (c >= 'A' && c <= 'F')
325  return c - 'A' + 0xA;
326  elog(ERROR, "invalid hexadecimal digit");
327  return 0; /* not reached */
328 }
#define elog(elevel,...)
Definition: elog.h:218

References elog, and ERROR.

Referenced by str_udeescape().

◆ raw_parser()

List* raw_parser ( const char *  str,
RawParseMode  mode 
)

Definition at line 42 of file parser.c.

43 {
45  base_yy_extra_type yyextra;
46  int yyresult;
47 
48  /* initialize the flex scanner */
51 
52  /* base_yylex() only needs us to initialize the lookahead token, if any */
53  if (mode == RAW_PARSE_DEFAULT)
54  yyextra.have_lookahead = false;
55  else
56  {
57  /* this array is indexed by RawParseMode enum */
58  static const int mode_token[] = {
59  0, /* RAW_PARSE_DEFAULT */
60  MODE_TYPE_NAME, /* RAW_PARSE_TYPE_NAME */
61  MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */
62  MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */
63  MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */
64  MODE_PLPGSQL_ASSIGN3 /* RAW_PARSE_PLPGSQL_ASSIGN3 */
65  };
66 
67  yyextra.have_lookahead = true;
68  yyextra.lookahead_token = mode_token[mode];
69  yyextra.lookahead_yylloc = 0;
70  yyextra.lookahead_end = NULL;
71  }
72 
73  /* initialize the bison parser */
74  parser_init(&yyextra);
75 
76  /* Parse! */
77  yyresult = base_yyparse(yyscanner);
78 
79  /* Clean up (release memory) */
81 
82  if (yyresult) /* error */
83  return NIL;
84 
85  return yyextra.parsetree;
86 }
void parser_init(base_yy_extra_type *yyext)
int base_yyparse(core_yyscan_t yyscanner)
PGDLLIMPORT const ScanKeywordList ScanKeywords
@ RAW_PARSE_DEFAULT
Definition: parser.h:39
static PgChecksumMode mode
Definition: pg_checksums.c:65
#define NIL
Definition: pg_list.h:65
core_yyscan_t scanner_init(const char *str, core_yy_extra_type *yyext, const ScanKeywordList *keywordlist, const uint16 *keyword_tokens)
void scanner_finish(core_yyscan_t yyscanner)
PGDLLIMPORT const uint16 ScanKeywordTokens[]
void * core_yyscan_t
Definition: scanner.h:121

References base_yyparse(), base_yy_extra_type::core_yy_extra, base_yy_extra_type::have_lookahead, base_yy_extra_type::lookahead_end, base_yy_extra_type::lookahead_token, base_yy_extra_type::lookahead_yylloc, mode, NIL, parser_init(), base_yy_extra_type::parsetree, RAW_PARSE_DEFAULT, ScanKeywords, ScanKeywordTokens, scanner_finish(), scanner_init(), generate_unaccent_rules::str, and yyscanner.

Referenced by _SPI_prepare_oneshot_plan(), _SPI_prepare_plan(), ATPostAlterTypeParse(), pg_parse_query(), and typeStringToTypeName().

◆ str_udeescape()

static char * str_udeescape ( const char *  str,
char  escape,
int  position,
core_yyscan_t  yyscanner 
)
static

Definition at line 362 of file parser.c.

364 {
365  const char *in;
366  char *new,
367  *out;
368  size_t new_len;
369  pg_wchar pair_first = 0;
370  ScannerCallbackState scbstate;
371 
372  /*
373  * Guesstimate that result will be no longer than input, but allow enough
374  * padding for Unicode conversion.
375  */
376  new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
377  new = palloc(new_len);
378 
379  in = str;
380  out = new;
381  while (*in)
382  {
383  /* Enlarge string if needed */
384  size_t out_dist = out - new;
385 
386  if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
387  {
388  new_len *= 2;
389  new = repalloc(new, new_len);
390  out = new + out_dist;
391  }
392 
393  if (in[0] == escape)
394  {
395  /*
396  * Any errors reported while processing this escape sequence will
397  * have an error cursor pointing at the escape.
398  */
400  in - str + position + 3); /* 3 for U&" */
401  if (in[1] == escape)
402  {
403  if (pair_first)
404  goto invalid_pair;
405  *out++ = escape;
406  in += 2;
407  }
408  else if (isxdigit((unsigned char) in[1]) &&
409  isxdigit((unsigned char) in[2]) &&
410  isxdigit((unsigned char) in[3]) &&
411  isxdigit((unsigned char) in[4]))
412  {
413  pg_wchar unicode;
414 
415  unicode = (hexval(in[1]) << 12) +
416  (hexval(in[2]) << 8) +
417  (hexval(in[3]) << 4) +
418  hexval(in[4]);
419  check_unicode_value(unicode);
420  if (pair_first)
421  {
422  if (is_utf16_surrogate_second(unicode))
423  {
424  unicode = surrogate_pair_to_codepoint(pair_first, unicode);
425  pair_first = 0;
426  }
427  else
428  goto invalid_pair;
429  }
430  else if (is_utf16_surrogate_second(unicode))
431  goto invalid_pair;
432 
433  if (is_utf16_surrogate_first(unicode))
434  pair_first = unicode;
435  else
436  {
437  pg_unicode_to_server(unicode, (unsigned char *) out);
438  out += strlen(out);
439  }
440  in += 5;
441  }
442  else if (in[1] == '+' &&
443  isxdigit((unsigned char) in[2]) &&
444  isxdigit((unsigned char) in[3]) &&
445  isxdigit((unsigned char) in[4]) &&
446  isxdigit((unsigned char) in[5]) &&
447  isxdigit((unsigned char) in[6]) &&
448  isxdigit((unsigned char) in[7]))
449  {
450  pg_wchar unicode;
451 
452  unicode = (hexval(in[2]) << 20) +
453  (hexval(in[3]) << 16) +
454  (hexval(in[4]) << 12) +
455  (hexval(in[5]) << 8) +
456  (hexval(in[6]) << 4) +
457  hexval(in[7]);
458  check_unicode_value(unicode);
459  if (pair_first)
460  {
461  if (is_utf16_surrogate_second(unicode))
462  {
463  unicode = surrogate_pair_to_codepoint(pair_first, unicode);
464  pair_first = 0;
465  }
466  else
467  goto invalid_pair;
468  }
469  else if (is_utf16_surrogate_second(unicode))
470  goto invalid_pair;
471 
472  if (is_utf16_surrogate_first(unicode))
473  pair_first = unicode;
474  else
475  {
476  pg_unicode_to_server(unicode, (unsigned char *) out);
477  out += strlen(out);
478  }
479  in += 8;
480  }
481  else
482  ereport(ERROR,
483  (errcode(ERRCODE_SYNTAX_ERROR),
484  errmsg("invalid Unicode escape"),
485  errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
486 
488  }
489  else
490  {
491  if (pair_first)
492  goto invalid_pair;
493 
494  *out++ = *in++;
495  }
496  }
497 
498  /* unfinished surrogate pair? */
499  if (pair_first)
500  goto invalid_pair;
501 
502  *out = '\0';
503  return new;
504 
505  /*
506  * We might get here with the error callback active, or not. Call
507  * scanner_errposition to make sure an error cursor appears; if the
508  * callback is active, this is duplicative but harmless.
509  */
510 invalid_pair:
511  ereport(ERROR,
512  (errcode(ERRCODE_SYNTAX_ERROR),
513  errmsg("invalid Unicode surrogate pair"),
514  scanner_errposition(in - str + position + 3, /* 3 for U&" */
515  yyscanner)));
516  return NULL; /* keep compiler quiet */
517 }
static void check_unicode_value(pg_wchar c)
Definition: parser.c:332
static unsigned int hexval(unsigned char c)
Definition: parser.c:318
int errhint(const char *fmt,...)
Definition: elog.c:1151
unsigned int pg_wchar
Definition: mbprint.c:31
void pg_unicode_to_server(pg_wchar c, unsigned char *s)
Definition: mbutils.c:864
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1188
void * palloc(Size size)
Definition: mcxt.c:1068
#define MAX_UNICODE_EQUIVALENT_STRING
Definition: pg_wchar.h:327
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
Definition: pg_wchar.h:541
static bool is_utf16_surrogate_first(pg_wchar c)
Definition: pg_wchar.h:529
static bool is_utf16_surrogate_second(pg_wchar c)
Definition: pg_wchar.h:535
int scanner_errposition(int location, core_yyscan_t yyscanner)
void setup_scanner_errposition_callback(ScannerCallbackState *scbstate, core_yyscan_t yyscanner, int location)
void cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)

References cancel_scanner_errposition_callback(), check_unicode_value(), ereport, errcode(), errhint(), errmsg(), ERROR, hexval(), is_utf16_surrogate_first(), is_utf16_surrogate_second(), MAX_UNICODE_EQUIVALENT_STRING, palloc(), pg_unicode_to_server(), repalloc(), scanner_errposition(), setup_scanner_errposition_callback(), generate_unaccent_rules::str, surrogate_pair_to_codepoint(), and yyscanner.

Referenced by base_yylex().