PostgreSQL Source Code  git master
parser.c File Reference
#include "postgres.h"
#include "mb/pg_wchar.h"
#include "parser/gramparse.h"
#include "parser/parser.h"
#include "parser/scansup.h"
Include dependency graph for parser.c:

Go to the source code of this file.

Functions

static bool check_uescapechar (unsigned char escape)
 
static char * str_udeescape (const char *str, char escape, int position, core_yyscan_t yyscanner)
 
List * raw_parser (const char *str)
 
int base_yylex (YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 
static unsigned int hexval (unsigned char c)
 
static void check_unicode_value (pg_wchar c, int pos, core_yyscan_t yyscanner)
 

Function Documentation

◆ base_yylex()

int base_yylex ( YYSTYPE *  lvalp,
YYLTYPE *  llocp,
core_yyscan_t  yyscanner 
)

Definition at line 93 of file parser.c.

References Assert, check_uescapechar(), base_yy_extra_type::core_yy_extra, core_yylex(), base_yy_extra_type::have_lookahead, base_yy_extra_type::lookahead_end, base_yy_extra_type::lookahead_hold_char, base_yy_extra_type::lookahead_token, base_yy_extra_type::lookahead_yylloc, base_yy_extra_type::lookahead_yylval, next_token(), pg_yyget_extra, core_yy_extra_type::scanbuf, scanner_yyerror(), core_YYSTYPE::str, str_udeescape(), truncate_identifier(), YYLTYPE, and yyscanner.

Referenced by filtered_base_yylex().

94 {
95  base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
96  int cur_token;
97  int next_token;
98  int cur_token_length;
99  YYLTYPE cur_yylloc;
100 
101  /* Get next token --- we might already have it */
102  if (yyextra->have_lookahead)
103  {
104  cur_token = yyextra->lookahead_token;
105  lvalp->core_yystype = yyextra->lookahead_yylval;
106  *llocp = yyextra->lookahead_yylloc;
107  *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
108  yyextra->have_lookahead = false;
109  }
110  else
111  cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
112 
113  /*
114  * If this token isn't one that requires lookahead, just return it. If it
115  * does, determine the token length. (We could get that via strlen(), but
116  * since we have such a small set of possibilities, hardwiring seems
117  * feasible and more efficient --- at least for the fixed-length cases.)
118  */
119  switch (cur_token)
120  {
121  case NOT:
122  cur_token_length = 3;
123  break;
124  case NULLS_P:
125  cur_token_length = 5;
126  break;
127  case WITH:
128  cur_token_length = 4;
129  break;
130  case UIDENT:
131  case USCONST:
132  cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
133  break;
134  default:
135  return cur_token;
136  }
137 
138  /*
139  * Identify end+1 of current token. core_yylex() has temporarily stored a
140  * '\0' here, and will undo that when we call it again. We need to redo
141  * it to fully revert the lookahead call for error reporting purposes.
142  */
143  yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
144  *llocp + cur_token_length;
145  Assert(*(yyextra->lookahead_end) == '\0');
146 
147  /*
148  * Save and restore *llocp around the call. It might look like we could
149  * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
150  * does not work because flex actually holds onto the last-passed pointer
151  * internally, and will use that for error reporting. We need any error
152  * reports to point to the current token, not the next one.
153  */
154  cur_yylloc = *llocp;
155 
156  /* Get next token, saving outputs into lookahead variables */
157  next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
158  yyextra->lookahead_token = next_token;
159  yyextra->lookahead_yylloc = *llocp;
160 
161  *llocp = cur_yylloc;
162 
163  /* Now revert the un-truncation of the current token */
164  yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
165  *(yyextra->lookahead_end) = '\0';
166 
167  yyextra->have_lookahead = true;
168 
169  /* Replace cur_token if needed, based on lookahead */
170  switch (cur_token)
171  {
172  case NOT:
173  /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
174  switch (next_token)
175  {
176  case BETWEEN:
177  case IN_P:
178  case LIKE:
179  case ILIKE:
180  case SIMILAR:
181  cur_token = NOT_LA;
182  break;
183  }
184  break;
185 
186  case NULLS_P:
187  /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
188  switch (next_token)
189  {
190  case FIRST_P:
191  case LAST_P:
192  cur_token = NULLS_LA;
193  break;
194  }
195  break;
196 
197  case WITH:
198  /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
199  switch (next_token)
200  {
201  case TIME:
202  case ORDINALITY:
203  cur_token = WITH_LA;
204  break;
205  }
206  break;
207 
208  case UIDENT:
209  case USCONST:
210  /* Look ahead for UESCAPE */
211  if (next_token == UESCAPE)
212  {
213  /* Yup, so get third token, which had better be SCONST */
214  const char *escstr;
215 
216  /* Again save and restore *llocp */
217  cur_yylloc = *llocp;
218 
219  /* Un-truncate current token so errors point to third token */
220  *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
221 
222  /* Get third token */
223  next_token = core_yylex(&(yyextra->lookahead_yylval),
224  llocp, yyscanner);
225 
226  /* If we throw error here, it will point to third token */
227  if (next_token != SCONST)
228  scanner_yyerror("UESCAPE must be followed by a simple string literal",
229  yyscanner);
230 
231  escstr = yyextra->lookahead_yylval.str;
232  if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
233  scanner_yyerror("invalid Unicode escape character",
234  yyscanner);
235 
236  /* Now restore *llocp; errors will point to first token */
237  *llocp = cur_yylloc;
238 
239  /* Apply Unicode conversion */
240  lvalp->core_yystype.str =
241  str_udeescape(lvalp->core_yystype.str,
242  escstr[0],
243  *llocp,
244  yyscanner);
245 
246  /*
247  * We don't need to revert the un-truncation of UESCAPE. What
248  * we do want to do is clear have_lookahead, thereby consuming
249  * all three tokens.
250  */
251  yyextra->have_lookahead = false;
252  }
253  else
254  {
255  /* No UESCAPE, so convert using default escape character */
256  lvalp->core_yystype.str =
257  str_udeescape(lvalp->core_yystype.str,
258  '\\',
259  *llocp,
260  yyscanner);
261  }
262 
263  if (cur_token == UIDENT)
264  {
265  /* It's an identifier, so truncate as appropriate */
266  truncate_identifier(lvalp->core_yystype.str,
267  strlen(lvalp->core_yystype.str),
268  true);
269  cur_token = IDENT;
270  }
271  else if (cur_token == USCONST)
272  {
273  cur_token = SCONST;
274  }
275  break;
276  }
277 
278  return cur_token;
279 }
char * lookahead_end
Definition: gramparse.h:49
core_YYSTYPE lookahead_yylval
Definition: gramparse.h:47
core_yy_extra_type core_yy_extra
Definition: gramparse.h:40
static bool next_token(char **lineptr, char *buf, int bufsz, bool *initial_quote, bool *terminating_comma, int elevel, char **err_msg)
Definition: hba.c:195
char * scanbuf
Definition: scanner.h:72
static bool check_uescapechar(unsigned char escape)
Definition: parser.c:315
void scanner_yyerror(const char *message, core_yyscan_t yyscanner) pg_attribute_noreturn()
void truncate_identifier(char *ident, int len, bool warn)
Definition: scansup.c:186
static char * str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner)
Definition: parser.c:335
#define YYLTYPE
Definition: scanner.h:44
int core_yylex(core_YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
#define pg_yyget_extra(yyscanner)
Definition: gramparse.h:64
char * str
Definition: scanner.h:32
#define Assert(condition)
Definition: c.h:738
YYLTYPE lookahead_yylloc
Definition: gramparse.h:48
char lookahead_hold_char
Definition: gramparse.h:50
static core_yyscan_t yyscanner
Definition: pl_scanner.c:106

◆ check_uescapechar()

static bool check_uescapechar ( unsigned char  escape)
static

Definition at line 315 of file parser.c.

References scanner_isspace().

Referenced by base_yylex().

316 {
317  if (isxdigit(escape)
318  || escape == '+'
319  || escape == '\''
320  || escape == '"'
321  || scanner_isspace(escape))
322  return false;
323  else
324  return true;
325 }
bool scanner_isspace(char ch)
Definition: scansup.c:220

◆ check_unicode_value()

static void check_unicode_value ( pg_wchar  c,
int  pos,
core_yyscan_t  yyscanner 
)
static

Definition at line 297 of file parser.c.

References ereport, errcode(), errmsg(), ERROR, GetDatabaseEncoding(), PG_UTF8, and scanner_errposition().

Referenced by str_udeescape().

298 {
299  /* See also addunicode() in scan.l */
300  if (c == 0 || c > 0x10FFFF)
301  ereport(ERROR,
302  (errcode(ERRCODE_SYNTAX_ERROR),
303  errmsg("invalid Unicode escape value"),
304  scanner_errposition(pos, yyscanner)));
305 
306  if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8)
307  ereport(ERROR,
308  (errcode(ERRCODE_SYNTAX_ERROR),
309  errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"),
310  scanner_errposition(pos, yyscanner)));
311 }
int errcode(int sqlerrcode)
Definition: elog.c:608
#define ERROR
Definition: elog.h:43
char * c
#define ereport(elevel, rest)
Definition: elog.h:141
int scanner_errposition(int location, core_yyscan_t yyscanner)
int GetDatabaseEncoding(void)
Definition: mbutils.c:1046
static core_yyscan_t yyscanner
Definition: pl_scanner.c:106
int errmsg(const char *fmt,...)
Definition: elog.c:822

◆ hexval()

static unsigned int hexval ( unsigned char  c)
static

Definition at line 283 of file parser.c.

References elog, and ERROR.

Referenced by str_udeescape().

284 {
285  if (c >= '0' && c <= '9')
286  return c - '0';
287  if (c >= 'a' && c <= 'f')
288  return c - 'a' + 0xA;
289  if (c >= 'A' && c <= 'F')
290  return c - 'A' + 0xA;
291  elog(ERROR, "invalid hexadecimal digit");
292  return 0; /* not reached */
293 }
#define ERROR
Definition: elog.h:43
char * c
#define elog(elevel,...)
Definition: elog.h:228

◆ raw_parser()

List* raw_parser ( const char *  str)

Definition at line 42 of file parser.c.

References base_yyparse(), base_yy_extra_type::core_yy_extra, base_yy_extra_type::have_lookahead, NIL, parser_init(), base_yy_extra_type::parsetree, ScanKeywords, ScanKeywordTokens, scanner_finish(), scanner_init(), and yyscanner.

Referenced by ATPostAlterTypeParse(), pg_parse_query(), and typeStringToTypeName().

43 {
44  core_yyscan_t yyscanner;
45  base_yy_extra_type yyextra;
46  int yyresult;
47 
48  /* initialize the flex scanner */
49  yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50  &ScanKeywords, ScanKeywordTokens);
51 
52  /* base_yylex() only needs this much initialization */
53  yyextra.have_lookahead = false;
54 
55  /* initialize the bison parser */
56  parser_init(&yyextra);
57 
58  /* Parse! */
59  yyresult = base_yyparse(yyscanner);
60 
61  /* Clean up (release memory) */
62  scanner_finish(yyscanner);
63 
64  if (yyresult) /* error */
65  return NIL;
66 
67  return yyextra.parsetree;
68 }
#define NIL
Definition: pg_list.h:65
int base_yyparse(core_yyscan_t yyscanner)
void * core_yyscan_t
Definition: scanner.h:117
PGDLLIMPORT const uint16 ScanKeywordTokens[]
core_yy_extra_type core_yy_extra
Definition: gramparse.h:40
PGDLLIMPORT const ScanKeywordList ScanKeywords
void parser_init(base_yy_extra_type *yyext)
core_yyscan_t scanner_init(const char *str, core_yy_extra_type *yyext, const ScanKeywordList *keywordlist, const uint16 *keyword_tokens)
static core_yyscan_t yyscanner
Definition: pl_scanner.c:106
void scanner_finish(core_yyscan_t yyscanner)

◆ str_udeescape()

static char * str_udeescape ( const char *  str,
char  escape,
int  position,
core_yyscan_t  yyscanner 
)
static

Definition at line 335 of file parser.c.

References check_unicode_value(), ereport, errcode(), errmsg(), ERROR, hexval(), is_utf16_surrogate_first(), is_utf16_surrogate_second(), palloc(), pg_mblen(), pg_verifymbstr(), scanner_errposition(), generate_unaccent_rules::str, surrogate_pair_to_codepoint(), and unicode_to_utf8().

Referenced by base_yylex().

337 {
338  const char *in;
339  char *new,
340  *out;
341  pg_wchar pair_first = 0;
342 
343  /*
344  * This relies on the subtle assumption that a UTF-8 expansion cannot be
345  * longer than its escaped representation.
346  */
347  new = palloc(strlen(str) + 1);
348 
349  in = str;
350  out = new;
351  while (*in)
352  {
353  if (in[0] == escape)
354  {
355  if (in[1] == escape)
356  {
357  if (pair_first)
358  goto invalid_pair;
359  *out++ = escape;
360  in += 2;
361  }
362  else if (isxdigit((unsigned char) in[1]) &&
363  isxdigit((unsigned char) in[2]) &&
364  isxdigit((unsigned char) in[3]) &&
365  isxdigit((unsigned char) in[4]))
366  {
367  pg_wchar unicode;
368 
369  unicode = (hexval(in[1]) << 12) +
370  (hexval(in[2]) << 8) +
371  (hexval(in[3]) << 4) +
372  hexval(in[4]);
373  check_unicode_value(unicode,
374  in - str + position + 3, /* 3 for U&" */
375  yyscanner);
376  if (pair_first)
377  {
378  if (is_utf16_surrogate_second(unicode))
379  {
380  unicode = surrogate_pair_to_codepoint(pair_first, unicode);
381  pair_first = 0;
382  }
383  else
384  goto invalid_pair;
385  }
386  else if (is_utf16_surrogate_second(unicode))
387  goto invalid_pair;
388 
389  if (is_utf16_surrogate_first(unicode))
390  pair_first = unicode;
391  else
392  {
393  unicode_to_utf8(unicode, (unsigned char *) out);
394  out += pg_mblen(out);
395  }
396  in += 5;
397  }
398  else if (in[1] == '+' &&
399  isxdigit((unsigned char) in[2]) &&
400  isxdigit((unsigned char) in[3]) &&
401  isxdigit((unsigned char) in[4]) &&
402  isxdigit((unsigned char) in[5]) &&
403  isxdigit((unsigned char) in[6]) &&
404  isxdigit((unsigned char) in[7]))
405  {
406  pg_wchar unicode;
407 
408  unicode = (hexval(in[2]) << 20) +
409  (hexval(in[3]) << 16) +
410  (hexval(in[4]) << 12) +
411  (hexval(in[5]) << 8) +
412  (hexval(in[6]) << 4) +
413  hexval(in[7]);
414  check_unicode_value(unicode,
415  in - str + position + 3, /* 3 for U&" */
416  yyscanner);
417  if (pair_first)
418  {
419  if (is_utf16_surrogate_second(unicode))
420  {
421  unicode = surrogate_pair_to_codepoint(pair_first, unicode);
422  pair_first = 0;
423  }
424  else
425  goto invalid_pair;
426  }
427  else if (is_utf16_surrogate_second(unicode))
428  goto invalid_pair;
429 
430  if (is_utf16_surrogate_first(unicode))
431  pair_first = unicode;
432  else
433  {
434  unicode_to_utf8(unicode, (unsigned char *) out);
435  out += pg_mblen(out);
436  }
437  in += 8;
438  }
439  else
440  ereport(ERROR,
441  (errcode(ERRCODE_SYNTAX_ERROR),
442  errmsg("invalid Unicode escape value"),
443  scanner_errposition(in - str + position + 3, /* 3 for U&" */
444  yyscanner)));
445  }
446  else
447  {
448  if (pair_first)
449  goto invalid_pair;
450 
451  *out++ = *in++;
452  }
453  }
454 
455  /* unfinished surrogate pair? */
456  if (pair_first)
457  goto invalid_pair;
458 
459  *out = '\0';
460 
461  /*
462  * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
463  * codes; but it's probably not worth the trouble, since this isn't likely
464  * to be a performance-critical path.
465  */
466  pg_verifymbstr(new, out - new, false);
467  return new;
468 
469 invalid_pair:
470  ereport(ERROR,
471  (errcode(ERRCODE_SYNTAX_ERROR),
472  errmsg("invalid Unicode surrogate pair"),
473  scanner_errposition(in - str + position + 3, /* 3 for U&" */
474  yyscanner)));
475  return NULL; /* keep compiler quiet */
476 }
static unsigned int hexval(unsigned char c)
Definition: parser.c:283
unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: wchar.c:483
static bool is_utf16_surrogate_second(pg_wchar c)
Definition: pg_wchar.h:515
int errcode(int sqlerrcode)
Definition: elog.c:608
#define ERROR
Definition: elog.h:43
#define ereport(elevel, rest)
Definition: elog.h:141
unsigned int pg_wchar
Definition: mbprint.c:31
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
Definition: pg_wchar.h:521
int scanner_errposition(int location, core_yyscan_t yyscanner)
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1341
int pg_mblen(const char *mbstr)
Definition: mbutils.c:802
static core_yyscan_t yyscanner
Definition: pl_scanner.c:106
void * palloc(Size size)
Definition: mcxt.c:949
int errmsg(const char *fmt,...)
Definition: elog.c:822
static bool is_utf16_surrogate_first(pg_wchar c)
Definition: pg_wchar.h:509
static void check_unicode_value(pg_wchar c, int pos, core_yyscan_t yyscanner)
Definition: parser.c:297