PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
parser.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * parser.c
4 * Main entry point/driver for PostgreSQL grammar
5 *
6 * Note that the grammar is not allowed to perform any table access
7 * (since we need to be able to do basic parsing even while inside an
8 * aborted transaction). Therefore, the data structures returned by
9 * the grammar are "raw" parsetrees that still need to be analyzed by
10 * analyze.c and related files.
11 *
12 *
13 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
14 * Portions Copyright (c) 1994, Regents of the University of California
15 *
16 * IDENTIFICATION
17 * src/backend/parser/parser.c
18 *
19 *-------------------------------------------------------------------------
20 */
21
22#include "postgres.h"
23
24#include "gramparse.h"
25#include "mb/pg_wchar.h"
26#include "parser/parser.h"
27#include "parser/scansup.h"
28
/* Prototypes for static helpers that are defined further down in this file. */
29static bool check_uescapechar(unsigned char escape);
30static char *str_udeescape(const char *str, char escape,
31 int position, core_yyscan_t yyscanner);
32
33
34/*
35 * raw_parser
36 * Given a query in string form, do lexical and grammatical analysis.
37 *
38 * Returns a list of raw (un-analyzed) parse trees. The contents of the
39 * list have the form required by the specified RawParseMode.
40 */
41List *
43{
46 int yyresult;
47
48 /* initialize the flex scanner */
49 yyscanner = scanner_init(str, &yyextra.core_yy_extra,
51
52 /* base_yylex() only needs us to initialize the lookahead token, if any */
54 yyextra.have_lookahead = false;
55 else
56 {
57 /* this array is indexed by RawParseMode enum */
58 static const int mode_token[] = {
60 [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
61 [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
62 [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
63 [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
64 [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
65 };
66
67 yyextra.have_lookahead = true;
68 yyextra.lookahead_token = mode_token[mode];
69 yyextra.lookahead_yylloc = 0;
70 yyextra.lookahead_end = NULL;
71 }
72
73 /* initialize the bison parser */
75
76 /* Parse! */
77 yyresult = base_yyparse(yyscanner);
78
79 /* Clean up (release memory) */
81
82 if (yyresult) /* error */
83 return NIL;
84
85 return yyextra.parsetree;
86}
87
88
89/*
90 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91 *
92 * This filter is needed because in some cases the standard SQL grammar
93 * requires more than one token lookahead. We reduce these cases to one-token
94 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95 *
96 * Using a filter is simpler than trying to recognize multiword tokens
97 * directly in scan.l, because we'd have to allow for comments between the
98 * words. Furthermore it's not clear how to do that without re-introducing
99 * scanner backtrack, which would cost more performance than this filter
100 * layer does.
101 *
102 * We also use this filter to convert UIDENT and USCONST sequences into
103 * plain IDENT and SCONST tokens. While that could be handled by additional
104 * productions in the main grammar, it's more efficient to do it like this.
105 *
106 * The filter also provides a convenient place to translate between
107 * the core_YYSTYPE and YYSTYPE representations (which are really the
108 * same thing anyway, but notationally they're different).
109 */
110int
112{
114 int cur_token;
115 int next_token;
116 int cur_token_length;
117 YYLTYPE cur_yylloc;
118
119 /* Get next token --- we might already have it */
120 if (yyextra->have_lookahead)
121 {
122 cur_token = yyextra->lookahead_token;
123 lvalp->core_yystype = yyextra->lookahead_yylval;
124 *llocp = yyextra->lookahead_yylloc;
125 if (yyextra->lookahead_end)
126 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127 yyextra->have_lookahead = false;
128 }
129 else
130 cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131
132 /*
133 * If this token isn't one that requires lookahead, just return it. If it
134 * does, determine the token length. (We could get that via strlen(), but
135 * since we have such a small set of possibilities, hardwiring seems
136 * feasible and more efficient --- at least for the fixed-length cases.)
137 */
138 switch (cur_token)
139 {
140 case FORMAT:
141 cur_token_length = 6;
142 break;
143 case NOT:
144 cur_token_length = 3;
145 break;
146 case NULLS_P:
147 cur_token_length = 5;
148 break;
149 case WITH:
150 cur_token_length = 4;
151 break;
152 case UIDENT:
153 case USCONST:
154 cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
155 break;
156 case WITHOUT:
157 cur_token_length = 7;
158 break;
159 default:
160 return cur_token;
161 }
162
163 /*
164 * Identify end+1 of current token. core_yylex() has temporarily stored a
165 * '\0' here, and will undo that when we call it again. We need to redo
166 * it to fully revert the lookahead call for error reporting purposes.
167 */
168 yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
169 *llocp + cur_token_length;
170 Assert(*(yyextra->lookahead_end) == '\0');
171
172 /*
173 * Save and restore *llocp around the call. It might look like we could
174 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
175 * does not work because flex actually holds onto the last-passed pointer
176 * internally, and will use that for error reporting. We need any error
177 * reports to point to the current token, not the next one.
178 */
179 cur_yylloc = *llocp;
180
181 /* Get next token, saving outputs into lookahead variables */
182 next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
183 yyextra->lookahead_token = next_token;
184 yyextra->lookahead_yylloc = *llocp;
185
186 *llocp = cur_yylloc;
187
188 /* Now revert the un-truncation of the current token */
189 yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
190 *(yyextra->lookahead_end) = '\0';
191
192 yyextra->have_lookahead = true;
193
194 /* Replace cur_token if needed, based on lookahead */
195 switch (cur_token)
196 {
197 case FORMAT:
198 /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
199 switch (next_token)
200 {
201 case JSON:
202 cur_token = FORMAT_LA;
203 break;
204 }
205 break;
206
207 case NOT:
208 /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
209 switch (next_token)
210 {
211 case BETWEEN:
212 case IN_P:
213 case LIKE:
214 case ILIKE:
215 case SIMILAR:
216 cur_token = NOT_LA;
217 break;
218 }
219 break;
220
221 case NULLS_P:
222 /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
223 switch (next_token)
224 {
225 case FIRST_P:
226 case LAST_P:
227 cur_token = NULLS_LA;
228 break;
229 }
230 break;
231
232 case WITH:
233 /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
234 switch (next_token)
235 {
236 case TIME:
237 case ORDINALITY:
238 cur_token = WITH_LA;
239 break;
240 }
241 break;
242
243 case WITHOUT:
244 /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
245 switch (next_token)
246 {
247 case TIME:
248 cur_token = WITHOUT_LA;
249 break;
250 }
251 break;
252
253 case UIDENT:
254 case USCONST:
255 /* Look ahead for UESCAPE */
256 if (next_token == UESCAPE)
257 {
258 /* Yup, so get third token, which had better be SCONST */
259 const char *escstr;
260
261 /* Again save and restore *llocp */
262 cur_yylloc = *llocp;
263
264 /* Un-truncate current token so errors point to third token */
265 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
266
267 /* Get third token */
268 next_token = core_yylex(&(yyextra->lookahead_yylval),
269 llocp, yyscanner);
270
271 /* If we throw error here, it will point to third token */
272 if (next_token != SCONST)
273 scanner_yyerror("UESCAPE must be followed by a simple string literal",
274 yyscanner);
275
276 escstr = yyextra->lookahead_yylval.str;
277 if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
278 scanner_yyerror("invalid Unicode escape character",
279 yyscanner);
280
281 /* Now restore *llocp; errors will point to first token */
282 *llocp = cur_yylloc;
283
284 /* Apply Unicode conversion */
285 lvalp->core_yystype.str =
286 str_udeescape(lvalp->core_yystype.str,
287 escstr[0],
288 *llocp,
289 yyscanner);
290
291 /*
292 * We don't need to revert the un-truncation of UESCAPE. What
293 * we do want to do is clear have_lookahead, thereby consuming
294 * all three tokens.
295 */
296 yyextra->have_lookahead = false;
297 }
298 else
299 {
300 /* No UESCAPE, so convert using default escape character */
301 lvalp->core_yystype.str =
302 str_udeescape(lvalp->core_yystype.str,
303 '\\',
304 *llocp,
305 yyscanner);
306 }
307
308 if (cur_token == UIDENT)
309 {
310 /* It's an identifier, so truncate as appropriate */
311 truncate_identifier(lvalp->core_yystype.str,
312 strlen(lvalp->core_yystype.str),
313 true);
314 cur_token = IDENT;
315 }
316 else if (cur_token == USCONST)
317 {
318 cur_token = SCONST;
319 }
320 break;
321 }
322
323 return cur_token;
324}
325
326/* convert hex digit (caller should have verified that) to value */
327static unsigned int
328hexval(unsigned char c)
329{
330 if (c >= '0' && c <= '9')
331 return c - '0';
332 if (c >= 'a' && c <= 'f')
333 return c - 'a' + 0xA;
334 if (c >= 'A' && c <= 'F')
335 return c - 'A' + 0xA;
336 elog(ERROR, "invalid hexadecimal digit");
337 return 0; /* not reached */
338}
339
340/* is Unicode code point acceptable? */
341static void
343{
346 (errcode(ERRCODE_SYNTAX_ERROR),
347 errmsg("invalid Unicode escape value")));
348}
349
/*
 * Decide whether 'escape' may serve as the Unicode escape character
 * (UESCAPE syntax).
 *
 * Returns false for hexadecimal digits, '+', single quote, double quote and
 * anything scanner_isspace() classifies as whitespace; true otherwise.
 */
static bool
check_uescapechar(unsigned char escape)
{
	/* A hex digit could not be told apart from the escape's own digits */
	if (isxdigit(escape))
		return false;

	/* '+' introduces the six-digit form; the quotes are rejected too */
	if (escape == '+' || escape == '\'' || escape == '"')
		return false;

	return !scanner_isspace(escape);
}
363
364/*
365 * Process Unicode escapes in "str", producing a palloc'd plain string
366 *
367 * escape: the escape character to use
368 * position: start position of U&'' or U&"" string token
369 * yyscanner: context information needed for error reports
370 */
371static char *
372str_udeescape(const char *str, char escape,
373 int position, core_yyscan_t yyscanner)
374{
375 const char *in;
376 char *new,
377 *out;
378 size_t new_len;
379 pg_wchar pair_first = 0;
380 ScannerCallbackState scbstate;
381
382 /*
383 * Guesstimate that result will be no longer than input, but allow enough
384 * padding for Unicode conversion.
385 */
386 new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
387 new = palloc(new_len);
388
389 in = str;
390 out = new;
391 while (*in)
392 {
393 /* Enlarge string if needed */
394 size_t out_dist = out - new;
395
396 if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
397 {
398 new_len *= 2;
399 new = repalloc(new, new_len);
400 out = new + out_dist;
401 }
402
403 if (in[0] == escape)
404 {
405 /*
406 * Any errors reported while processing this escape sequence will
407 * have an error cursor pointing at the escape.
408 */
410 in - str + position + 3); /* 3 for U&" */
411 if (in[1] == escape)
412 {
413 if (pair_first)
414 goto invalid_pair;
415 *out++ = escape;
416 in += 2;
417 }
418 else if (isxdigit((unsigned char) in[1]) &&
419 isxdigit((unsigned char) in[2]) &&
420 isxdigit((unsigned char) in[3]) &&
421 isxdigit((unsigned char) in[4]))
422 {
423 pg_wchar unicode;
424
425 unicode = (hexval(in[1]) << 12) +
426 (hexval(in[2]) << 8) +
427 (hexval(in[3]) << 4) +
428 hexval(in[4]);
429 check_unicode_value(unicode);
430 if (pair_first)
431 {
432 if (is_utf16_surrogate_second(unicode))
433 {
434 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
435 pair_first = 0;
436 }
437 else
438 goto invalid_pair;
439 }
440 else if (is_utf16_surrogate_second(unicode))
441 goto invalid_pair;
442
443 if (is_utf16_surrogate_first(unicode))
444 pair_first = unicode;
445 else
446 {
447 pg_unicode_to_server(unicode, (unsigned char *) out);
448 out += strlen(out);
449 }
450 in += 5;
451 }
452 else if (in[1] == '+' &&
453 isxdigit((unsigned char) in[2]) &&
454 isxdigit((unsigned char) in[3]) &&
455 isxdigit((unsigned char) in[4]) &&
456 isxdigit((unsigned char) in[5]) &&
457 isxdigit((unsigned char) in[6]) &&
458 isxdigit((unsigned char) in[7]))
459 {
460 pg_wchar unicode;
461
462 unicode = (hexval(in[2]) << 20) +
463 (hexval(in[3]) << 16) +
464 (hexval(in[4]) << 12) +
465 (hexval(in[5]) << 8) +
466 (hexval(in[6]) << 4) +
467 hexval(in[7]);
468 check_unicode_value(unicode);
469 if (pair_first)
470 {
471 if (is_utf16_surrogate_second(unicode))
472 {
473 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
474 pair_first = 0;
475 }
476 else
477 goto invalid_pair;
478 }
479 else if (is_utf16_surrogate_second(unicode))
480 goto invalid_pair;
481
482 if (is_utf16_surrogate_first(unicode))
483 pair_first = unicode;
484 else
485 {
486 pg_unicode_to_server(unicode, (unsigned char *) out);
487 out += strlen(out);
488 }
489 in += 8;
490 }
491 else
493 (errcode(ERRCODE_SYNTAX_ERROR),
494 errmsg("invalid Unicode escape"),
495 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
496
498 }
499 else
500 {
501 if (pair_first)
502 goto invalid_pair;
503
504 *out++ = *in++;
505 }
506 }
507
508 /* unfinished surrogate pair? */
509 if (pair_first)
510 goto invalid_pair;
511
512 *out = '\0';
513 return new;
514
515 /*
516 * We might get here with the error callback active, or not. Call
517 * scanner_errposition to make sure an error cursor appears; if the
518 * callback is active, this is duplicative but harmless.
519 */
520invalid_pair:
522 (errcode(ERRCODE_SYNTAX_ERROR),
523 errmsg("invalid Unicode surrogate pair"),
524 scanner_errposition(in - str + position + 3, /* 3 for U&" */
525 yyscanner)));
526 return NULL; /* keep compiler quiet */
527}
static void check_unicode_value(pg_wchar c)
Definition: parser.c:342
List * raw_parser(const char *str, RawParseMode mode)
Definition: parser.c:42
static unsigned int hexval(unsigned char c)
Definition: parser.c:328
static char * str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner)
Definition: parser.c:372
int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
Definition: parser.c:111
static bool check_uescapechar(unsigned char escape)
Definition: parser.c:352
#define Assert(condition)
Definition: c.h:812
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
#define pg_yyget_extra(yyscanner)
Definition: gramparse.h:64
void parser_init(base_yy_extra_type *yyext)
int base_yyparse(core_yyscan_t yyscanner)
const char * str
static bool next_token(char **lineptr, StringInfo buf, bool *initial_quote, bool *terminating_comma)
Definition: hba.c:185
PGDLLIMPORT const ScanKeywordList ScanKeywords
unsigned int pg_wchar
Definition: mbprint.c:31
void pg_unicode_to_server(pg_wchar c, unsigned char *s)
Definition: mbutils.c:864
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void * palloc(Size size)
Definition: mcxt.c:1317
RawParseMode
Definition: parser.h:38
@ RAW_PARSE_PLPGSQL_EXPR
Definition: parser.h:41
@ RAW_PARSE_PLPGSQL_ASSIGN2
Definition: parser.h:43
@ RAW_PARSE_PLPGSQL_ASSIGN1
Definition: parser.h:42
@ RAW_PARSE_TYPE_NAME
Definition: parser.h:40
@ RAW_PARSE_PLPGSQL_ASSIGN3
Definition: parser.h:44
@ RAW_PARSE_DEFAULT
Definition: parser.h:39
static PgChecksumMode mode
Definition: pg_checksums.c:55
#define NIL
Definition: pg_list.h:68
#define MAX_UNICODE_EQUIVALENT_STRING
Definition: pg_wchar.h:329
static bool is_valid_unicode_codepoint(pg_wchar c)
Definition: pg_wchar.h:519
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
Definition: pg_wchar.h:537
static bool is_utf16_surrogate_first(pg_wchar c)
Definition: pg_wchar.h:525
static bool is_utf16_surrogate_second(pg_wchar c)
Definition: pg_wchar.h:531
static core_yyscan_t yyscanner
Definition: pl_scanner.c:106
char * c
const char * YYLTYPE
int YYSTYPE
Definition: psqlscanslash.l:39
int scanner_errposition(int location, core_yyscan_t yyscanner)
Definition: scan.l:1154
core_yyscan_t scanner_init(const char *str, core_yy_extra_type *yyext, const ScanKeywordList *keywordlist, const uint16 *keyword_tokens)
Definition: scan.l:1263
void setup_scanner_errposition_callback(ScannerCallbackState *scbstate, core_yyscan_t yyscanner, int location)
Definition: scan.l:1200
void scanner_finish(core_yyscan_t yyscanner)
Definition: scan.l:1305
void cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
Definition: scan.l:1217
#define yyextra
Definition: scan.l:1132
const uint16 ScanKeywordTokens[]
Definition: scan.l:81
void scanner_yyerror(const char *message, core_yyscan_t yyscanner)
Definition: scan.l:1236
void * core_yyscan_t
Definition: scanner.h:121
int core_yylex(core_YYSTYPE *yylval_param, YYLTYPE *yylloc_param, core_yyscan_t yyscanner)
void truncate_identifier(char *ident, int len, bool warn)
Definition: scansup.c:93
bool scanner_isspace(char ch)
Definition: scansup.c:117
Definition: pg_list.h:54