PostgreSQL Source Code git master
parser.c
Go to the documentation of this file.
/*-------------------------------------------------------------------------
 *
 * parser.c
 *		Main entry point/driver for PostgreSQL grammar
 *
 * This should match src/backend/parser/parser.c, except that we do not
 * need to bother with re-entrant interfaces.
 *
 * Note: ECPG doesn't report error location like the backend does.
 * This file will need work if we ever want it to.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/interfaces/ecpg/preproc/parser.c
 *
 *-------------------------------------------------------------------------
 */
21
22#include "postgres_fe.h"
23
24#include "preproc_extern.h"
25#include "preproc.h"
26
27
28static bool have_lookahead; /* is lookahead info valid? */
29static int lookahead_token; /* one-token lookahead */
30static YYSTYPE lookahead_yylval; /* yylval for lookahead token */
31static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
32static char *lookahead_yytext; /* start current token */
33
34static int base_yylex_location(void);
35static bool check_uescapechar(unsigned char escape);
36static bool ecpg_isspace(char ch);
37
38
39/*
40 * Intermediate filter between parser and base lexer (base_yylex in scan.l).
41 *
42 * This filter is needed because in some cases the standard SQL grammar
43 * requires more than one token lookahead. We reduce these cases to one-token
44 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
45 *
46 * Using a filter is simpler than trying to recognize multiword tokens
47 * directly in scan.l, because we'd have to allow for comments between the
48 * words. Furthermore it's not clear how to do that without re-introducing
49 * scanner backtrack, which would cost more performance than this filter
50 * layer does.
51 *
52 * We also use this filter to convert UIDENT and USCONST sequences into
53 * plain IDENT and SCONST tokens. While that could be handled by additional
54 * productions in the main grammar, it's more efficient to do it like this.
55 */
56int
58{
59 int cur_token;
60 int next_token;
61 YYSTYPE cur_yylval;
62 YYLTYPE cur_yylloc;
63 char *cur_yytext;
64
65 /* Get next token --- we might already have it */
67 {
68 cur_token = lookahead_token;
70 base_yylloc = lookahead_yylloc;
72 have_lookahead = false;
73 }
74 else
75 cur_token = base_yylex_location();
76
77 /*
78 * If this token isn't one that requires lookahead, just return it.
79 */
80 switch (cur_token)
81 {
82 case FORMAT:
83 case NOT:
84 case NULLS_P:
85 case WITH:
86 case WITHOUT:
87 case UIDENT:
88 case USCONST:
89 break;
90 default:
91 return cur_token;
92 }
93
94 /* Save and restore lexer output variables around the call */
95 cur_yylval = base_yylval;
96 cur_yylloc = base_yylloc;
97 cur_yytext = base_yytext;
98
99 /* Get next token, saving outputs into lookahead variables */
101
104 lookahead_yylloc = base_yylloc;
106
107 base_yylval = cur_yylval;
108 base_yylloc = cur_yylloc;
109 base_yytext = cur_yytext;
110
111 have_lookahead = true;
112
113 /* Replace cur_token if needed, based on lookahead */
114 switch (cur_token)
115 {
116 case FORMAT:
117 /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
118 switch (next_token)
119 {
120 case JSON:
121 cur_token = FORMAT_LA;
122 break;
123 }
124 break;
125
126 case NOT:
127 /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
128 switch (next_token)
129 {
130 case BETWEEN:
131 case IN_P:
132 case LIKE:
133 case ILIKE:
134 case SIMILAR:
135 cur_token = NOT_LA;
136 break;
137 }
138 break;
139
140 case NULLS_P:
141 /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
142 switch (next_token)
143 {
144 case FIRST_P:
145 case LAST_P:
146 cur_token = NULLS_LA;
147 break;
148 }
149 break;
150
151 case WITH:
152 /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
153 switch (next_token)
154 {
155 case TIME:
156 case ORDINALITY:
157 cur_token = WITH_LA;
158 break;
159 }
160 break;
161
162 case WITHOUT:
163 /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
164 switch (next_token)
165 {
166 case TIME:
167 cur_token = WITHOUT_LA;
168 break;
169 }
170 break;
171 case UIDENT:
172 case USCONST:
173 /* Look ahead for UESCAPE */
174 if (next_token == UESCAPE)
175 {
176 /* Yup, so get third token, which had better be SCONST */
177 const char *escstr;
178
179 /*
180 * Again save and restore lexer output variables around the
181 * call
182 */
183 cur_yylval = base_yylval;
184 cur_yylloc = base_yylloc;
185 cur_yytext = base_yytext;
186
187 /* Get third token */
189
190 if (next_token != SCONST)
191 mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
192
193 /*
194 * Save and check escape string, which the scanner returns
195 * with quotes
196 */
197 escstr = base_yylval.str;
198 if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
199 mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
200
201 base_yylval = cur_yylval;
202 base_yylloc = cur_yylloc;
203 base_yytext = cur_yytext;
204
205 /* Combine 3 tokens into 1 */
207 " UESCAPE ",
208 escstr);
209 base_yylloc = loc_strdup(base_yylval.str);
210
211 /* Clear have_lookahead, thereby consuming all three tokens */
212 have_lookahead = false;
213 }
214
215 if (cur_token == UIDENT)
216 cur_token = IDENT;
217 else if (cur_token == USCONST)
218 cur_token = SCONST;
219 break;
220 }
221
222 return cur_token;
223}
224
225/*
226 * Call base_yylex() and fill in base_yylloc.
227 *
228 * pgc.l does not worry about setting yylloc, and given what we want for
229 * that, trying to set it there would be pretty inconvenient. What we
230 * want is: if the returned token has type <str>, then duplicate its
231 * string value as yylloc; otherwise, make a downcased copy of yytext.
232 * The downcasing is ASCII-only because all that we care about there
233 * is producing uniformly-cased output of keywords. (That's mostly
234 * cosmetic, but there are places in ecpglib that expect to receive
235 * downcased keywords, plus it keeps us regression-test-compatible
236 * with the pre-v18 implementation of ecpg.)
237 */
238static int
240{
241 int token = base_yylex();
242
243 switch (token)
244 {
245 /* List a token here if pgc.l assigns to base_yylval.str for it */
246 case Op:
247 case CSTRING:
248 case CPP_LINE:
249 case CVARIABLE:
250 case BCONST:
251 case SCONST:
252 case USCONST:
253 case XCONST:
254 case FCONST:
255 case IDENT:
256 case UIDENT:
257 case IP:
258 /* Duplicate the <str> value */
259 base_yylloc = loc_strdup(base_yylval.str);
260 break;
261 default:
262 /* Else just use the input, i.e., yytext */
263 base_yylloc = loc_strdup(base_yytext);
264 /* Apply an ASCII-only downcasing */
265 for (unsigned char *ptr = (unsigned char *) base_yylloc; *ptr; ptr++)
266 {
267 if (*ptr >= 'A' && *ptr <= 'Z')
268 *ptr += 'a' - 'A';
269 }
270 break;
271 }
272 return token;
273}
274
275/*
276 * check_uescapechar() and ecpg_isspace() should match their equivalents
277 * in pgc.l.
278 */
279
/*
 * Is 'escape' acceptable as Unicode escape character (UESCAPE syntax)?
 *
 * Returns false for a hexadecimal digit, '+', a single or double quote,
 * or any character ecpg_isspace() reports as whitespace; returns true for
 * everything else.
 */
static bool
check_uescapechar(unsigned char escape)
{
	return !(isxdigit(escape)
			 || escape == '+'
			 || escape == '\''
			 || escape == '"'
			 || ecpg_isspace(escape));
}
293
/*
 * ecpg_isspace() --- return true if flex scanner considers char whitespace
 *
 * The accepted set is space, tab, newline, carriage return and form feed.
 */
static bool
ecpg_isspace(char ch)
{
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
			return true;
		default:
			return false;
	}
}
int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
Definition: parser.c:111
static uint8 IP[64]
Definition: crypt-des.c:74
static bool next_token(char **lineptr, StringInfo buf, bool *initial_quote, bool *terminating_comma)
Definition: hba.c:185
#define token
Definition: indent_globs.h:126
static YYLTYPE lookahead_yylloc
Definition: parser.c:31
static int base_yylex_location(void)
Definition: parser.c:239
static bool have_lookahead
Definition: parser.c:28
static int lookahead_token
Definition: parser.c:29
static YYSTYPE lookahead_yylval
Definition: parser.c:30
static bool ecpg_isspace(char ch)
Definition: parser.c:298
static char * lookahead_yytext
Definition: parser.c:32
int filtered_base_yylex(void)
Definition: parser.c:57
static bool check_uescapechar(unsigned char escape)
Definition: parser.c:282
YYSTYPE base_yylval
const char * YYLTYPE
void mmerror(int error_code, enum errortype type, const char *error,...) pg_attribute_printf(3
#define PARSE_ERROR
char * make3_str(const char *str1, const char *str2, const char *str3)
Definition: util.c:256
char * base_yytext
char * loc_strdup(const char *string)
Definition: util.c:170
int YYSTYPE
Definition: psqlscanslash.l:39
@ ET_ERROR
Definition: type.h:220