PostgreSQL Source Code  git master
parser.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * parser.c
4  * Main entry point/driver for PostgreSQL grammar
5  *
6  * This should match src/backend/parser/parser.c, except that we do not
7  * need to bother with re-entrant interfaces.
8  *
9  * Note: ECPG doesn't report error location like the backend does.
10  * This file will need work if we ever want it to.
11  *
12  *
13  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
14  * Portions Copyright (c) 1994, Regents of the University of California
15  *
16  * IDENTIFICATION
17  * src/interfaces/ecpg/preproc/parser.c
18  *
19  *-------------------------------------------------------------------------
20  */
21 
22 #include "postgres_fe.h"
23 
24 #include "preproc_extern.h"
25 #include "preproc.h"
26 
27 
28 static bool have_lookahead; /* is lookahead info valid? */
29 static int lookahead_token; /* one-token lookahead */
30 static YYSTYPE lookahead_yylval; /* yylval for lookahead token */
31 static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
32 static char *lookahead_yytext; /* start current token */
33 
34 static bool check_uescapechar(unsigned char escape);
35 static bool ecpg_isspace(char ch);
36 
37 
38 /*
39  * Intermediate filter between parser and base lexer (base_yylex in scan.l).
40  *
41  * This filter is needed because in some cases the standard SQL grammar
42  * requires more than one token lookahead. We reduce these cases to one-token
43  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
44  *
45  * Using a filter is simpler than trying to recognize multiword tokens
46  * directly in scan.l, because we'd have to allow for comments between the
47  * words. Furthermore it's not clear how to do that without re-introducing
48  * scanner backtrack, which would cost more performance than this filter
49  * layer does.
50  *
51  * We also use this filter to convert UIDENT and USCONST sequences into
52  * plain IDENT and SCONST tokens. While that could be handled by additional
53  * productions in the main grammar, it's more efficient to do it like this.
54  */
55 int
57 {
58  int cur_token;
59  int next_token;
60  YYSTYPE cur_yylval;
61  YYLTYPE cur_yylloc;
62  char *cur_yytext;
63 
64  /* Get next token --- we might already have it */
65  if (have_lookahead)
66  {
67  cur_token = lookahead_token;
68  base_yylval = lookahead_yylval;
69  base_yylloc = lookahead_yylloc;
71  have_lookahead = false;
72  }
73  else
74  cur_token = base_yylex();
75 
76  /*
77  * If this token isn't one that requires lookahead, just return it.
78  */
79  switch (cur_token)
80  {
81  case FORMAT:
82  case NOT:
83  case NULLS_P:
84  case WITH:
85  case WITHOUT:
86  case UIDENT:
87  case USCONST:
88  break;
89  default:
90  return cur_token;
91  }
92 
93  /* Save and restore lexer output variables around the call */
94  cur_yylval = base_yylval;
95  cur_yylloc = base_yylloc;
96  cur_yytext = base_yytext;
97 
98  /* Get next token, saving outputs into lookahead variables */
100 
102  lookahead_yylval = base_yylval;
103  lookahead_yylloc = base_yylloc;
105 
106  base_yylval = cur_yylval;
107  base_yylloc = cur_yylloc;
108  base_yytext = cur_yytext;
109 
110  have_lookahead = true;
111 
112  /* Replace cur_token if needed, based on lookahead */
113  switch (cur_token)
114  {
115  case FORMAT:
116  /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
117  switch (next_token)
118  {
119  case JSON:
120  cur_token = FORMAT_LA;
121  break;
122  }
123  break;
124 
125  case NOT:
126  /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
127  switch (next_token)
128  {
129  case BETWEEN:
130  case IN_P:
131  case LIKE:
132  case ILIKE:
133  case SIMILAR:
134  cur_token = NOT_LA;
135  break;
136  }
137  break;
138 
139  case NULLS_P:
140  /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
141  switch (next_token)
142  {
143  case FIRST_P:
144  case LAST_P:
145  cur_token = NULLS_LA;
146  break;
147  }
148  break;
149 
150  case WITH:
151  /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
152  switch (next_token)
153  {
154  case TIME:
155  case ORDINALITY:
156  cur_token = WITH_LA;
157  break;
158  }
159  break;
160 
161  case WITHOUT:
162  /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
163  switch (next_token)
164  {
165  case TIME:
166  cur_token = WITHOUT_LA;
167  break;
168  }
169  break;
170  case UIDENT:
171  case USCONST:
172  /* Look ahead for UESCAPE */
173  if (next_token == UESCAPE)
174  {
175  /* Yup, so get third token, which had better be SCONST */
176  const char *escstr;
177 
178  /*
179  * Again save and restore lexer output variables around the
180  * call
181  */
182  cur_yylval = base_yylval;
183  cur_yylloc = base_yylloc;
184  cur_yytext = base_yytext;
185 
186  /* Get third token */
188 
189  if (next_token != SCONST)
190  mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
191 
192  /*
193  * Save and check escape string, which the scanner returns
194  * with quotes
195  */
196  escstr = base_yylval.str;
197  if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
198  mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
199 
200  base_yylval = cur_yylval;
201  base_yylloc = cur_yylloc;
202  base_yytext = cur_yytext;
203 
204  /* Combine 3 tokens into 1 */
205  base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr);
206 
207  /* Clear have_lookahead, thereby consuming all three tokens */
208  have_lookahead = false;
209  }
210 
211  if (cur_token == UIDENT)
212  cur_token = IDENT;
213  else if (cur_token == USCONST)
214  cur_token = SCONST;
215  break;
216  }
217 
218  return cur_token;
219 }
220 
/*
 * check_uescapechar() and ecpg_isspace() should match their equivalents
 * in pgc.l.
 */

/*
 * Is 'escape' acceptable as a Unicode escape character (UESCAPE syntax)?
 *
 * Rejects hexadecimal digits, '+', single and double quotes, and any
 * character ecpg_isspace() reports as whitespace.  Every other character
 * is accepted.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape)
		|| escape == '+'
		|| escape == '\''
		|| escape == '"'
		|| ecpg_isspace(escape))
		return false;

	return true;
}
239 
/*
 * ecpg_isspace() --- return true if flex scanner considers char whitespace
 *
 * The accepted set is space, tab, newline, carriage return and form feed.
 */
static bool
ecpg_isspace(char ch)
{
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
			return true;
		default:
			return false;
	}
}
int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
Definition: parser.c:111
static bool next_token(char **lineptr, StringInfo buf, bool *initial_quote, bool *terminating_comma)
Definition: hba.c:185
static YYLTYPE lookahead_yylloc
Definition: parser.c:31
static bool have_lookahead
Definition: parser.c:28
static int lookahead_token
Definition: parser.c:29
static YYSTYPE lookahead_yylval
Definition: parser.c:30
static bool ecpg_isspace(char ch)
Definition: parser.c:244
static char * lookahead_yytext
Definition: parser.c:32
int filtered_base_yylex(void)
Definition: parser.c:56
static bool check_uescapechar(unsigned char escape)
Definition: parser.c:228
void mmerror(int error_code, enum errortype type, const char *error,...) pg_attribute_printf(3
#define PARSE_ERROR
char * base_yytext
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
#define YYLTYPE
Definition: scanner.h:44
@ ET_ERROR
Definition: type.h:219