PostgreSQL Source Code  git master
pl_scanner.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * pl_scanner.c
4  * lexical scanning for PL/pgSQL
5  *
6  *
7  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  *
11  * IDENTIFICATION
12  * src/pl/plpgsql/src/pl_scanner.c
13  *
14  *-------------------------------------------------------------------------
15  */
16 #include "postgres.h"
17 
18 #include "mb/pg_wchar.h"
19 #include "parser/scanner.h"
20 
21 #include "plpgsql.h"
22 #include "pl_gram.h" /* must be after parser/scanner.h */
23 
24 
25 /* Klugy flag to tell scanner how to look up identifiers */
27 
/*
 * A word about keywords:
 *
 * We keep reserved and unreserved keywords in separate headers.  Be careful
 * not to put the same word in both headers.  Also be sure that pl_gram.y's
 * unreserved_keyword production agrees with the unreserved header.  The
 * reserved keywords are passed to the core scanner, so they will be
 * recognized before (and instead of) any variable name.  Unreserved words
 * are checked for separately, usually after determining that the identifier
 * isn't a known variable name.  If plpgsql_IdentifierLookup is DECLARE then
 * no variable names will be recognized, so the unreserved words always work.
 * (Note in particular that this helps us avoid reserving keywords that are
 * only needed in DECLARE sections.)
 *
 * In certain contexts it is desirable to prefer recognizing an unreserved
 * keyword over recognizing a variable name.  In particular, at the start
 * of a statement we should prefer unreserved keywords unless the statement
 * looks like an assignment (i.e., first token is followed by ':=' or '[').
 * This rule allows most statement-introducing keywords to be kept unreserved.
 * (We still have to reserve initial keywords that might follow a block
 * label, unfortunately, since the method used to determine if we are at
 * start of statement doesn't recognize such cases.  We'd also have to
 * reserve any keyword that could legitimately be followed by ':=' or '['.)
 * Some additional cases are handled in pl_gram.y using tok_is_keyword().
 *
 * We try to avoid reserving more keywords than we have to; but there's
 * little point in not reserving a word if it's reserved in the core grammar.
 * Currently, the following words are reserved here but not in the core:
 * BEGIN BY DECLARE EXECUTE FOREACH IF LOOP STRICT WHILE
 */
58 
59 /* ScanKeywordList lookup data for PL/pgSQL keywords */
60 #include "pl_reserved_kwlist_d.h"
61 #include "pl_unreserved_kwlist_d.h"
62 
63 /* Token codes for PL/pgSQL keywords */
64 #define PG_KEYWORD(kwname, value) value,
65 
66 static const uint16 ReservedPLKeywordTokens[] = {
67 #include "pl_reserved_kwlist.h"
68 };
69 
71 #include "pl_unreserved_kwlist.h"
72 };
73 
74 #undef PG_KEYWORD
75 
/*
 * This macro must recognize all tokens that can immediately precede a
 * PL/pgSQL executable statement (that is, proc_sect or proc_stmt in the
 * grammar).  Fortunately, there are not very many, so hard-coding in this
 * fashion seems sufficient.
 *
 * Note that the argument is evaluated more than once, so it must not have
 * side effects.
 */
#define AT_STMT_START(prev_token) \
    ((prev_token) == ';' || \
     (prev_token) == K_BEGIN || \
     (prev_token) == K_THEN || \
     (prev_token) == K_ELSE || \
     (prev_token) == K_LOOP)
88 
89 
90 /* Auxiliary data about a token (other than the token type) */
91 typedef struct
92 {
93  YYSTYPE lval; /* semantic information */
94  YYLTYPE lloc; /* offset in scanbuf */
95  int leng; /* length in bytes */
96 } TokenAuxData;
97 
98 /*
99  * Scanner working state. At some point we might wish to fold all this
100  * into a YY_EXTRA struct. For the moment, there is no need for plpgsql's
101  * lexer to be re-entrant, and the notational burden of passing a yyscanner
102  * pointer around is great enough to not want to do it without need.
103  */
104 
105 /* The stuff the core lexer needs */
106 static core_yyscan_t yyscanner = NULL;
108 
109 /* The original input string */
110 static const char *scanorig;
111 
112 /* Current token's length (corresponds to plpgsql_yylval and plpgsql_yylloc) */
113 static int plpgsql_yyleng;
114 
115 /* Current token's code (corresponds to plpgsql_yylval and plpgsql_yylloc) */
116 static int plpgsql_yytoken;
117 
118 /* Token pushback stack */
119 #define MAX_PUSHBACKS 4
120 
121 static int num_pushbacks;
124 
125 /* State for plpgsql_location_to_lineno() */
126 static const char *cur_line_start;
127 static const char *cur_line_end;
128 static int cur_line_num;
129 
130 /* Internal functions */
131 static int internal_yylex(TokenAuxData *auxdata);
132 static void push_back_token(int token, TokenAuxData *auxdata);
133 static void location_lineno_init(void);
134 
135 
136 /*
137  * This is the yylex routine called from the PL/pgSQL grammar.
138  * It is a wrapper around the core lexer, with the ability to recognize
139  * PL/pgSQL variables and return them as special T_DATUM tokens. If a
140  * word or compound word does not match any variable name, or if matching
141  * is turned off by plpgsql_IdentifierLookup, it is returned as
142  * T_WORD or T_CWORD respectively, or as an unreserved keyword if it
143  * matches one of those.
144  */
145 int
147 {
148  int tok1;
149  TokenAuxData aux1;
150  int kwnum;
151 
152  tok1 = internal_yylex(&aux1);
153  if (tok1 == IDENT || tok1 == PARAM)
154  {
155  int tok2;
156  TokenAuxData aux2;
157 
158  tok2 = internal_yylex(&aux2);
159  if (tok2 == '.')
160  {
161  int tok3;
162  TokenAuxData aux3;
163 
164  tok3 = internal_yylex(&aux3);
165  if (tok3 == IDENT)
166  {
167  int tok4;
168  TokenAuxData aux4;
169 
170  tok4 = internal_yylex(&aux4);
171  if (tok4 == '.')
172  {
173  int tok5;
174  TokenAuxData aux5;
175 
176  tok5 = internal_yylex(&aux5);
177  if (tok5 == IDENT)
178  {
179  if (plpgsql_parse_tripword(aux1.lval.str,
180  aux3.lval.str,
181  aux5.lval.str,
182  &aux1.lval.wdatum,
183  &aux1.lval.cword))
184  tok1 = T_DATUM;
185  else
186  tok1 = T_CWORD;
187  }
188  else
189  {
190  /* not A.B.C, so just process A.B */
191  push_back_token(tok5, &aux5);
192  push_back_token(tok4, &aux4);
193  if (plpgsql_parse_dblword(aux1.lval.str,
194  aux3.lval.str,
195  &aux1.lval.wdatum,
196  &aux1.lval.cword))
197  tok1 = T_DATUM;
198  else
199  tok1 = T_CWORD;
200  }
201  }
202  else
203  {
204  /* not A.B.C, so just process A.B */
205  push_back_token(tok4, &aux4);
206  if (plpgsql_parse_dblword(aux1.lval.str,
207  aux3.lval.str,
208  &aux1.lval.wdatum,
209  &aux1.lval.cword))
210  tok1 = T_DATUM;
211  else
212  tok1 = T_CWORD;
213  }
214  }
215  else
216  {
217  /* not A.B, so just process A */
218  push_back_token(tok3, &aux3);
219  push_back_token(tok2, &aux2);
220  if (plpgsql_parse_word(aux1.lval.str,
221  core_yy.scanbuf + aux1.lloc,
222  true,
223  &aux1.lval.wdatum,
224  &aux1.lval.word))
225  tok1 = T_DATUM;
226  else if (!aux1.lval.word.quoted &&
227  (kwnum = ScanKeywordLookup(aux1.lval.word.ident,
228  &UnreservedPLKeywords)) >= 0)
229  {
230  aux1.lval.keyword = GetScanKeyword(kwnum,
231  &UnreservedPLKeywords);
232  tok1 = UnreservedPLKeywordTokens[kwnum];
233  }
234  else
235  tok1 = T_WORD;
236  }
237  }
238  else
239  {
240  /* not A.B, so just process A */
241  push_back_token(tok2, &aux2);
242 
243  /*
244  * See if it matches a variable name, except in the context where
245  * we are at start of statement and the next token isn't
246  * assignment or '['. In that case, it couldn't validly be a
247  * variable name, and skipping the lookup allows variable names to
248  * be used that would conflict with plpgsql or core keywords that
249  * introduce statements (e.g., "comment"). Without this special
250  * logic, every statement-introducing keyword would effectively be
251  * reserved in PL/pgSQL, which would be unpleasant.
252  *
253  * If it isn't a variable name, try to match against unreserved
254  * plpgsql keywords. If not one of those either, it's T_WORD.
255  *
256  * Note: we must call plpgsql_parse_word even if we don't want to
257  * do variable lookup, because it sets up aux1.lval.word for the
258  * non-variable cases.
259  */
260  if (plpgsql_parse_word(aux1.lval.str,
261  core_yy.scanbuf + aux1.lloc,
263  (tok2 == '=' || tok2 == COLON_EQUALS ||
264  tok2 == '[')),
265  &aux1.lval.wdatum,
266  &aux1.lval.word))
267  tok1 = T_DATUM;
268  else if (!aux1.lval.word.quoted &&
269  (kwnum = ScanKeywordLookup(aux1.lval.word.ident,
270  &UnreservedPLKeywords)) >= 0)
271  {
272  aux1.lval.keyword = GetScanKeyword(kwnum,
273  &UnreservedPLKeywords);
274  tok1 = UnreservedPLKeywordTokens[kwnum];
275  }
276  else
277  tok1 = T_WORD;
278  }
279  }
280  else
281  {
282  /*
283  * Not a potential plpgsql variable name, just return the data.
284  *
285  * Note that we also come through here if the grammar pushed back a
286  * T_DATUM, T_CWORD, T_WORD, or unreserved-keyword token returned by a
287  * previous lookup cycle; thus, pushbacks do not incur extra lookup
288  * work, since we'll never do the above code twice for the same token.
289  * This property also makes it safe to rely on the old value of
290  * plpgsql_yytoken in the is-this-start-of-statement test above.
291  */
292  }
293 
294  plpgsql_yylval = aux1.lval;
295  plpgsql_yylloc = aux1.lloc;
296  plpgsql_yyleng = aux1.leng;
297  plpgsql_yytoken = tok1;
298  return tok1;
299 }
300 
301 /*
302  * Internal yylex function. This wraps the core lexer and adds one feature:
303  * a token pushback stack. We also make a couple of trivial single-token
304  * translations from what the core lexer does to what we want, in particular
305  * interfacing from the core_YYSTYPE to YYSTYPE union.
306  */
307 static int
309 {
310  int token;
311  const char *yytext;
312 
313  if (num_pushbacks > 0)
314  {
315  num_pushbacks--;
317  *auxdata = pushback_auxdata[num_pushbacks];
318  }
319  else
320  {
321  token = core_yylex(&auxdata->lval.core_yystype,
322  &auxdata->lloc,
323  yyscanner);
324 
325  /* remember the length of yytext before it gets changed */
326  yytext = core_yy.scanbuf + auxdata->lloc;
327  auxdata->leng = strlen(yytext);
328 
329  /* Check for << >> and #, which the core considers operators */
330  if (token == Op)
331  {
332  if (strcmp(auxdata->lval.str, "<<") == 0)
333  token = LESS_LESS;
334  else if (strcmp(auxdata->lval.str, ">>") == 0)
335  token = GREATER_GREATER;
336  else if (strcmp(auxdata->lval.str, "#") == 0)
337  token = '#';
338  }
339 
340  /* The core returns PARAM as ival, but we treat it like IDENT */
341  else if (token == PARAM)
342  {
343  auxdata->lval.str = pstrdup(yytext);
344  }
345  }
346 
347  return token;
348 }
349 
350 /*
351  * Push back a token to be re-read by next internal_yylex() call.
352  */
353 static void
355 {
357  elog(ERROR, "too many tokens pushed back");
359  pushback_auxdata[num_pushbacks] = *auxdata;
360  num_pushbacks++;
361 }
362 
363 /*
364  * Push back a single token to be re-read by next plpgsql_yylex() call.
365  *
366  * NOTE: this does not cause yylval or yylloc to "back up". Also, it
367  * is not a good idea to push back a token code other than what you read.
368  */
369 void
371 {
372  TokenAuxData auxdata;
373 
374  auxdata.lval = plpgsql_yylval;
375  auxdata.lloc = plpgsql_yylloc;
376  auxdata.leng = plpgsql_yyleng;
377  push_back_token(token, &auxdata);
378 }
379 
380 /*
381  * Tell whether a token is an unreserved keyword.
382  *
383  * (If it is, its lowercased form was returned as the token value, so we
384  * do not need to offer that data here.)
385  */
386 bool
388 {
389  int i;
390 
391  for (i = 0; i < lengthof(UnreservedPLKeywordTokens); i++)
392  {
394  return true;
395  }
396  return false;
397 }
398 
399 /*
400  * Append the function text starting at startlocation and extending to
401  * (not including) endlocation onto the existing contents of "buf".
402  */
403 void
405  int startlocation, int endlocation)
406 {
407  Assert(startlocation <= endlocation);
408  appendBinaryStringInfo(buf, scanorig + startlocation,
409  endlocation - startlocation);
410 }
411 
412 /*
413  * Peek one token ahead in the input stream. Only the token code is
414  * made available, not any of the auxiliary info such as location.
415  *
416  * NB: no variable or unreserved keyword lookup is performed here, they will
417  * be returned as IDENT. Reserved keywords are resolved as usual.
418  */
419 int
421 {
422  int tok1;
423  TokenAuxData aux1;
424 
425  tok1 = internal_yylex(&aux1);
426  push_back_token(tok1, &aux1);
427  return tok1;
428 }
429 
430 /*
431  * Peek two tokens ahead in the input stream. The first token and its
432  * location in the query are returned in *tok1_p and *tok1_loc, second token
433  * and its location in *tok2_p and *tok2_loc.
434  *
435  * NB: no variable or unreserved keyword lookup is performed here, they will
436  * be returned as IDENT. Reserved keywords are resolved as usual.
437  */
438 void
439 plpgsql_peek2(int *tok1_p, int *tok2_p, int *tok1_loc, int *tok2_loc)
440 {
441  int tok1,
442  tok2;
443  TokenAuxData aux1,
444  aux2;
445 
446  tok1 = internal_yylex(&aux1);
447  tok2 = internal_yylex(&aux2);
448 
449  *tok1_p = tok1;
450  if (tok1_loc)
451  *tok1_loc = aux1.lloc;
452  *tok2_p = tok2;
453  if (tok2_loc)
454  *tok2_loc = aux2.lloc;
455 
456  push_back_token(tok2, &aux2);
457  push_back_token(tok1, &aux1);
458 }
459 
460 /*
461  * plpgsql_scanner_errposition
462  * Report an error cursor position, if possible.
463  *
464  * This is expected to be used within an ereport() call. The return value
465  * is a dummy (always 0, in fact).
466  *
467  * Note that this can only be used for messages emitted during initial
468  * parsing of a plpgsql function, since it requires the scanorig string
469  * to still be available.
470  */
471 int
473 {
474  int pos;
475 
476  if (location < 0 || scanorig == NULL)
477  return 0; /* no-op if location is unknown */
478 
479  /* Convert byte offset to character number */
480  pos = pg_mbstrlen_with_len(scanorig, location) + 1;
481  /* And pass it to the ereport mechanism */
482  (void) internalerrposition(pos);
483  /* Also pass the function body string */
484  return internalerrquery(scanorig);
485 }
486 
487 /*
488  * plpgsql_yyerror
489  * Report a lexer or grammar error.
490  *
491  * The message's cursor position refers to the current token (the one
492  * last returned by plpgsql_yylex()).
493  * This is OK for syntax error messages from the Bison parser, because Bison
494  * parsers report error as soon as the first unparsable token is reached.
495  * Beware of using yyerror for other purposes, as the cursor position might
496  * be misleading!
497  */
498 void
499 plpgsql_yyerror(const char *message)
500 {
501  char *yytext = core_yy.scanbuf + plpgsql_yylloc;
502 
503  if (*yytext == '\0')
504  {
505  ereport(ERROR,
506  (errcode(ERRCODE_SYNTAX_ERROR),
507  /* translator: %s is typically the translation of "syntax error" */
508  errmsg("%s at end of input", _(message)),
509  plpgsql_scanner_errposition(plpgsql_yylloc)));
510  }
511  else
512  {
513  /*
514  * If we have done any lookahead then flex will have restored the
515  * character after the end-of-token. Zap it again so that we report
516  * only the single token here. This modifies scanbuf but we no longer
517  * care about that.
518  */
519  yytext[plpgsql_yyleng] = '\0';
520 
521  ereport(ERROR,
522  (errcode(ERRCODE_SYNTAX_ERROR),
523  /* translator: first %s is typically the translation of "syntax error" */
524  errmsg("%s at or near \"%s\"", _(message), yytext),
525  plpgsql_scanner_errposition(plpgsql_yylloc)));
526  }
527 }
528 
529 /*
530  * Given a location (a byte offset in the function source text),
531  * return a line number.
532  *
533  * We expect that this is typically called for a sequence of increasing
534  * location values, so optimize accordingly by tracking the endpoints
535  * of the "current" line.
536  */
537 int
539 {
540  const char *loc;
541 
542  if (location < 0 || scanorig == NULL)
543  return 0; /* garbage in, garbage out */
544  loc = scanorig + location;
545 
546  /* be correct, but not fast, if input location goes backwards */
547  if (loc < cur_line_start)
549 
550  while (cur_line_end != NULL && loc > cur_line_end)
551  {
553  cur_line_num++;
554  cur_line_end = strchr(cur_line_start, '\n');
555  }
556 
557  return cur_line_num;
558 }
559 
560 /* initialize or reset the state for plpgsql_location_to_lineno */
561 static void
563 {
565  cur_line_num = 1;
566 
567  cur_line_end = strchr(cur_line_start, '\n');
568 }
569 
570 /* return the most recently computed lineno */
571 int
573 {
574  return cur_line_num;
575 }
576 
577 
578 /*
579  * Called before any actual parsing is done
580  *
581  * Note: the passed "str" must remain valid until plpgsql_scanner_finish().
582  * Although it is not fed directly to flex, we need the original string
583  * to cite in error messages.
584  */
585 void
587 {
588  /* Start up the core scanner */
590  &ReservedPLKeywords, ReservedPLKeywordTokens);
591 
592  /*
593  * scanorig points to the original string, which unlike the scanner's
594  * scanbuf won't be modified on-the-fly by flex. Notice that although
595  * yytext points into scanbuf, we rely on being able to apply locations
596  * (offsets from string start) to scanorig as well.
597  */
598  scanorig = str;
599 
600  /* Other setup */
602  plpgsql_yytoken = 0;
603 
604  num_pushbacks = 0;
605 
607 }
608 
609 /*
610  * Called after parsing is done to clean up after plpgsql_scanner_init()
611  */
612 void
614 {
615  /* release storage */
617  /* avoid leaving any dangling pointers */
618  yyscanner = NULL;
619  scanorig = NULL;
620 }
unsigned short uint16
Definition: c.h:494
#define lengthof(array)
Definition: c.h:777
int internalerrquery(const char *query)
Definition: elog.c:1481
int internalerrposition(int cursorpos)
Definition: elog.c:1461
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define _(x)
Definition: elog.c:91
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
#define token
Definition: indent_globs.h:126
int i
Definition: isn.c:73
int ScanKeywordLookup(const char *str, const ScanKeywordList *keywords)
Definition: kwlookup.c:38
static const char * GetScanKeyword(int n, const ScanKeywordList *keywords)
Definition: kwlookup.h:39
Assert(fmt[strlen(fmt) - 1] !='\n')
int pg_mbstrlen_with_len(const char *mbstr, int limit)
Definition: mbutils.c:1058
char * pstrdup(const char *in)
Definition: mcxt.c:1644
static char * buf
Definition: pg_test_fsync.c:67
bool plpgsql_parse_dblword(char *word1, char *word2, PLwdatum *wdatum, PLcword *cword)
Definition: pl_comp.c:1441
bool plpgsql_parse_word(char *word1, const char *yytxt, bool lookup, PLwdatum *wdatum, PLword *word)
Definition: pl_comp.c:1386
bool plpgsql_parse_tripword(char *word1, char *word2, char *word3, PLwdatum *wdatum, PLcword *cword)
Definition: pl_comp.c:1522
void plpgsql_scanner_finish(void)
Definition: pl_scanner.c:613
static TokenAuxData pushback_auxdata[MAX_PUSHBACKS]
Definition: pl_scanner.c:123
int plpgsql_scanner_errposition(int location)
Definition: pl_scanner.c:472
#define AT_STMT_START(prev_token)
Definition: pl_scanner.c:82
static void push_back_token(int token, TokenAuxData *auxdata)
Definition: pl_scanner.c:354
static const char * scanorig
Definition: pl_scanner.c:110
static core_yyscan_t yyscanner
Definition: pl_scanner.c:106
static int num_pushbacks
Definition: pl_scanner.c:121
static int plpgsql_yytoken
Definition: pl_scanner.c:116
void plpgsql_yyerror(const char *message)
Definition: pl_scanner.c:499
static int plpgsql_yyleng
Definition: pl_scanner.c:113
int plpgsql_location_to_lineno(int location)
Definition: pl_scanner.c:538
static int cur_line_num
Definition: pl_scanner.c:128
static const uint16 UnreservedPLKeywordTokens[]
Definition: pl_scanner.c:70
IdentifierLookup plpgsql_IdentifierLookup
Definition: pl_scanner.c:26
bool plpgsql_token_is_unreserved_keyword(int token)
Definition: pl_scanner.c:387
void plpgsql_append_source_text(StringInfo buf, int startlocation, int endlocation)
Definition: pl_scanner.c:404
static const uint16 ReservedPLKeywordTokens[]
Definition: pl_scanner.c:66
static int pushback_token[MAX_PUSHBACKS]
Definition: pl_scanner.c:122
void plpgsql_peek2(int *tok1_p, int *tok2_p, int *tok1_loc, int *tok2_loc)
Definition: pl_scanner.c:439
int plpgsql_latest_lineno(void)
Definition: pl_scanner.c:572
int plpgsql_yylex(void)
Definition: pl_scanner.c:146
static int internal_yylex(TokenAuxData *auxdata)
Definition: pl_scanner.c:308
static const char * cur_line_end
Definition: pl_scanner.c:127
static core_yy_extra_type core_yy
Definition: pl_scanner.c:107
int plpgsql_peek(void)
Definition: pl_scanner.c:420
static void location_lineno_init(void)
Definition: pl_scanner.c:562
void plpgsql_scanner_init(const char *str)
Definition: pl_scanner.c:586
void plpgsql_push_back_token(int token)
Definition: pl_scanner.c:370
static const char * cur_line_start
Definition: pl_scanner.c:126
#define MAX_PUSHBACKS
Definition: pl_scanner.c:119
IdentifierLookup
Definition: plpgsql.h:1188
@ IDENTIFIER_LOOKUP_NORMAL
Definition: plpgsql.h:1189
#define YYLTYPE
Definition: scanner.h:44
core_yyscan_t scanner_init(const char *str, core_yy_extra_type *yyext, const ScanKeywordList *keywordlist, const uint16 *keyword_tokens)
void scanner_finish(core_yyscan_t yyscanner)
void * core_yyscan_t
Definition: scanner.h:121
int core_yylex(core_YYSTYPE *yylval_param, YYLTYPE *yylloc_param, core_yyscan_t yyscanner)
void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)
Definition: stringinfo.c:227
YYLTYPE lloc
Definition: pl_scanner.c:94
YYSTYPE lval
Definition: pl_scanner.c:93
char * scanbuf
Definition: scanner.h:72