backend_2parser_2parser_8c_source.html

/*-------------------------------------------------------------------------

 *

 * parser.c

 *      Main entry point/driver for PostgreSQL grammar

 *

 * Note that the grammar is not allowed to perform any table access

 * (since we need to be able to do basic parsing even while inside an

 * aborted transaction).  Therefore, the data structures returned by

 * the grammar are "raw" parsetrees that still need to be analyzed by

 * analyze.c and related files.

 *

 *

 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group

 * Portions Copyright (c) 1994, Regents of the University of California

 *

 * IDENTIFICATION

 *    src/backend/parser/parser.c

 *

 *-------------------------------------------------------------------------

 */


#include "postgres.h"


#include "gramparse.h"

#include "mb/pg_wchar.h"

#include "parser/parser.h"

#include "parser/scansup.h"


/*
 * Forward declarations of file-local helpers that base_yylex() calls before
 * their definitions appear further down in this file.
 */
static bool check_uescapechar(unsigned char escape);
static char *str_udeescape(const char *str, char escape,
                           int position, core_yyscan_t yyscanner);


/*
 * raw_parser
 *      Run the lexer and the grammar over a query string.
 *
 * str:  the query text handed to scanner_init()
 * mode: selects what the grammar should parse; any mode other than
 *       RAW_PARSE_DEFAULT is communicated to the grammar by preloading a
 *       mode-specific token into the lookahead slot, which base_yylex()
 *       hands out before reading anything from the scanner.
 *
 * Returns the list of raw (un-analyzed) parse trees left in
 * yyextra.parsetree by the grammar, in the form required by the given
 * RawParseMode, or NIL if base_yyparse() reports failure.
 */
List *
raw_parser(const char *str, RawParseMode mode)
{
    base_yy_extra_type yyextra;
    core_yyscan_t yyscanner;
    int         parse_status;

    /* Set up the flex scanner over the input string. */
    yyscanner = scanner_init(str, &yyextra.core_yy_extra,
                             &ScanKeywords, ScanKeywordTokens);

    /*
     * The only base_yylex() state we have to initialize here is the
     * lookahead slot.
     */
    if (mode != RAW_PARSE_DEFAULT)
    {
        /* Lookup table from RawParseMode value to its mode token. */
        static const int mode_token[] = {
            [RAW_PARSE_DEFAULT] = 0,
            [RAW_PARSE_TYPE_NAME] = MODE_TYPE_NAME,
            [RAW_PARSE_PLPGSQL_EXPR] = MODE_PLPGSQL_EXPR,
            [RAW_PARSE_PLPGSQL_ASSIGN1] = MODE_PLPGSQL_ASSIGN1,
            [RAW_PARSE_PLPGSQL_ASSIGN2] = MODE_PLPGSQL_ASSIGN2,
            [RAW_PARSE_PLPGSQL_ASSIGN3] = MODE_PLPGSQL_ASSIGN3,
        };

        /* Queue the mode token at location 0, with nothing to un-truncate. */
        yyextra.lookahead_token = mode_token[mode];
        yyextra.lookahead_yylloc = 0;
        yyextra.lookahead_end = NULL;
        yyextra.have_lookahead = true;
    }
    else
        yyextra.have_lookahead = false;

    /* Set up the bison parser, then run it. */
    parser_init(&yyextra);
    parse_status = base_yyparse(yyscanner);

    /* Release the scanner's memory whether or not parsing succeeded. */
    scanner_finish(yyscanner);

    /* A nonzero parser result means the parse failed. */
    return (parse_status != 0) ? NIL : yyextra.parsetree;
}


/*
 * base_yylex
 *      Intermediate filter between parser and core lexer (core_yylex in
 *      scan.l).
 *
 * This filter is needed because in some cases the standard SQL grammar
 * requires more than one token lookahead.  We reduce these cases to one-token
 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
 *
 * Using a filter is simpler than trying to recognize multiword tokens
 * directly in scan.l, because we'd have to allow for comments between the
 * words.  Furthermore it's not clear how to do that without re-introducing
 * scanner backtrack, which would cost more performance than this filter
 * layer does.
 *
 * We also use this filter to convert UIDENT and USCONST sequences into
 * plain IDENT and SCONST tokens.  While that could be handled by additional
 * productions in the main grammar, it's more efficient to do it like this.
 *
 * The filter also provides a convenient place to translate between
 * the core_YYSTYPE and YYSTYPE representations (which are really the
 * same thing anyway, but notationally they're different).
 *
 * lvalp:     receives the semantic value of the returned token
 * llocp:     receives the location of the returned token (used below as an
 *            offset into the scan buffer)
 * yyscanner: scanner handle; the filter's lookahead state lives in the
 *            base_yy_extra_type fetched from it with pg_yyget_extra()
 *
 * Returns the token code.  FORMAT, NOT, NULLS_P, WITH and WITHOUT are
 * replaced by their *_LA variants when followed by one of the tokens listed
 * below; UIDENT and USCONST are always returned as IDENT and SCONST after
 * their Unicode escapes have been processed.
 */
int
base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
{
    base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
    int         cur_token;
    int         next_token;
    int         cur_token_length;
    YYLTYPE     cur_yylloc;

    /* Get next token --- we might already have it */
    if (yyextra->have_lookahead)
    {
        cur_token = yyextra->lookahead_token;
        lvalp->core_yystype = yyextra->lookahead_yylval;
        *llocp = yyextra->lookahead_yylloc;

        /*
         * Put back the character that was overwritten with '\0' at the end
         * of the previous token.  lookahead_end is NULL when there is
         * nothing to restore, as for the mode token preloaded by
         * raw_parser().
         */
        if (yyextra->lookahead_end)
            *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
        yyextra->have_lookahead = false;
    }
    else
        cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);

    /*
     * If this token isn't one that requires lookahead, just return it.  If it
     * does, determine the token length.  (We could get that via strlen(), but
     * since we have such a small set of possibilities, hardwiring seems
     * feasible and more efficient --- at least for the fixed-length cases.)
     */
    switch (cur_token)
    {
        case FORMAT:
            cur_token_length = 6;
            break;
        case NOT:
            cur_token_length = 3;
            break;
        case NULLS_P:
            cur_token_length = 5;
            break;
        case WITH:
            cur_token_length = 4;
            break;
        case UIDENT:
        case USCONST:
            /* variable length: measure the token where it sits in scanbuf */
            cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
            break;
        case WITHOUT:
            cur_token_length = 7;
            break;
        default:
            /* no lookahead needed, and no lookahead state is touched */
            return cur_token;
    }

    /*
     * Identify end+1 of current token.  core_yylex() has temporarily stored a
     * '\0' here, and will undo that when we call it again.  We need to redo
     * it to fully revert the lookahead call for error reporting purposes.
     */
    yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
        *llocp + cur_token_length;
    Assert(*(yyextra->lookahead_end) == '\0');

    /*
     * Save and restore *llocp around the call.  It might look like we could
     * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
     * does not work because flex actually holds onto the last-passed pointer
     * internally, and will use that for error reporting.  We need any error
     * reports to point to the current token, not the next one.
     */
    cur_yylloc = *llocp;

    /* Get next token, saving outputs into lookahead variables */
    next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
    yyextra->lookahead_token = next_token;
    yyextra->lookahead_yylloc = *llocp;

    /* Point the caller's location back at the current token */
    *llocp = cur_yylloc;

    /*
     * Now revert the un-truncation of the current token.  The character
     * saved here is what the have_lookahead branch at the top of this
     * function restores on the next call.
     */
    yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
    *(yyextra->lookahead_end) = '\0';

    yyextra->have_lookahead = true;

    /*
     * Replace cur_token if needed, based on lookahead.  The inner switches
     * have no default case: any other following token leaves cur_token as
     * it was.
     */
    switch (cur_token)
    {
        case FORMAT:
            /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
            switch (next_token)
            {
                case JSON:
                    cur_token = FORMAT_LA;
                    break;
            }
            break;

        case NOT:
            /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
            switch (next_token)
            {
                case BETWEEN:
                case IN_P:
                case LIKE:
                case ILIKE:
                case SIMILAR:
                    cur_token = NOT_LA;
                    break;
            }
            break;

        case NULLS_P:
            /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
            switch (next_token)
            {
                case FIRST_P:
                case LAST_P:
                    cur_token = NULLS_LA;
                    break;
            }
            break;

        case WITH:
            /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
            switch (next_token)
            {
                case TIME:
                case ORDINALITY:
                    cur_token = WITH_LA;
                    break;
            }
            break;

        case WITHOUT:
            /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
            switch (next_token)
            {
                case TIME:
                    cur_token = WITHOUT_LA;
                    break;
            }
            break;

        case UIDENT:
        case USCONST:
            /* Look ahead for UESCAPE */
            if (next_token == UESCAPE)
            {
                /* Yup, so get third token, which had better be SCONST */
                const char *escstr;

                /* Again save and restore *llocp */
                cur_yylloc = *llocp;

                /* Un-truncate current token so errors point to third token */
                *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;

                /* Get third token */
                next_token = core_yylex(&(yyextra->lookahead_yylval),
                                        llocp, yyscanner);

                /* If we throw error here, it will point to third token */
                if (next_token != SCONST)
                    scanner_yyerror("UESCAPE must be followed by a simple string literal",
                                    yyscanner);

                /* The escape must be exactly one acceptable character */
                escstr = yyextra->lookahead_yylval.str;
                if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
                    scanner_yyerror("invalid Unicode escape character",
                                    yyscanner);

                /* Now restore *llocp; errors will point to first token */
                *llocp = cur_yylloc;

                /* Apply Unicode conversion */
                lvalp->core_yystype.str =
                    str_udeescape(lvalp->core_yystype.str,
                                  escstr[0],
                                  *llocp,
                                  yyscanner);

                /*
                 * We don't need to revert the un-truncation of UESCAPE.  What
                 * we do want to do is clear have_lookahead, thereby consuming
                 * all three tokens.
                 */
                yyextra->have_lookahead = false;
            }
            else
            {
                /* No UESCAPE, so convert using default escape character */
                lvalp->core_yystype.str =
                    str_udeescape(lvalp->core_yystype.str,
                                  '\\',
                                  *llocp,
                                  yyscanner);
            }

            /* Hand the grammar a plain IDENT or SCONST from here on */
            if (cur_token == UIDENT)
            {
                /* It's an identifier, so truncate as appropriate */
                truncate_identifier(lvalp->core_yystype.str,
                                    strlen(lvalp->core_yystype.str),
                                    true);
                cur_token = IDENT;
            }
            else if (cur_token == USCONST)
            {
                cur_token = SCONST;
            }
            break;
    }

    return cur_token;
}


/*
 * hexval
 *      Map one hexadecimal digit character to its numeric value (0..15).
 *
 * Both upper- and lower-case letters are accepted.  Callers are expected to
 * have checked the character already; anything else raises an ERROR.
 */
static unsigned int
hexval(unsigned char c)
{
    unsigned int digit;

    if ('0' <= c && c <= '9')
        digit = c - '0';
    else if ('a' <= c && c <= 'f')
        digit = c - 'a' + 0xA;
    else if ('A' <= c && c <= 'F')
        digit = c - 'A' + 0xA;
    else
    {
        elog(ERROR, "invalid hexadecimal digit");
        digit = 0;              /* not reached */
    }

    return digit;
}


/*
 * check_unicode_value
 *      Raise a syntax error unless "c" passes is_valid_unicode_codepoint().
 */
static void
check_unicode_value(pg_wchar c)
{
    /* Nothing to do for an acceptable code point. */
    if (is_valid_unicode_codepoint(c))
        return;

    ereport(ERROR,
            (errcode(ERRCODE_SYNTAX_ERROR),
             errmsg("invalid Unicode escape value")));
}


/*
 * check_uescapechar
 *      Decide whether "escape" may serve as the Unicode escape character
 *      given in a UESCAPE clause.
 *
 * Returns false for hexadecimal digits, '+', single and double quotes, and
 * anything scanner_isspace() treats as whitespace; true for every other
 * character.
 */
static bool
check_uescapechar(unsigned char escape)
{
    if (isxdigit(escape))
        return false;

    switch (escape)
    {
        case '+':
        case '\'':
        case '"':
            return false;
        default:
            break;
    }

    return !scanner_isspace(escape);
}


/*

 * Process Unicode escapes in "str", producing a palloc'd plain string

 *

 * escape: the escape character to use

 * position: start position of U&'' or U&"" string token

 * yyscanner: context information needed for error reports

 */

static char *

str_udeescape(const char *str, char escape,

              int position, core_yyscan_t yyscanner)

{

    const char *in;

    char       *new,

               *out;

    size_t      new_len;

    pg_wchar    pair_first = 0;

    ScannerCallbackState scbstate;


    /*

     * Guesstimate that result will be no longer than input, but allow enough

     * padding for Unicode conversion.

     */

    new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;

    new = palloc(new_len);


    in = str;

    out = new;

    while (*in)

    {

        /* Enlarge string if needed */

        size_t      out_dist = out - new;


        if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))

        {

            new_len *= 2;

            new = repalloc(new, new_len);

            out = new + out_dist;

        }


        if (in[0] == escape)

        {

            /*

             * Any errors reported while processing this escape sequence will

             * have an error cursor pointing at the escape.

             */

            setup_scanner_errposition_callback(&scbstate, yyscanner,

                                               in - str + position + 3);    /* 3 for U&" */

            if (in[1] == escape)

            {

                if (pair_first)

                    goto invalid_pair;

                *out++ = escape;

                in += 2;

            }

            else if (isxdigit((unsigned char) in[1]) &&

                     isxdigit((unsigned char) in[2]) &&

                     isxdigit((unsigned char) in[3]) &&

                     isxdigit((unsigned char) in[4]))

            {

                pg_wchar    unicode;


                unicode = (hexval(in[1]) << 12) +

                    (hexval(in[2]) << 8) +

                    (hexval(in[3]) << 4) +

                    hexval(in[4]);

                check_unicode_value(unicode);

                if (pair_first)

                {

                    if (is_utf16_surrogate_second(unicode))

                    {

                        unicode = surrogate_pair_to_codepoint(pair_first, unicode);

                        pair_first = 0;

                    }

                    else

                        goto invalid_pair;

                }

                else if (is_utf16_surrogate_second(unicode))

                    goto invalid_pair;


                if (is_utf16_surrogate_first(unicode))

                    pair_first = unicode;

                else

                {

                    pg_unicode_to_server(unicode, (unsigned char *) out);

                    out += strlen(out);

                }

                in += 5;

            }

            else if (in[1] == '+' &&

                     isxdigit((unsigned char) in[2]) &&

                     isxdigit((unsigned char) in[3]) &&

                     isxdigit((unsigned char) in[4]) &&

                     isxdigit((unsigned char) in[5]) &&

                     isxdigit((unsigned char) in[6]) &&

                     isxdigit((unsigned char) in[7]))

            {

                pg_wchar    unicode;


                unicode = (hexval(in[2]) << 20) +

                    (hexval(in[3]) << 16) +

                    (hexval(in[4]) << 12) +

                    (hexval(in[5]) << 8) +

                    (hexval(in[6]) << 4) +

                    hexval(in[7]);

                check_unicode_value(unicode);

                if (pair_first)

                {

                    if (is_utf16_surrogate_second(unicode))

                    {

                        unicode = surrogate_pair_to_codepoint(pair_first, unicode);

                        pair_first = 0;

                    }

                    else

                        goto invalid_pair;

                }

                else if (is_utf16_surrogate_second(unicode))

                    goto invalid_pair;


                if (is_utf16_surrogate_first(unicode))

                    pair_first = unicode;

                else

                {

                    pg_unicode_to_server(unicode, (unsigned char *) out);

                    out += strlen(out);

                }

                in += 8;

            }

            else

                ereport(ERROR,

                        (errcode(ERRCODE_SYNTAX_ERROR),

                         errmsg("invalid Unicode escape"),

                         errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));


            cancel_scanner_errposition_callback(&scbstate);

        }

        else

        {

            if (pair_first)

                goto invalid_pair;


            *out++ = *in++;

        }

    }


    /* unfinished surrogate pair? */

    if (pair_first)

        goto invalid_pair;


    *out = '\0';

    return new;


    /*

     * We might get here with the error callback active, or not.  Call

     * scanner_errposition to make sure an error cursor appears; if the

     * callback is active, this is duplicative but harmless.

     */

invalid_pair:

    ereport(ERROR,

            (errcode(ERRCODE_SYNTAX_ERROR),

             errmsg("invalid Unicode surrogate pair"),

             scanner_errposition(in - str + position + 3,   /* 3 for U&" */

                                 yyscanner)));

    return NULL;                /* keep compiler quiet */

}

check_unicode_value
static void check_unicode_value(pg_wchar c)
Definition: parser.c:342

raw_parser
List * raw_parser(const char *str, RawParseMode mode)
Definition: parser.c:42

hexval
static unsigned int hexval(unsigned char c)
Definition: parser.c:328

str_udeescape
static char * str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner)
Definition: parser.c:372

base_yylex
int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
Definition: parser.c:111

check_uescapechar
static bool check_uescapechar(unsigned char escape)
Definition: parser.c:352

errhint
int errhint(const char *fmt,...)
Definition: elog.c:1318

errcode
int errcode(int sqlerrcode)
Definition: elog.c:854

errmsg
int errmsg(const char *fmt,...)
Definition: elog.c:1071

ERROR
#define ERROR
Definition: elog.h:39

elog
#define elog(elevel,...)
Definition: elog.h:225

ereport
#define ereport(elevel,...)
Definition: elog.h:149

gramparse.h

pg_yyget_extra
#define pg_yyget_extra(yyscanner)
Definition: gramparse.h:64

parser_init
void parser_init(base_yy_extra_type *yyext)

base_yyparse
int base_yyparse(core_yyscan_t yyscanner)

Assert
Assert(PointerIsAligned(start, uint64))

str
const char * str
Definition: hashfn_unstable.h:254

next_token
static bool next_token(char **lineptr, StringInfo buf, bool *initial_quote, bool *terminating_comma)
Definition: hba.c:187

ScanKeywords
PGDLLIMPORT const ScanKeywordList ScanKeywords

pg_wchar
unsigned int pg_wchar
Definition: mbprint.c:31

pg_unicode_to_server
void pg_unicode_to_server(pg_wchar c, unsigned char *s)
Definition: mbutils.c:865

repalloc
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1548

palloc
void * palloc(Size size)
Definition: mcxt.c:1321

parser.h

RawParseMode
RawParseMode
Definition: parser.h:38

RAW_PARSE_PLPGSQL_EXPR
@ RAW_PARSE_PLPGSQL_EXPR
Definition: parser.h:41

RAW_PARSE_PLPGSQL_ASSIGN2
@ RAW_PARSE_PLPGSQL_ASSIGN2
Definition: parser.h:43

RAW_PARSE_PLPGSQL_ASSIGN1
@ RAW_PARSE_PLPGSQL_ASSIGN1
Definition: parser.h:42

RAW_PARSE_TYPE_NAME
@ RAW_PARSE_TYPE_NAME
Definition: parser.h:40

RAW_PARSE_PLPGSQL_ASSIGN3
@ RAW_PARSE_PLPGSQL_ASSIGN3
Definition: parser.h:44

RAW_PARSE_DEFAULT
@ RAW_PARSE_DEFAULT
Definition: parser.h:39

mode
static PgChecksumMode mode
Definition: pg_checksums.c:55

NIL
#define NIL
Definition: pg_list.h:68

pg_wchar.h

MAX_UNICODE_EQUIVALENT_STRING
#define MAX_UNICODE_EQUIVALENT_STRING
Definition: pg_wchar.h:329

is_valid_unicode_codepoint
static bool is_valid_unicode_codepoint(pg_wchar c)
Definition: pg_wchar.h:519

surrogate_pair_to_codepoint
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
Definition: pg_wchar.h:537

is_utf16_surrogate_first
static bool is_utf16_surrogate_first(pg_wchar c)
Definition: pg_wchar.h:525

is_utf16_surrogate_second
static bool is_utf16_surrogate_second(pg_wchar c)
Definition: pg_wchar.h:531

postgres.h

c
char * c
Definition: preproc-cursor.c:31

YYLTYPE
const char * YYLTYPE
Definition: preproc_extern.h:20

YYSTYPE
int YYSTYPE
Definition: psqlscanslash.l:39

scanner_errposition
int scanner_errposition(int location, core_yyscan_t yyscanner)
Definition: scan.l:1140

scanner_init
core_yyscan_t scanner_init(const char *str, core_yy_extra_type *yyext, const ScanKeywordList *keywordlist, const uint16 *keyword_tokens)
Definition: scan.l:1249

setup_scanner_errposition_callback
void setup_scanner_errposition_callback(ScannerCallbackState *scbstate, core_yyscan_t yyscanner, int location)
Definition: scan.l:1186

scanner_finish
void scanner_finish(core_yyscan_t yyscanner)
Definition: scan.l:1291

cancel_scanner_errposition_callback
void cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
Definition: scan.l:1203

yyextra
#define yyextra
Definition: scan.l:1118

ScanKeywordTokens
const uint16 ScanKeywordTokens[]
Definition: scan.l:81

scanner_yyerror
void scanner_yyerror(const char *message, core_yyscan_t yyscanner)
Definition: scan.l:1222

core_yyscan_t
void * core_yyscan_t
Definition: scanner.h:121

core_yylex
int core_yylex(core_YYSTYPE *yylval_param, YYLTYPE *yylloc_param, core_yyscan_t yyscanner)

truncate_identifier
void truncate_identifier(char *ident, int len, bool warn)
Definition: scansup.c:93

scanner_isspace
bool scanner_isspace(char ch)
Definition: scansup.c:117

scansup.h

List
Definition: pg_list.h:54

ScannerCallbackState
Definition: scanner.h:125

base_yy_extra_type
Definition: gramparse.h:36