scansup_8c_source.html

/*-------------------------------------------------------------------------

 *

 * scansup.c

 *    scanner support routines used by the core lexer

 *

 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group

 * Portions Copyright (c) 1994, Regents of the University of California

 *

 *

 * IDENTIFICATION

 *    src/backend/parser/scansup.c

 *

 *-------------------------------------------------------------------------

 */

#include "postgres.h"


#include <ctype.h>


#include "mb/pg_wchar.h"

#include "parser/scansup.h"


/*
 * downcase_truncate_identifier() --- fold an unquoted identifier to lower
 * case and clip it to the maximum identifier length.
 *
 * ident/len describe the input bytes.  The input is not required to be
 * null-terminated, because some callers hand us a slice of a larger buffer.
 * When warn is true, the user is told if the identifier had to be truncated.
 *
 * The result is a freshly palloc'd string holding the adjusted identifier.
 *
 * Note: this API was shaped so that a downcasing rule which makes the string
 * longer could be accommodated, but no such rule is supported yet.  Whoever
 * adds one will also have to fix SplitIdentifierString() in
 * utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
    char       *folded;

    /* Delegate to the general routine with truncation always enabled. */
    folded = downcase_identifier(ident, len, warn, true);

    return folded;
}


/*
 * downcase_identifier() --- the workhorse behind
 * downcase_truncate_identifier().
 *
 * Copies len bytes of ident into a palloc'd buffer, folding upper-case
 * characters to lower case, and null-terminates the copy.  If truncate is
 * true and the copy is NAMEDATALEN bytes or longer, it is then handed to
 * truncate_identifier() along with the warn flag.
 */
char *
downcase_identifier(const char *ident, int len, bool warn, bool truncate)
{
    char       *folded = palloc(len + 1);
    bool        single_byte_enc = (pg_database_encoding_max_length() == 1);
    int         pos;

    /*
     * SQL99 calls for Unicode-aware case normalization, for which we lack
     * the infrastructure.  tolower() gives a locale-aware translation
     * instead, but that is wrong in some locales too (Turkish, for example,
     * may do strange things with 'i' and 'I').  The compromise used here:
     * 7-bit characters get a plain ASCII-only fold, while tolower() is
     * applied only to high-bit-set characters, and only when they cannot be
     * part of a multi-byte character.
     */
    for (pos = 0; pos < len; pos++)
    {
        unsigned char c = (unsigned char) ident[pos];
        bool        ascii_upper = (c >= 'A' && c <= 'Z');

        if (ascii_upper)
            c = (unsigned char) (c - 'A' + 'a');
        else if (single_byte_enc && IS_HIGHBIT_SET(c) && isupper(c))
            c = (unsigned char) tolower(c);

        folded[pos] = (char) c;
    }

    /* pos now equals the number of bytes copied; terminate right after. */
    folded[pos] = '\0';

    if (truncate && pos >= NAMEDATALEN)
        truncate_identifier(folded, pos, warn);

    return folded;
}


/*
 * truncate_identifier() --- shorten an identifier to at most NAMEDATALEN-1
 * bytes.
 *
 * The string is edited in place, and only when it is too long.  If warn is
 * true, a NOTICE showing both the original and the shortened name is
 * reported.
 *
 * The caller supplies the string length; in several common call paths it is
 * already known, so this avoids a redundant strlen().
 */
void
truncate_identifier(char *ident, int len, bool warn)
{
    int         clipped_len;

    /* Already short enough: leave the string untouched. */
    if (len < NAMEDATALEN)
        return;

    /* Let pg_mbcliplen() pick the new length, limited to NAMEDATALEN - 1. */
    clipped_len = pg_mbcliplen(ident, len, NAMEDATALEN - 1);

    /*
     * Report before writing the terminator, so that the message can still
     * print the full original identifier.
     */
    if (warn)
    {
        ereport(NOTICE,
                (errcode(ERRCODE_NAME_TOO_LONG),
                 errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
                        ident, clipped_len, ident)));
    }

    ident[clipped_len] = '\0';
}


/*
 * scanner_isspace() --- does the flex scanner treat this char as whitespace?
 *
 * Call this rather than isspace(), whose answer may depend on the locale,
 * whenever the result has to agree with what the lexer does.
 *
 * Analogous helpers for isalnum and friends could be needed in principle,
 * but so far only the isspace case has come up.
 */
bool
scanner_isspace(char ch)
{
    /* Keep this list in sync with the {space} characters in scan.l */
    switch (ch)
    {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\v':
        case '\f':
            return true;
        default:
            return false;
    }
}

IS_HIGHBIT_SET
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1126

errcode
int errcode(int sqlerrcode)
Definition: elog.c:854

errmsg
int errmsg(const char *fmt,...)
Definition: elog.c:1071

NOTICE
#define NOTICE
Definition: elog.h:35

ereport
#define ereport(elevel,...)
Definition: elog.h:149

ident
#define ident
Definition: indent_codes.h:47

i
int i
Definition: isn.c:77

pg_mbcliplen
int pg_mbcliplen(const char *mbstr, int len, int limit)
Definition: mbutils.c:1084

pg_database_encoding_max_length
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1547

palloc
void * palloc(Size size)
Definition: mcxt.c:1321

NAMEDATALEN
#define NAMEDATALEN
Definition: pg_config_manual.h:29

len
const void size_t len
Definition: pg_crc32c_sse42.c:28

pg_wchar.h

postgres.h

downcase_identifier
char * downcase_identifier(const char *ident, int len, bool warn, bool truncate)
Definition: scansup.c:46

truncate_identifier
void truncate_identifier(char *ident, int len, bool warn)
Definition: scansup.c:93

downcase_truncate_identifier
char * downcase_truncate_identifier(const char *ident, int len, bool warn)
Definition: scansup.c:37

scanner_isspace
bool scanner_isspace(char ch)
Definition: scansup.c:117

scansup.h

warn
warn
Definition: strftime.c:110