mbprint_8c_source.html

/*-------------------------------------------------------------------------

 *

 * Multibyte character printing support for frontend code

 *

 *

 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group

 * Portions Copyright (c) 1994, Regents of the University of California

 *

 * src/fe_utils/mbprint.c

 *

 *-------------------------------------------------------------------------

 */

#include "postgres_fe.h"


#include "fe_utils/mbprint.h"


#include "libpq-fe.h"


/*

 * To avoid version-skew problems, this file must not use declarations

 * from pg_wchar.h: the encoding IDs we are dealing with are determined

 * by the libpq.so we are linked with, and that might not match the

 * numbers we see at compile time.  (If this file were inside libpq,

 * the problem would go away...)

 *

 * Hence, we have our own definition of pg_wchar, and we get the values

 * of any needed encoding IDs on-the-fly.

 */


/* File-local definition of pg_wchar; utf8_to_unicode() returns decoded code points in it. */
typedef unsigned int pg_wchar;


/*
 * Return the encoding ID that pg_char_to_encoding() reports for "utf8".
 *
 * The result is remembered in a function-local static.  The lookup is only
 * repeated on a later call while the remembered value is still negative.
 */
static int
pg_get_utf8_id(void)
{
    static int  cached_id = -1;

    if (cached_id >= 0)
        return cached_id;

    cached_id = pg_char_to_encoding("utf8");
    return cached_id;
}


/* Expands to a call of pg_get_utf8_id(), so this is a run-time value, not a compile-time constant. */
#define PG_UTF8     pg_get_utf8_id()


/*

 * Convert a UTF-8 character to a Unicode code point.

 * This is a one-character version of pg_utf2wchar_with_len.

 *

 * No error checks here, c must point to a long-enough string.

 */

static pg_wchar

utf8_to_unicode(const unsigned char *c)

{

    if ((*c & 0x80) == 0)

        return (pg_wchar) c[0];

    else if ((*c & 0xe0) == 0xc0)

        return (pg_wchar) (((c[0] & 0x1f) << 6) |

                           (c[1] & 0x3f));

    else if ((*c & 0xf0) == 0xe0)

        return (pg_wchar) (((c[0] & 0x0f) << 12) |

                           ((c[1] & 0x3f) << 6) |

                           (c[2] & 0x3f));

    else if ((*c & 0xf8) == 0xf0)

        return (pg_wchar) (((c[0] & 0x07) << 18) |

                           ((c[1] & 0x3f) << 12) |

                           ((c[2] & 0x3f) << 6) |

                           (c[3] & 0x3f));

    else

        /* that is an invalid code on purpose */

        return 0xffffffff;

}


/*
 * Validate the single UTF-8 encoded character starting at c.
 *
 * Returns the character's length in bytes (1 to 4) when it is acceptable,
 * or -1 when it is not.  The following are rejected:
 *   - a first byte that cannot start a character (10xxxxxx or 11111xxx)
 *   - a trailing byte that is not of the form 10xxxxxx
 *   - overlong two-, three- and four-byte encodings
 *   - code points above 0x10ffff
 *   - surrogates, 0xd800 <= ucs <= 0xdfff
 *   - noncharacters, 0xfdd0 <= ucs <= 0xfdef and (ucs & 0xfffe) == 0xfffe
 *
 * Trailing bytes are examined left to right and evaluation stops at the
 * first one that is not a continuation byte, so when c points into a
 * NUL-terminated string nothing beyond the terminator is read.
 */
static int
utf_charcheck(const unsigned char *c)
{
    if ((*c & 0x80) == 0)
        return 1;
    else if ((*c & 0xe0) == 0xc0)
    {
        /* two-byte char; lead bytes 0xc0 and 0xc1 would be overlong */
        if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
            return 2;
        return -1;
    }
    else if ((*c & 0xf0) == 0xe0)
    {
        /*
         * three-byte char; the middle condition rejects overlong forms
         * (values below 0x800)
         */
        if (((c[1] & 0xc0) == 0x80) &&
            (((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
            ((c[2] & 0xc0) == 0x80))
        {
            /*
             * Decode the full code point from all three bytes and test the
             * forbidden ranges directly, rather than with partial bit
             * masks.
             */
            unsigned int ucs = ((unsigned int) (c[0] & 0x0f) << 12) |
                ((unsigned int) (c[1] & 0x3f) << 6) |
                (unsigned int) (c[2] & 0x3f);

            /* 0xfffe and 0xffff */
            if ((ucs & 0xfffe) == 0xfffe)
                return -1;
            /* noncharacter block 0xfdd0..0xfdef */
            if (ucs >= 0xfdd0 && ucs <= 0xfdef)
                return -1;
            /* surrogates 0xd800..0xdfff */
            if (ucs >= 0xd800 && ucs <= 0xdfff)
                return -1;
            return 3;
        }
        return -1;
    }
    else if ((*c & 0xf8) == 0xf0)
    {
        /* u holds bits 16..20 of the code point (its plane number) */
        int         u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);

        /*
         * four-byte char; u == 0 would be an overlong form and u > 0x10
         * would lie above 0x10ffff
         */
        if (((c[1] & 0xc0) == 0x80) &&
            (u > 0x00) && (u <= 0x10) &&
            ((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
        {
            /* test for 0xzzfffe/0xzzffff: low 16 bits are fffe or ffff */
            if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
                ((c[3] & 0x3e) == 0x3e))
                return -1;
            return 4;
        }
        return -1;
    }

    /* continuation byte or 11111xxx in lead position */
    return -1;
}


/*
 * Remove, in place, every byte of the NUL-terminated string pwcs that
 * utf_charcheck() does not accept as part of a character.
 *
 * The string is scanned once with a read cursor (pwcs) and a write cursor
 * (out).  Accepted characters are moved down to the write cursor; a
 * rejected position is skipped one byte at a time.  The string is
 * re-terminated only if something was actually dropped.
 */
static void
mb_utf_validate(unsigned char *pwcs)
{
    unsigned char *out = pwcs;  /* where the next kept byte goes */

    while (*pwcs != '\0')
    {
        int         nbytes = utf_charcheck(pwcs);

        if (nbytes <= 0)
        {
            /* unacceptable here: drop one byte and try again */
            pwcs++;
            continue;
        }

        if (out == pwcs)
        {
            /* nothing dropped so far, the character is already in place */
            out += nbytes;
            pwcs += nbytes;
            continue;
        }

        /* close the gap left by earlier dropped bytes */
        while (nbytes-- > 0)
            *out++ = *pwcs++;
    }

    if (out != pwcs)
        *out = '\0';
}


/*
 * Public functions: pg_wcswidth, pg_wcssize, pg_wcsformat and mbvalidate
 */


/*
 * pg_wcswidth is the simple display-width function: it adds up the display
 * widths of the characters in the first len bytes of pwcs, assuming that
 * everything appears on one line.  It is easier to use than pg_wcssize
 * when that assumption holds.
 *
 * Character lengths come from PQmblen() and widths from PQdsplen().
 * Characters whose reported width is zero or negative add nothing.
 * Scanning stops early if a character claims more bytes than remain.
 */
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
    int         total = 0;

    while (len > 0)
    {
        int         nbytes = PQmblen(pwcs, encoding);
        int         cols;

        if ((size_t) nbytes > len)
            break;              /* Invalid string */

        cols = PQdsplen(pwcs, encoding);
        total += (cols > 0) ? cols : 0;

        pwcs += nbytes;
        len -= nbytes;
    }

    return total;
}


/*
 * pg_wcssize measures the given string in the given encoding and reports
 * up to three values (any of the result pointers may be NULL):
 *    result_width: Width in display characters of the longest line in string
 *    result_height: Number of lines in display output
 *    result_format_size: Number of bytes required to store formatted
 *      representation of string
 *
 * Scanning stops at a NUL byte, after len bytes, or when a character
 * claims more bytes than remain.
 *
 * This MUST be kept in sync with pg_wcsformat!
 */
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
           int *result_width, int *result_height, int *result_format_size)
{
    int         widest = 0;     /* widest finished line so far */
    int         nlines = 1;
    int         nbytes = 0;     /* bytes of formatted output so far */
    int         cur = 0;        /* width of the line being scanned */
    int         chlen = 0;

    while (*pwcs && len > 0)
    {
        int         dsp;

        chlen = PQmblen((const char *) pwcs, encoding);
        if (len < (size_t) chlen)
            break;
        dsp = PQdsplen((const char *) pwcs, encoding);

        if (chlen != 1)
        {
            if (dsp < 0)
            {
                /* Non-ascii control char, shown as \u0000 */
                cur += 6;
                nbytes += 6;
            }
            else
            {
                /* All other multibyte chars are copied unchanged */
                cur += dsp;
                nbytes += chlen;
            }
        }
        else
        {
            switch (*pwcs)
            {
                case '\n':      /* Newline ends the current line */
                    if (cur > widest)
                        widest = cur;
                    cur = 0;
                    nlines += 1;
                    nbytes += 1;    /* For NUL char */
                    break;

                case '\r':      /* Carriage return takes two columns */
                    cur += 2;
                    nbytes += 2;
                    break;

                case '\t':      /* Tab pads to the next multiple of 8 */
                    do
                    {
                        cur++;
                        nbytes++;
                    } while (cur % 8 != 0);
                    break;

                default:
                    if (dsp < 0)
                    {
                        /* Other control char takes four columns */
                        cur += 4;
                        nbytes += 4;
                    }
                    else
                    {
                        /* Output it as-is */
                        cur += dsp;
                        nbytes += 1;
                    }
                    break;
            }
        }

        len -= chlen;
        pwcs += chlen;
    }

    /* the last line has no newline to close it */
    if (cur > widest)
        widest = cur;
    nbytes += 1;                /* For NUL char */

    /* Set results */
    if (result_width)
        *result_width = widest;
    if (result_height)
        *result_height = nlines;
    if (result_format_size)
        *result_format_size = nbytes;
}


/*
 *  Format a string into one or more "struct lineptr" lines.
 *  lines[i].ptr == NULL indicates the end of the array.
 *
 *  On entry lines[0].ptr must point at the output data area; each further
 *  line's ptr is set to the remaining part of that same area.  Every input
 *  newline ends the current line (NUL-terminated, width recorded) and
 *  starts the next one.  count is decremented per newline and the process
 *  exits with status 1 if it reaches zero.  The array needs one slot beyond
 *  the last line used, because that slot receives the NULL end marker.
 *
 *  Output per character: '\r' becomes the two characters \r, a tab becomes
 *  spaces up to the next multiple of 8 columns, other single-byte control
 *  characters become \xNN, multibyte control characters become \uNNNN, and
 *  everything else is copied unchanged.
 *
 *  NOTE(review): the data area is presumably sized from pg_wcssize's
 *  result_format_size -- confirm against callers.
 *
 * This MUST be kept in sync with pg_wcssize!
 */
void
pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
             struct lineptr *lines, int count)
{
    /* escape lengths in bytes, matching what pg_wcssize accounts for */
    enum
    {
        HEX_ESCAPE_LEN = 4,     /* \xNN */
        UNICODE_ESCAPE_LEN = 6  /* \uNNNN */
    };

    int         w,
                chlen = 0;
    int         linewidth = 0;
    unsigned char *ptr = lines->ptr;    /* Pointer to data area */

    for (; *pwcs && len > 0; pwcs += chlen)
    {
        chlen = PQmblen((const char *) pwcs, encoding);
        if (len < (size_t) chlen)
            break;
        w = PQdsplen((const char *) pwcs, encoding);

        if (chlen == 1)         /* single-byte char */
        {
            if (*pwcs == '\n')  /* Newline */
            {
                *ptr++ = '\0';
                lines->width = linewidth;
                linewidth = 0;
                lines++;
                count--;
                if (count <= 0)
                    exit(1);    /* Screwup */

                /* make next line point to remaining memory */
                lines->ptr = ptr;
            }
            else if (*pwcs == '\r') /* Carriage return */
            {
                strcpy((char *) ptr, "\\r");
                linewidth += 2;
                ptr += 2;
            }
            else if (*pwcs == '\t') /* Tab */
            {
                do
                {
                    *ptr++ = ' ';
                    linewidth++;
                } while (linewidth % 8 != 0);
            }
            else if (w < 0)     /* Other control char */
            {
                snprintf((char *) ptr, HEX_ESCAPE_LEN + 1,
                         "\\x%02X", (unsigned int) *pwcs);
                linewidth += HEX_ESCAPE_LEN;
                ptr += HEX_ESCAPE_LEN;
            }
            else                /* Output it as-is */
            {
                linewidth += w;
                *ptr++ = *pwcs;
            }
        }
        else if (w < 0)         /* Non-ascii control char */
        {
            if (encoding == PG_UTF8)
            {
                /*
                 * %04X is only a minimum width: a code point above 0xFFFF
                 * would print more than four hex digits, while ptr advances
                 * by six bytes and pg_wcssize budgets six.  Bound the write
                 * so the excess cannot run past the data area.  The extra
                 * digits were overwritten or cut off by later output
                 * anyway, so the visible result is unchanged.
                 */
                snprintf((char *) ptr, UNICODE_ESCAPE_LEN + 1,
                         "\\u%04X", utf8_to_unicode(pwcs));
            }
            else
            {
                /*
                 * This case cannot happen in the current code because only
                 * UTF-8 signals multibyte control characters. But we may need
                 * to support it at some stage
                 */
                snprintf((char *) ptr, UNICODE_ESCAPE_LEN + 1, "\\u????");
            }
            ptr += UNICODE_ESCAPE_LEN;
            linewidth += UNICODE_ESCAPE_LEN;
        }
        else                    /* All other chars */
        {
            int         i;

            for (i = 0; i < chlen; i++)
                *ptr++ = pwcs[i];
            linewidth += w;
        }
        len -= chlen;
    }
    lines->width = linewidth;
    *ptr++ = '\0';              /* Terminate formatted string */

    if (count <= 0)
        exit(1);                /* Screwup */

    (lines + 1)->ptr = NULL;    /* terminate line array */
}


/*
 * Encoding validation: delete any unvalidatable characters from the string
 *
 * The string is modified in place and the same pointer is returned.  Only
 * UTF-8 is checked (via mb_utf_validate); a string in any other encoding is
 * returned untouched.
 *
 * This seems redundant with existing functionality elsewhere?
 */
unsigned char *
mbvalidate(unsigned char *pwcs, int encoding)
{
    /*
     * other encodings needing validation should add their own routines
     * here; for now anything that is not UTF-8 passes through unchanged
     */
    if (encoding != PG_UTF8)
        return pwcs;

    mb_utf_validate(pwcs);
    return pwcs;
}

PQmblen
int PQmblen(const char *s, int encoding)
Definition: fe-misc.c:1229

PQdsplen
int PQdsplen(const char *s, int encoding)
Definition: fe-misc.c:1250

i
int i
Definition: isn.c:77

libpq-fe.h

PG_UTF8
#define PG_UTF8
Definition: mbprint.c:43

mb_utf_validate
static void mb_utf_validate(unsigned char *pwcs)
Definition: mbprint.c:136

pg_wcssize
void pg_wcssize(const unsigned char *pwcs, size_t len, int encoding, int *result_width, int *result_height, int *result_format_size)
Definition: mbprint.c:211

pg_wcswidth
int pg_wcswidth(const char *pwcs, size_t len, int encoding)
Definition: mbprint.c:177

pg_wcsformat
void pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, struct lineptr *lines, int count)
Definition: mbprint.c:294

utf_charcheck
static int utf_charcheck(const unsigned char *c)
Definition: mbprint.c:82

pg_get_utf8_id
static int pg_get_utf8_id(void)
Definition: mbprint.c:34

utf8_to_unicode
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53

pg_wchar
unsigned int pg_wchar
Definition: mbprint.c:31

mbvalidate
unsigned char * mbvalidate(unsigned char *pwcs, int encoding)
Definition: mbprint.c:392

mbprint.h

len
const void size_t len
Definition: pg_crc32c_sse42.c:28

encoding
int32 encoding
Definition: pg_database.h:41

pg_char_to_encoding
#define pg_char_to_encoding
Definition: pg_wchar.h:629

sprintf
#define sprintf
Definition: port.h:241

postgres_fe.h

c
char * c
Definition: preproc-cursor.c:31

lineptr
Definition: mbprint.h:17

lineptr::width
int width
Definition: mbprint.h:19

lineptr::ptr
unsigned char * ptr
Definition: mbprint.h:18