PostgreSQL Source Code  git master
mbprint.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * Multibyte character printing support for frontend code
4  *
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * src/fe_utils/mbprint.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres_fe.h"
14 
15 #include "fe_utils/mbprint.h"
16 
17 #include "libpq-fe.h"
18 
19 
20 /*
21  * To avoid version-skew problems, this file must not use declarations
22  * from pg_wchar.h: the encoding IDs we are dealing with are determined
23  * by the libpq.so we are linked with, and that might not match the
24  * numbers we see at compile time. (If this file were inside libpq,
25  * the problem would go away...)
26  *
27  * Hence, we have our own definition of pg_wchar, and we get the values
28  * of any needed encoding IDs on-the-fly.
29  */
30 
typedef unsigned int pg_wchar;

/*
 * Return the encoding ID that pg_char_to_encoding() reports for "utf8".
 *
 * The lookup is done on the first call only; the result is cached in a
 * function-local static and handed back on every later call.
 */
static int
pg_get_utf8_id(void)
{
    static int  utf8_id = -1;

    if (utf8_id < 0)
        utf8_id = pg_char_to_encoding("utf8");
    return utf8_id;
}

/* Looks like a constant, but is resolved at run time (see above). */
#define PG_UTF8     pg_get_utf8_id()
44 
45 
46 /*
47  * Convert a UTF-8 character to a Unicode code point.
48  * This is a one-character version of pg_utf2wchar_with_len.
49  *
50  * No error checks here, c must point to a long-enough string.
51  */
52 static pg_wchar
53 utf8_to_unicode(const unsigned char *c)
54 {
55  if ((*c & 0x80) == 0)
56  return (pg_wchar) c[0];
57  else if ((*c & 0xe0) == 0xc0)
58  return (pg_wchar) (((c[0] & 0x1f) << 6) |
59  (c[1] & 0x3f));
60  else if ((*c & 0xf0) == 0xe0)
61  return (pg_wchar) (((c[0] & 0x0f) << 12) |
62  ((c[1] & 0x3f) << 6) |
63  (c[2] & 0x3f));
64  else if ((*c & 0xf8) == 0xf0)
65  return (pg_wchar) (((c[0] & 0x07) << 18) |
66  ((c[1] & 0x3f) << 12) |
67  ((c[2] & 0x3f) << 6) |
68  (c[3] & 0x3f));
69  else
70  /* that is an invalid code on purpose */
71  return 0xffffffff;
72 }
73 
74 
/*
 * Validate the UTF-8 sequence that starts at c.
 *
 * Returns the length of the sequence in bytes (1..4) if it is well formed
 * and encodes an acceptable code point, or -1 otherwise.  Besides malformed
 * continuation bytes and overlong encodings, the following code points are
 * rejected:
 *      ucs > 0x10ffff
 *      (ucs & 0xfffe) == 0xfffe        (U+xxFFFE / U+xxFFFF, any plane)
 *      0xfdd0 <= ucs <= 0xfdef         (noncharacters)
 *      0xd800 <= ucs <= 0xdfff         (surrogates)
 *
 * c must point into a NUL-terminated string.  Trailing bytes are examined
 * in order and the tests short-circuit on the first byte that is not a
 * continuation byte, so the terminator is never read past.
 */
static int
utf_charcheck(const unsigned char *c)
{
    if ((*c & 0x80) == 0)
        return 1;
    else if ((*c & 0xe0) == 0xc0)
    {
        /* two-byte char; payload bits 0 or 1 in the lead byte are overlong */
        if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
            return 2;
        return -1;
    }
    else if ((*c & 0xf0) == 0xe0)
    {
        /* three-byte char; the middle test rejects overlong encodings */
        if (((c[1] & 0xc0) == 0x80) &&
            (((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
            ((c[2] & 0xc0) == 0x80))
        {
            /* code point is (z << 12) | yx */
            int         z = c[0] & 0x0f;
            int         yx = ((c[1] & 0x3f) << 6) | (c[2] & 0x3f);

            /* U+FFFE/U+FFFF and the U+FDD0..U+FDEF noncharacters */
            if ((z == 0x0f) &&
                (((yx & 0xffe) == 0xffe) ||
                 ((yx >= 0xdd0) && (yx <= 0xdef))))
                return -1;

            /* surrogates U+D800..U+DFFF */
            if ((z == 0x0d) && ((yx & 0x800) != 0))
                return -1;

            return 3;
        }
        return -1;
    }
    else if ((*c & 0xf8) == 0xf0)
    {
        /* u is the plane number: bits 16..20 of the code point */
        int         u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);

        /* four-byte char; u == 0 is overlong, u > 0x10 is beyond U+10FFFF */
        if (((c[1] & 0xc0) == 0x80) &&
            (u > 0x00) && (u <= 0x10) &&
            ((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
        {
            /* test for 0xzzfffe/0xzzffff */
            if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
                ((c[3] & 0x3e) == 0x3e))
                return -1;
            return 4;
        }
        return -1;
    }
    return -1;
}
133 
134 
/*
 * Strip, in place, every byte of the NUL-terminated string pwcs that does
 * not begin a sequence accepted by utf_charcheck().  Accepted sequences are
 * shifted down over the removed bytes; the string is re-terminated only if
 * something was actually removed.
 */
static void
mb_utf_validate(unsigned char *pwcs)
{
    unsigned char *src = pwcs;  /* read position */
    unsigned char *dst = pwcs;  /* write position, never ahead of src */

    while (*src != '\0')
    {
        int         nbytes = utf_charcheck(src);

        if (nbytes <= 0)
        {
            /* we skip the char */
            src++;
            continue;
        }

        if (dst == src)
        {
            /* nothing dropped so far: the bytes are already in place */
            src += nbytes;
            dst += nbytes;
        }
        else
        {
            /* close the gap left by earlier removals */
            while (nbytes-- > 0)
                *dst++ = *src++;
        }
    }

    if (dst != src)
        *dst = '\0';
}
166 
/*
 * Public functions: pg_wcswidth, pg_wcssize, pg_wcsformat and mbvalidate
 */
170 
/*
 * pg_wcswidth is the simple display-width function: it sums the display
 * widths of the characters in the first len bytes of pwcs, treating the
 * whole string as a single line.  Easier to use than pg_wcssize when that
 * assumption holds.
 *
 * Characters for which PQdsplen() reports a non-positive width add nothing.
 * Scanning stops early if a character claims more bytes than remain.
 */
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
    const char *cur = pwcs;
    size_t      remaining = len;
    int         total = 0;

    while (remaining != 0)
    {
        int         nbytes = PQmblen(cur, encoding);
        int         cols;

        if (remaining < (size_t) nbytes)
            break;              /* Invalid string */

        cols = PQdsplen(cur, encoding);
        if (cols > 0)
            total += cols;

        cur += nbytes;
        remaining -= nbytes;
    }

    return total;
}
199 
/*
 * pg_wcssize measures how the given string, in the given encoding, will
 * look once formatted.  It reports three values, each through a pointer
 * that may be NULL if the caller is not interested:
 *  result_width:       display width of the widest line
 *  result_height:      number of lines
 *  result_format_size: bytes needed to hold the formatted representation
 *
 * Scanning ends at the first NUL byte, after len bytes, or at a character
 * that claims more bytes than remain, whichever comes first.
 *
 * This MUST be kept in sync with pg_wcsformat!
 */
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
           int *result_width, int *result_height, int *result_format_size)
{
    int         widest = 0;     /* widest completed line so far */
    int         curwidth = 0;   /* width of the line being scanned */
    int         nlines = 1;
    int         nbytes_out = 0;

    while (*pwcs && len > 0)
    {
        int         chlen = PQmblen((const char *) pwcs, encoding);
        int         w;

        if (len < (size_t) chlen)
            break;
        w = PQdsplen((const char *) pwcs, encoding);

        if (chlen != 1)
        {
            if (w < 0)
            {
                /* Non-ascii control char, shown as \u0000 */
                curwidth += 6;
                nbytes_out += 6;
            }
            else
            {
                /* All other multibyte chars are copied verbatim */
                curwidth += w;
                nbytes_out += chlen;
            }
        }
        else
        {
            /* single-byte char */
            switch (*pwcs)
            {
                case '\n':      /* Newline: close the current line */
                    if (curwidth > widest)
                        widest = curwidth;
                    curwidth = 0;
                    nlines += 1;
                    nbytes_out += 1;    /* For NUL char */
                    break;

                case '\r':      /* Carriage return: two columns, two bytes */
                    curwidth += 2;
                    nbytes_out += 2;
                    break;

                case '\t':      /* Tab: pad to the next multiple of 8 */
                    do
                    {
                        curwidth++;
                        nbytes_out++;
                    } while (curwidth % 8 != 0);
                    break;

                default:
                    if (w < 0)
                    {
                        /* Other control char: four columns, four bytes */
                        curwidth += 4;
                        nbytes_out += 4;
                    }
                    else
                    {
                        /* Output it as-is */
                        curwidth += w;
                        nbytes_out += 1;
                    }
                    break;
            }
        }

        pwcs += chlen;
        len -= chlen;
    }

    /* Account for the last (unterminated) line. */
    if (curwidth > widest)
        widest = curwidth;
    nbytes_out += 1;            /* For NUL char */

    /* Set results */
    if (result_width)
        *result_width = widest;
    if (result_height)
        *result_height = nlines;
    if (result_format_size)
        *result_format_size = nbytes_out;
}
286 
287 /*
288  * Format a string into one or more "struct lineptr" lines.
289  * lines[i].ptr == NULL indicates the end of the array.
290  *
291  * This MUST be kept in sync with pg_wcssize!
292  */
293 void
294 pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
295  struct lineptr *lines, int count)
296 {
297  int w,
298  chlen = 0;
299  int linewidth = 0;
300  unsigned char *ptr = lines->ptr; /* Pointer to data area */
301 
302  for (; *pwcs && len > 0; pwcs += chlen)
303  {
304  chlen = PQmblen((const char *) pwcs, encoding);
305  if (len < (size_t) chlen)
306  break;
307  w = PQdsplen((const char *) pwcs, encoding);
308 
309  if (chlen == 1) /* single-byte char */
310  {
311  if (*pwcs == '\n') /* Newline */
312  {
313  *ptr++ = '\0';
314  lines->width = linewidth;
315  linewidth = 0;
316  lines++;
317  count--;
318  if (count <= 0)
319  exit(1); /* Screwup */
320 
321  /* make next line point to remaining memory */
322  lines->ptr = ptr;
323  }
324  else if (*pwcs == '\r') /* Linefeed */
325  {
326  strcpy((char *) ptr, "\\r");
327  linewidth += 2;
328  ptr += 2;
329  }
330  else if (*pwcs == '\t') /* Tab */
331  {
332  do
333  {
334  *ptr++ = ' ';
335  linewidth++;
336  } while (linewidth % 8 != 0);
337  }
338  else if (w < 0) /* Other control char */
339  {
340  sprintf((char *) ptr, "\\x%02X", *pwcs);
341  linewidth += 4;
342  ptr += 4;
343  }
344  else /* Output it as-is */
345  {
346  linewidth += w;
347  *ptr++ = *pwcs;
348  }
349  }
350  else if (w < 0) /* Non-ascii control char */
351  {
352  if (encoding == PG_UTF8)
353  sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
354  else
355  {
356  /*
357  * This case cannot happen in the current code because only
358  * UTF-8 signals multibyte control characters. But we may need
359  * to support it at some stage
360  */
361  sprintf((char *) ptr, "\\u????");
362  }
363  ptr += 6;
364  linewidth += 6;
365  }
366  else /* All other chars */
367  {
368  int i;
369 
370  for (i = 0; i < chlen; i++)
371  *ptr++ = pwcs[i];
372  linewidth += w;
373  }
374  len -= chlen;
375  }
376  lines->width = linewidth;
377  *ptr++ = '\0'; /* Terminate formatted string */
378 
379  if (count <= 0)
380  exit(1); /* Screwup */
381 
382  (lines + 1)->ptr = NULL; /* terminate line array */
383 }
384 
385 
386 /*
387  * Encoding validation: delete any unvalidatable characters from the string
388  *
389  * This seems redundant with existing functionality elsewhere?
390  */
391 unsigned char *
392 mbvalidate(unsigned char *pwcs, int encoding)
393 {
394  if (encoding == PG_UTF8)
395  mb_utf_validate(pwcs);
396  else
397  {
398  /*
399  * other encodings needing validation should add their own routines
400  * here
401  */
402  }
403 
404  return pwcs;
405 }
int PQmblen(const char *s, int encoding)
Definition: fe-misc.c:1224
int PQdsplen(const char *s, int encoding)
Definition: fe-misc.c:1244
int i
Definition: isn.c:73
exit(1)
#define PG_UTF8
Definition: mbprint.c:43
static void mb_utf_validate(unsigned char *pwcs)
Definition: mbprint.c:136
void pg_wcssize(const unsigned char *pwcs, size_t len, int encoding, int *result_width, int *result_height, int *result_format_size)
Definition: mbprint.c:211
int pg_wcswidth(const char *pwcs, size_t len, int encoding)
Definition: mbprint.c:177
void pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, struct lineptr *lines, int count)
Definition: mbprint.c:294
static int utf_charcheck(const unsigned char *c)
Definition: mbprint.c:82
static int pg_get_utf8_id(void)
Definition: mbprint.c:34
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned char * mbvalidate(unsigned char *pwcs, int encoding)
Definition: mbprint.c:392
unsigned int pg_wchar
Definition: mbprint.c:31
const void size_t len
int32 encoding
Definition: pg_database.h:41
#define pg_char_to_encoding
Definition: pg_wchar.h:629
#define sprintf
Definition: port.h:240
char * c
int width
Definition: mbprint.h:19
unsigned char * ptr
Definition: mbprint.h:18