PostgreSQL Source Code  git master
unicode_case.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  * unicode_case.c
3  * Unicode case mapping and case conversion.
4  *
5  * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
6  *
7  * IDENTIFICATION
8  * src/common/unicode_case.c
9  *
10  *-------------------------------------------------------------------------
11  */
12 #ifndef FRONTEND
13 #include "postgres.h"
14 #else
15 #include "postgres_fe.h"
16 #endif
17 
18 #include "common/unicode_case.h"
20 #include "mb/pg_wchar.h"
21 
22 static const pg_case_map *find_case_map(pg_wchar ucs);
23 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
24  CaseKind str_casekind, WordBoundaryNext wbnext,
25  void *wbstate);
26 
29 {
30  const pg_case_map *map = find_case_map(code);
31 
32  return map ? map->simplemap[CaseLower] : code;
33 }
34 
37 {
38  const pg_case_map *map = find_case_map(code);
39 
40  return map ? map->simplemap[CaseTitle] : code;
41 }
42 
45 {
46  const pg_case_map *map = find_case_map(code);
47 
48  return map ? map->simplemap[CaseUpper] : code;
49 }
50 
51 /*
52  * unicode_strlower()
53  *
54  * Convert src to lowercase, and return the result length (not including
55  * terminating NUL).
56  *
57  * String src must be encoded in UTF-8. If srclen < 0, src must be
58  * NUL-terminated.
59  *
60  * Result string is stored in dst, truncating if larger than dstsize. If
61  * dstsize is greater than the result length, dst will be NUL-terminated;
62  * otherwise not.
63  *
64  * If dstsize is zero, dst may be NULL. This is useful for calculating the
65  * required buffer size before allocating.
66  */
67 size_t
68 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
69 {
70  return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
71 }
72 
73 /*
74  * unicode_strtitle()
75  *
76  * Convert src to titlecase, and return the result length (not including
77  * terminating NUL).
78  *
79  * String src must be encoded in UTF-8. If srclen < 0, src must be
80  * NUL-terminated.
81  *
82  * Result string is stored in dst, truncating if larger than dstsize. If
83  * dstsize is greater than the result length, dst will be NUL-terminated;
84  * otherwise not.
85  *
86  * If dstsize is zero, dst may be NULL. This is useful for calculating the
87  * required buffer size before allocating.
88  *
89  * Titlecasing requires knowledge about word boundaries, which is provided by
90  * the callback wbnext. A word boundary is the offset of the start of a word
91  * or the offset of the character immediately following a word.
92  *
93  * The caller is expected to initialize and free the callback state
94  * wbstate. The callback should first return offset 0 for the first boundary;
95  * then the offset of each subsequent word boundary; then the total length of
96  * the string to indicate the final boundary.
97  */
98 size_t
99 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
100  WordBoundaryNext wbnext, void *wbstate)
101 {
102  return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
103  wbstate);
104 }
105 
106 /*
107  * unicode_strupper()
108  *
109  * Convert src to uppercase, and return the result length (not including
110  * terminating NUL).
111  *
112  * String src must be encoded in UTF-8. If srclen < 0, src must be
113  * NUL-terminated.
114  *
115  * Result string is stored in dst, truncating if larger than dstsize. If
116  * dstsize is greater than the result length, dst will be NUL-terminated;
117  * otherwise not.
118  *
119  * If dstsize is zero, dst may be NULL. This is useful for calculating the
120  * required buffer size before allocating.
121  */
122 size_t
123 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
124 {
125  return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
126 }
127 
128 /*
129  * If str_casekind is CaseLower or CaseUpper, map each character in the string
130  * for which a mapping is available.
131  *
132  * If str_casekind is CaseTitle, maps characters found on a word boundary to
133  * uppercase and other characters to lowercase.
134  */
135 static size_t
136 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
137  CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
138 {
139  /* character CaseKind varies while titlecasing */
140  CaseKind chr_casekind = str_casekind;
141  size_t srcoff = 0;
142  size_t result_len = 0;
143  size_t boundary = 0;
144 
145  Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
146  (str_casekind != CaseTitle && !wbnext && !wbstate));
147 
148  if (str_casekind == CaseTitle)
149  {
150  boundary = wbnext(wbstate);
151  Assert(boundary == 0); /* start of text is always a boundary */
152  }
153 
154  while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
155  {
156  pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
157  int u1len = unicode_utf8len(u1);
158  const pg_case_map *casemap = find_case_map(u1);
159 
160  if (str_casekind == CaseTitle)
161  {
162  if (srcoff == boundary)
163  {
164  chr_casekind = CaseUpper;
165  boundary = wbnext(wbstate);
166  }
167  else
168  chr_casekind = CaseLower;
169  }
170 
171  /* perform mapping, update result_len, and write to dst */
172  if (casemap)
173  {
174  pg_wchar u2 = casemap->simplemap[chr_casekind];
175  pg_wchar u2len = unicode_utf8len(u2);
176 
177  if (result_len + u2len <= dstsize)
178  unicode_to_utf8(u2, (unsigned char *) dst + result_len);
179 
180  result_len += u2len;
181  }
182  else
183  {
184  /* no mapping; copy bytes from src */
185  if (result_len + u1len <= dstsize)
186  memcpy(dst + result_len, src + srcoff, u1len);
187 
188  result_len += u1len;
189  }
190 
191  srcoff += u1len;
192  }
193 
194  if (result_len < dstsize)
195  dst[result_len] = '\0';
196 
197  return result_len;
198 }
199 
200 /* find entry in simple case map, if any */
201 static const pg_case_map *
203 {
204  int min;
205  int mid;
206  int max;
207 
208  /* all chars <= 0x80 are stored in array for fast lookup */
209  Assert(lengthof(case_map) >= 0x80);
210  if (ucs < 0x80)
211  {
212  const pg_case_map *map = &case_map[ucs];
213 
214  Assert(map->codepoint == ucs);
215  return map;
216  }
217 
218  /* otherwise, binary search */
219  min = 0x80;
220  max = lengthof(case_map) - 1;
221  while (max >= min)
222  {
223  mid = (min + max) / 2;
224  if (ucs > case_map[mid].codepoint)
225  min = mid + 1;
226  else if (ucs < case_map[mid].codepoint)
227  max = mid - 1;
228  else
229  return &case_map[mid];
230  }
231 
232  return NULL;
233 }
#define Assert(condition)
Definition: c.h:812
#define lengthof(array)
Definition: c.h:742
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned int pg_wchar
Definition: mbprint.c:31
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575
static int unicode_utf8len(pg_wchar c)
Definition: pg_wchar.h:607
pg_wchar simplemap[NCaseKind]
pg_wchar codepoint
pg_wchar unicode_uppercase_simple(pg_wchar code)
Definition: unicode_case.c:44
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: unicode_case.c:68
pg_wchar unicode_titlecase_simple(pg_wchar code)
Definition: unicode_case.c:36
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: unicode_case.c:123
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:136
static const pg_case_map * find_case_map(pg_wchar ucs)
Definition: unicode_case.c:202
pg_wchar unicode_lowercase_simple(pg_wchar code)
Definition: unicode_case.c:28
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:99
size_t(* WordBoundaryNext)(void *wbstate)
Definition: unicode_case.h:19
static const pg_case_map case_map[2955]
@ CaseTitle
@ CaseLower
@ CaseUpper