PostgreSQL Source Code git master
Loading...
Searching...
No Matches
unicode_case.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 * unicode_case.c
3 * Unicode case mapping and case conversion.
4 *
5 * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group
6 *
7 * IDENTIFICATION
8 * src/common/unicode_case.c
9 *
10 *-------------------------------------------------------------------------
11 */
12#ifndef FRONTEND
13#include "postgres.h"
14#else
15#include "postgres_fe.h"
16#endif
17
18#include "common/unicode_case.h"
21#include "mb/pg_wchar.h"
22
29
30/*
31 * Map for each case kind.
32 */
33static const char32_t *const casekind_map[NCaseKind] =
34{
39};
40
41static char32_t find_case_map(char32_t ucs, const char32_t *map);
42static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
44 void *wbstate);
45static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
46 const char *src, size_t srclen, size_t srcoff,
47 char32_t *simple, const char32_t **special);
48
49char32_t
51{
52 char32_t cp = find_case_map(code, case_map_lower);
53
54 return cp != 0 ? cp : code;
55}
56
57char32_t
59{
60 char32_t cp = find_case_map(code, case_map_title);
61
62 return cp != 0 ? cp : code;
63}
64
65char32_t
67{
68 char32_t cp = find_case_map(code, case_map_upper);
69
70 return cp != 0 ? cp : code;
71}
72
73char32_t
75{
76 char32_t cp = find_case_map(code, case_map_fold);
77
78 return cp != 0 ? cp : code;
79}
80
81/*
82 * unicode_strlower()
83 *
84 * Convert src to lowercase, and return the result length (not including
85 * terminating NUL).
86 *
87 * String src must be encoded in UTF-8.
88 *
89 * Result string is stored in dst, truncating if larger than dstsize. If
90 * dstsize is greater than the result length, dst will be NUL-terminated;
91 * otherwise not.
92 *
93 * If dstsize is zero, dst may be NULL. This is useful for calculating the
94 * required buffer size before allocating.
95 *
96 * If full is true, use special case mappings if available and if the
97 * conditions are satisfied.
98 */
99size_t
100unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
101 bool full)
102{
103 return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
104 NULL);
105}
106
107/*
108 * unicode_strtitle()
109 *
110 * Convert src to titlecase, and return the result length (not including
111 * terminating NUL).
112 *
113 * String src must be encoded in UTF-8.
114 *
115 * Result string is stored in dst, truncating if larger than dstsize. If
116 * dstsize is greater than the result length, dst will be NUL-terminated;
117 * otherwise not.
118 *
119 * If dstsize is zero, dst may be NULL. This is useful for calculating the
120 * required buffer size before allocating.
121 *
122 * If full is true, use special case mappings if available and if the
123 * conditions are satisfied. Otherwise, use only simple mappings and use
124 * uppercase instead of titlecase.
125 *
126 * Titlecasing requires knowledge about word boundaries, which is provided by
127 * the callback wbnext. A word boundary is the offset of the start of a word
128 * or the offset of the character immediately following a word.
129 *
130 * The caller is expected to initialize and free the callback state
131 * wbstate. The callback should first return offset 0 for the first boundary;
132 * then the offset of each subsequent word boundary; then the total length of
133 * the string to indicate the final boundary.
134 */
135size_t
136unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
137 bool full, WordBoundaryNext wbnext, void *wbstate)
138{
139 return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
140 wbstate);
141}
142
143/*
144 * unicode_strupper()
145 *
146 * Convert src to uppercase, and return the result length (not including
147 * terminating NUL).
148 *
149 * String src must be encoded in UTF-8.
150 *
151 * Result string is stored in dst, truncating if larger than dstsize. If
152 * dstsize is greater than the result length, dst will be NUL-terminated;
153 * otherwise not.
154 *
155 * If dstsize is zero, dst may be NULL. This is useful for calculating the
156 * required buffer size before allocating.
157 *
158 * If full is true, use special case mappings if available and if the
159 * conditions are satisfied.
160 */
161size_t
162unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
163 bool full)
164{
165 return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
166 NULL);
167}
168
169/*
170 * unicode_strfold()
171 *
172 * Case fold src, and return the result length (not including terminating
173 * NUL).
174 *
175 * String src must be encoded in UTF-8.
176 *
177 * Result string is stored in dst, truncating if larger than dstsize. If
178 * dstsize is greater than the result length, dst will be NUL-terminated;
179 * otherwise not.
180 *
181 * If dstsize is zero, dst may be NULL. This is useful for calculating the
182 * required buffer size before allocating.
183 */
184size_t
185unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
186 bool full)
187{
188 return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
189 NULL);
190}
191
192/*
193 * Implement Unicode Default Case Conversion algorithm.
194 *
195 * If str_casekind is CaseLower or CaseUpper, map each character in the string
196 * for which a mapping is available.
197 *
198 * If str_casekind is CaseTitle, maps characters found on a word boundary to
199 * titlecase (or uppercase if full is false) and other characters to
200 * lowercase. NB: does not currently implement the Unicode behavior in which
201 * the word boundary is adjusted to the next Cased character. That behavior
202 * could be implemented as an option, but it doesn't match the default
203 * behavior of ICU, nor does it match the documented behavior of INITCAP().
204 *
205 * If full is true, use special mappings for relevant characters, which can
206 * map a single codepoint to multiple codepoints, or depend on conditions.
207 */
208static size_t
209convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
211 void *wbstate)
212{
213 /* character CaseKind varies while titlecasing */
215 size_t srcoff = 0;
216 size_t result_len = 0;
217 size_t boundary = 0;
218
220 (str_casekind != CaseTitle && !wbnext && !wbstate));
221
222 if (str_casekind == CaseTitle)
223 {
224 boundary = wbnext(wbstate);
225 Assert(boundary == 0); /* start of text is always a boundary */
226 }
227
228 while (srcoff < srclen)
229 {
230 char32_t u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
231 int u1len = unicode_utf8len(u1);
232 char32_t simple = 0;
233 const char32_t *special = NULL;
235
236 if (str_casekind == CaseTitle)
237 {
238 if (srcoff == boundary)
239 {
241 boundary = wbnext(wbstate);
242 }
243 else
245 }
246
248 &simple, &special);
249
250 switch (casemap_result)
251 {
252 case CASEMAP_SELF:
253 /* no mapping; copy bytes from src */
254 Assert(simple == 0);
255 Assert(special == NULL);
256 if (result_len + u1len <= dstsize)
257 memcpy(dst + result_len, src + srcoff, u1len);
258
259 result_len += u1len;
260 break;
261 case CASEMAP_SIMPLE:
262 {
263 /* replace with single character */
264 char32_t u2 = simple;
265 char32_t u2len = unicode_utf8len(u2);
266
267 Assert(special == NULL);
268 if (result_len + u2len <= dstsize)
269 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
270
271 result_len += u2len;
272 }
273 break;
274 case CASEMAP_SPECIAL:
275 /* replace with up to MAX_CASE_EXPANSION characters */
276 Assert(simple == 0);
277 for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
278 {
279 char32_t u2 = special[i];
280 size_t u2len = unicode_utf8len(u2);
281
282 if (result_len + u2len <= dstsize)
283 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
284
285 result_len += u2len;
286 }
287 break;
288 }
289
290 srcoff += u1len;
291 }
292
293 if (result_len < dstsize)
294 dst[result_len] = '\0';
295
296 return result_len;
297}
298
/*
 * Check that the condition matches Final_Sigma, described in Unicode Table
 * 3-17. The character at the given offset must be directly preceded by a
 * Cased character, and must not be directly followed by a Cased character.
 *
 * Case_Ignorable characters are ignored. NB: some characters may be both
 * Cased and Case_Ignorable, in which case they are ignored.
 *
 * str is the UTF-8 string of len bytes; offset is the byte offset of the
 * character being tested. Returns false when nothing but Case_Ignorable
 * characters (or nothing at all) precedes the offset, because then no Cased
 * character precedes it.
 */
static bool
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
	bool		preceded_by_cased = false;

	/* the start of the string is not preceded by a Cased character */
	if (offset == 0)
		return false;

	/* iterate backwards, looking for Cased character */
	for (size_t i = offset; i-- > 0;)
	{
		char32_t	curr;

		/* continuation byte: keep going back to the lead byte */
		if ((str[i] & 0xC0) == 0x80)
			continue;

		/* ASCII or lead byte: a full character starts here */
		curr = utf8_to_unicode(str + i);

		if (pg_u_prop_case_ignorable(curr))
			continue;

		if (!pg_u_prop_cased(curr))
			return false;

		preceded_by_cased = true;
		break;
	}

	/* only Case_Ignorable characters before the offset */
	if (!preceded_by_cased)
		return false;

	/* end of string is not followed by a Cased character */
	if (offset == len)
		return true;

	/*
	 * Iterate forwards, looking for Cased character. Starting at offset + 1
	 * is safe for a multibyte character at offset, because its remaining
	 * bytes are continuation bytes and are skipped.
	 */
	for (size_t i = offset + 1; i < len && str[i] != '\0'; i++)
	{
		char32_t	curr;

		if ((str[i] & 0xC0) == 0x80)
			continue;

		curr = utf8_to_unicode(str + i);

		if (pg_u_prop_case_ignorable(curr))
			continue;

		/* first character that is not ignorable decides the outcome */
		return !pg_u_prop_cased(curr);
	}

	return true;
}
360
361/*
362 * Unicode allows for special casing to be applied only under certain
363 * circumstances. The only currently-supported condition is Final_Sigma.
364 */
365static bool
366check_special_conditions(int conditions, const char *str, size_t len,
367 size_t offset)
368{
369 if (conditions == 0)
370 return true;
371 else if (conditions == PG_U_FINAL_SIGMA)
372 return check_final_sigma((const unsigned char *) str, len, offset);
373
374 /* no other conditions supported */
375 Assert(false);
376 return false;
377}
378
379/*
380 * Map the given character to the requested case.
381 *
382 * If full is true, and a special case mapping is found and the conditions are
383 * met, 'special' is set to the mapping result (which is an array of up to
384 * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
385 *
386 * Otherwise, search for a simple mapping, and if found, set 'simple' to the
387 * result and return CASEMAP_SIMPLE.
388 *
389 * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
390 * character without modification.
391 */
392static enum CaseMapResult
393casemap(char32_t u1, CaseKind casekind, bool full,
394 const char *src, size_t srclen, size_t srcoff,
395 char32_t *simple, const char32_t **special)
396{
397 uint16 idx;
398
399 /* Fast path for codepoints < 0x80 */
400 if (u1 < 0x80)
401 {
402 /*
403 * The first elements in all tables are reserved as 0 (as NULL). The
404 * data starts at index 1, not 0.
405 */
406 *simple = casekind_map[casekind][u1 + 1];
407
408 return CASEMAP_SIMPLE;
409 }
410
411 idx = case_index(u1);
412
413 if (idx == 0)
414 return CASEMAP_SELF;
415
416 if (full && case_map_special[idx] &&
418 src, srclen, srcoff))
419 {
421 return CASEMAP_SPECIAL;
422 }
423
424 *simple = casekind_map[casekind][idx];
425
426 return CASEMAP_SIMPLE;
427}
428
/*
 * Look up ucs in a simple case map and return the stored entry.
 *
 * The result is 0 when the map holds no entry for ucs.
 */
static char32_t
find_case_map(char32_t ucs, const char32_t *map)
{
	size_t		slot;

	if (ucs < 0x80)
	{
		/*
		 * Fast path for codepoints < 0x80. The first elements in all tables
		 * are reserved as 0 (as NULL), so these entries sit one slot up.
		 */
		slot = (size_t) ucs + 1;
	}
	else
		slot = case_index(ucs);

	return map[slot];
}
Datum idx(PG_FUNCTION_ARGS)
Definition _int_op.c:263
#define Assert(condition)
Definition c.h:943
uint16_t uint16
Definition c.h:623
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
const char * str
int i
Definition isn.c:77
static char32_t utf8_to_unicode(const unsigned char *c)
Definition mbprint.c:53
const void size_t len
static unsigned char * unicode_to_utf8(char32_t c, unsigned char *utf8string)
Definition pg_wchar.h:428
static int unicode_utf8len(char32_t c)
Definition pg_wchar.h:460
static int fb(int x)
char32_t map[NCaseKind][MAX_CASE_EXPANSION]
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)
size_t unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen, bool full)
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen, bool full)
static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate)
char32_t unicode_titlecase_simple(char32_t code)
char32_t unicode_casefold_simple(char32_t code)
char32_t unicode_lowercase_simple(char32_t code)
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen, bool full)
static char32_t find_case_map(char32_t ucs, const char32_t *map)
static const char32_t *const casekind_map[NCaseKind]
static bool check_special_conditions(int conditions, const char *str, size_t len, size_t offset)
static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full, const char *src, size_t srclen, size_t srcoff, char32_t *simple, const char32_t **special)
CaseMapResult
@ CASEMAP_SPECIAL
@ CASEMAP_SIMPLE
@ CASEMAP_SELF
static bool check_final_sigma(const unsigned char *str, size_t len, size_t offset)
char32_t unicode_uppercase_simple(char32_t code)
size_t(* WordBoundaryNext)(void *wbstate)
#define MAX_CASE_EXPANSION
static const char32_t case_map_upper[1732]
static const char32_t case_map_fold[1732]
static const char32_t case_map_lower[1732]
#define PG_U_FINAL_SIGMA
@ CaseFold
@ CaseTitle
@ NCaseKind
@ CaseLower
@ CaseUpper
static const pg_special_case special_case[106]
static const char32_t case_map_title[1732]
static uint16 case_index(char32_t cp)
static const uint8 case_map_special[1732]
bool pg_u_prop_cased(char32_t code)
bool pg_u_prop_case_ignorable(char32_t code)