PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
unicode_case.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 * unicode_case.c
3 * Unicode case mapping and case conversion.
4 *
5 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
6 *
7 * IDENTIFICATION
8 * src/common/unicode_case.c
9 *
10 *-------------------------------------------------------------------------
11 */
12#ifndef FRONTEND
13#include "postgres.h"
14#else
15#include "postgres_fe.h"
16#endif
17
18#include "common/unicode_case.h"
20#include "mb/pg_wchar.h"
21
22static const pg_case_map *find_case_map(pg_wchar ucs);
23static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
24 CaseKind str_casekind, WordBoundaryNext wbnext,
25 void *wbstate);
26
29{
30 const pg_case_map *map = find_case_map(code);
31
32 return map ? map->simplemap[CaseLower] : code;
33}
34
37{
38 const pg_case_map *map = find_case_map(code);
39
40 return map ? map->simplemap[CaseTitle] : code;
41}
42
45{
46 const pg_case_map *map = find_case_map(code);
47
48 return map ? map->simplemap[CaseUpper] : code;
49}
50
51/*
52 * unicode_strlower()
53 *
54 * Convert src to lowercase, and return the result length (not including
55 * terminating NUL).
56 *
57 * String src must be encoded in UTF-8. If srclen < 0, src must be
58 * NUL-terminated.
59 *
60 * Result string is stored in dst, truncating if larger than dstsize. If
61 * dstsize is greater than the result length, dst will be NUL-terminated;
62 * otherwise not.
63 *
64 * If dstsize is zero, dst may be NULL. This is useful for calculating the
65 * required buffer size before allocating.
66 */
67size_t
68unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
69{
70 return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
71}
72
73/*
74 * unicode_strtitle()
75 *
76 * Convert src to titlecase, and return the result length (not including
77 * terminating NUL).
78 *
79 * String src must be encoded in UTF-8. If srclen < 0, src must be
80 * NUL-terminated.
81 *
82 * Result string is stored in dst, truncating if larger than dstsize. If
83 * dstsize is greater than the result length, dst will be NUL-terminated;
84 * otherwise not.
85 *
86 * If dstsize is zero, dst may be NULL. This is useful for calculating the
87 * required buffer size before allocating.
88 *
89 * Titlecasing requires knowledge about word boundaries, which is provided by
90 * the callback wbnext. A word boundary is the offset of the start of a word
91 * or the offset of the character immediately following a word.
92 *
93 * The caller is expected to initialize and free the callback state
94 * wbstate. The callback should first return offset 0 for the first boundary;
95 * then the offset of each subsequent word boundary; then the total length of
96 * the string to indicate the final boundary.
97 */
98size_t
99unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
100 WordBoundaryNext wbnext, void *wbstate)
101{
102 return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
103 wbstate);
104}
105
106/*
107 * unicode_strupper()
108 *
109 * Convert src to uppercase, and return the result length (not including
110 * terminating NUL).
111 *
112 * String src must be encoded in UTF-8. If srclen < 0, src must be
113 * NUL-terminated.
114 *
115 * Result string is stored in dst, truncating if larger than dstsize. If
116 * dstsize is greater than the result length, dst will be NUL-terminated;
117 * otherwise not.
118 *
119 * If dstsize is zero, dst may be NULL. This is useful for calculating the
120 * required buffer size before allocating.
121 */
122size_t
123unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
124{
125 return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
126}
127
128/*
129 * If str_casekind is CaseLower or CaseUpper, map each character in the string
130 * for which a mapping is available.
131 *
132 * If str_casekind is CaseTitle, maps characters found on a word boundary to
133 * uppercase and other characters to lowercase.
134 */
135static size_t
136convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
137 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
138{
139 /* character CaseKind varies while titlecasing */
140 CaseKind chr_casekind = str_casekind;
141 size_t srcoff = 0;
142 size_t result_len = 0;
143 size_t boundary = 0;
144
145 Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
146 (str_casekind != CaseTitle && !wbnext && !wbstate));
147
148 if (str_casekind == CaseTitle)
149 {
150 boundary = wbnext(wbstate);
151 Assert(boundary == 0); /* start of text is always a boundary */
152 }
153
154 while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
155 {
156 pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
157 int u1len = unicode_utf8len(u1);
158 const pg_case_map *casemap = find_case_map(u1);
159
160 if (str_casekind == CaseTitle)
161 {
162 if (srcoff == boundary)
163 {
164 chr_casekind = CaseUpper;
165 boundary = wbnext(wbstate);
166 }
167 else
168 chr_casekind = CaseLower;
169 }
170
171 /* perform mapping, update result_len, and write to dst */
172 if (casemap)
173 {
174 pg_wchar u2 = casemap->simplemap[chr_casekind];
175 pg_wchar u2len = unicode_utf8len(u2);
176
177 if (result_len + u2len <= dstsize)
178 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
179
180 result_len += u2len;
181 }
182 else
183 {
184 /* no mapping; copy bytes from src */
185 if (result_len + u1len <= dstsize)
186 memcpy(dst + result_len, src + srcoff, u1len);
187
188 result_len += u1len;
189 }
190
191 srcoff += u1len;
192 }
193
194 if (result_len < dstsize)
195 dst[result_len] = '\0';
196
197 return result_len;
198}
199
200/* find entry in simple case map, if any */
201static const pg_case_map *
203{
204 int min;
205 int mid;
206 int max;
207
208 /* all chars <= 0x80 are stored in array for fast lookup */
209 Assert(lengthof(case_map) >= 0x80);
210 if (ucs < 0x80)
211 {
212 const pg_case_map *map = &case_map[ucs];
213
214 Assert(map->codepoint == ucs);
215 return map;
216 }
217
218 /* otherwise, binary search */
219 min = 0x80;
220 max = lengthof(case_map) - 1;
221 while (max >= min)
222 {
223 mid = (min + max) / 2;
224 if (ucs > case_map[mid].codepoint)
225 min = mid + 1;
226 else if (ucs < case_map[mid].codepoint)
227 max = mid - 1;
228 else
229 return &case_map[mid];
230 }
231
232 return NULL;
233}
#define Assert(condition)
Definition: c.h:812
#define lengthof(array)
Definition: c.h:742
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned int pg_wchar
Definition: mbprint.c:31
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575
static int unicode_utf8len(pg_wchar c)
Definition: pg_wchar.h:607
pg_wchar simplemap[NCaseKind]
pg_wchar codepoint
pg_wchar unicode_uppercase_simple(pg_wchar code)
Definition: unicode_case.c:44
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: unicode_case.c:68
pg_wchar unicode_titlecase_simple(pg_wchar code)
Definition: unicode_case.c:36
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: unicode_case.c:123
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:136
static const pg_case_map * find_case_map(pg_wchar ucs)
Definition: unicode_case.c:202
pg_wchar unicode_lowercase_simple(pg_wchar code)
Definition: unicode_case.c:28
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:99
size_t(* WordBoundaryNext)(void *wbstate)
Definition: unicode_case.h:19
static const pg_case_map case_map[2955]
@ CaseTitle
@ CaseLower
@ CaseUpper