PostgreSQL Source Code git master
unicode_case.c File Reference
Include dependency graph for unicode_case.c:

Go to the source code of this file.

Functions

static const pg_case_map * find_case_map (pg_wchar ucs)
 
static size_t convert_case (char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate)
 
static bool check_special_conditions (int conditions, const char *str, size_t len, size_t offset)
 
pg_wchar unicode_lowercase_simple (pg_wchar code)
 
pg_wchar unicode_titlecase_simple (pg_wchar code)
 
pg_wchar unicode_uppercase_simple (pg_wchar code)
 
pg_wchar unicode_casefold_simple (pg_wchar code)
 
size_t unicode_strlower (char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
 
size_t unicode_strtitle (char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate)
 
size_t unicode_strupper (char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
 
size_t unicode_strfold (char *dst, size_t dstsize, const char *src, ssize_t srclen, bool full)
 
static bool check_final_sigma (const unsigned char *str, size_t len, size_t offset)
 

Function Documentation

◆ check_final_sigma()

static bool check_final_sigma ( const unsigned char *  str,
size_t  len,
size_t  offset 
)
static

Definition at line 301 of file unicode_case.c.

302{
/*
 * Decide whether the character that starts at byte 'offset' of the UTF-8
 * string 'str' satisfies the final-sigma condition: scanning backwards, a
 * Cased character is reached before any character that is neither Cased
 * nor Case_Ignorable; scanning forwards, no Cased character is reached
 * before such a character, a NUL byte, or 'len'.
 *
 * Both scans walk byte by byte; UTF-8 continuation bytes (10xxxxxx) are
 * skipped and only lead bytes / ASCII bytes are decoded.
 *
 * NOTE(review): the loop indexes are 'int' while 'offset' and 'len' are
 * size_t, so very large offsets would be truncated and 'i < len' is a
 * mixed signed/unsigned comparison -- confirm inputs stay within int range.
 */
303 /* the start of the string is not preceded by a Cased character */
304 if (offset == 0)
305 return false;
306
307 /* iterate backwards, looking for Cased character */
/*
 * NOTE(review): if every preceding character is Case_Ignorable, this loop
 * runs off the start of the string without hitting 'break' or 'return
 * false', and control continues to the forward scan as though a Cased
 * character had been found -- confirm that is intended.
 */
308 for (int i = offset - 1; i >= 0; i--)
309 {
/* ASCII byte or multi-byte lead byte: a character starts here */
310 if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
311 {
312 pg_wchar curr = utf8_to_unicode(str + i);
313
314 if (pg_u_prop_case_ignorable(curr))
315 continue;
316 else if (pg_u_prop_cased(curr))
317 break;
318 else
319 return false;
320 }
/* continuation byte: keep stepping back to the lead byte */
321 else if ((str[i] & 0xC0) == 0x80)
322 continue;
323
324 Assert(false); /* invalid UTF-8 */
325 }
326
327 /* end of string is not followed by a Cased character */
328 if (offset == len)
329 return true;
330
331 /* iterate forwards, looking for Cased character */
/*
 * The scan starts at offset + 1; any continuation bytes belonging to the
 * character at 'offset' are skipped by the continuation-byte branch below.
 * The NUL test also stops the scan when 'len' does not bound the string.
 */
332 for (int i = offset + 1; i < len && str[i] != '\0'; i++)
333 {
334 if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
335 {
336 pg_wchar curr = utf8_to_unicode(str + i);
337
338 if (pg_u_prop_case_ignorable(curr))
339 continue;
340 else if (pg_u_prop_cased(curr))
341 return false;
342 else
343 break;
344 }
345 else if ((str[i] & 0xC0) == 0x80)
346 continue;
347
348 Assert(false); /* invalid UTF-8 */
349 }
350
351 return true;
352}
#define Assert(condition)
Definition: c.h:815
const char * str
int i
Definition: isn.c:72
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned int pg_wchar
Definition: mbprint.c:31
const void size_t len
bool pg_u_prop_cased(pg_wchar code)
bool pg_u_prop_case_ignorable(pg_wchar code)

References Assert, i, len, pg_u_prop_case_ignorable(), pg_u_prop_cased(), str, and utf8_to_unicode().

Referenced by check_special_conditions().

◆ check_special_conditions()

static bool check_special_conditions ( int  conditions,
const char *  str,
size_t  len,
size_t  offset 
)
static

Definition at line 355 of file unicode_case.c.

357{
/*
 * Report whether the special-case 'conditions' hold for the character at
 * byte 'offset' of 'str'.  A value of 0 means there is nothing to check.
 * PG_U_FINAL_SIGMA is delegated to check_final_sigma().  Any other value
 * trips the assertion and, in non-assert builds, is reported as "not
 * satisfied".
 *
 * NOTE(review): 'conditions' is compared by equality, so a value combining
 * PG_U_FINAL_SIGMA with other bits would reach the Assert.
 */
358 if (conditions == 0)
359 return true;
360 else if (conditions == PG_U_FINAL_SIGMA)
361 return check_final_sigma((unsigned char *) str, len, offset);
362
363 /* no other conditions supported */
364 Assert(false);
365 return false;
366}
static bool check_final_sigma(const unsigned char *str, size_t len, size_t offset)
Definition: unicode_case.c:301
#define PG_U_FINAL_SIGMA

References Assert, check_final_sigma(), len, PG_U_FINAL_SIGMA, and str.

Referenced by convert_case().

◆ convert_case()

static size_t convert_case ( char *  dst,
size_t  dstsize,
const char *  src,
ssize_t  srclen,
CaseKind  str_casekind,
bool  full,
WordBoundaryNext  wbnext,
void *  wbstate 
)
static

Definition at line 194 of file unicode_case.c.

197{
/*
 * Shared worker for the unicode_str*() functions: convert the UTF-8 string
 * 'src' to the case given by 'str_casekind', writing into 'dst'.
 *
 * srclen < 0 means "process until the first NUL byte"; otherwise at most
 * srclen bytes are processed (a NUL byte still ends the loop early).
 *
 * The return value is the number of bytes the full result occupies, not
 * counting a terminator.  It keeps growing even after the output stops
 * fitting in 'dstsize'; in that case the writes are simply skipped.  A NUL
 * terminator is stored only when result_len < dstsize.
 *
 * For CaseTitle, 'wbnext'/'wbstate' supply successive word-boundary byte
 * offsets; for every other kind both must be NULL (see the Assert below).
 */
198 /* character CaseKind varies while titlecasing */
199 CaseKind chr_casekind = str_casekind;
200 size_t srcoff = 0;
201 size_t result_len = 0;
202 size_t boundary = 0;
203
204 Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
205 (str_casekind != CaseTitle && !wbnext && !wbstate));
206
207 if (str_casekind == CaseTitle)
208 {
209 boundary = wbnext(wbstate);
210 Assert(boundary == 0); /* start of text is always a boundary */
211 }
212
213 while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
214 {
215 pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
216 int u1len = unicode_utf8len(u1);
217 const pg_case_map *casemap = find_case_map(u1);
218 const pg_special_case *special = NULL;
219
220 if (str_casekind == CaseTitle)
221 {
/*
 * The character at a word boundary is titlecased (or uppercased when
 * 'full' is false) and the next boundary is fetched; every other
 * character is lowercased.
 */
222 if (srcoff == boundary)
223 {
224 chr_casekind = full ? CaseTitle : CaseUpper;
225 boundary = wbnext(wbstate);
226 }
227 else
228 chr_casekind = CaseLower;
229 }
230
231 /*
232 * Find special case that matches the conditions, if any.
233 *
234 * Note: only a single special mapping per codepoint is currently
235 * supported, though Unicode allows for multiple special mappings for
236 * a single codepoint.
237 */
238 if (full && casemap && casemap->special_case)
239 {
240 int16 conditions = casemap->special_case->conditions;
241
242 Assert(casemap->special_case->codepoint == u1);
/*
 * NOTE(review): 'srclen' is ssize_t but the callee's 'len' parameter is
 * size_t, so a negative srclen arrives as a very large length; the
 * forward scan in check_final_sigma() also stops at a NUL byte.
 */
243 if (check_special_conditions(conditions, src, srclen, srcoff))
244 special = casemap->special_case;
245 }
246
247 /* perform mapping, update result_len, and write to dst */
248 if (special)
249 {
/* special mapping: up to MAX_CASE_EXPANSION codepoints, ended by 0 */
250 for (int i = 0; i < MAX_CASE_EXPANSION; i++)
251 {
252 pg_wchar u2 = special->map[chr_casekind][i];
253 size_t u2len = unicode_utf8len(u2);
254
255 if (u2 == '\0')
256 break;
257
258 if (result_len + u2len <= dstsize)
259 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
260
261 result_len += u2len;
262 }
263 }
264 else if (casemap)
265 {
/* simple one-to-one mapping */
266 pg_wchar u2 = casemap->simplemap[chr_casekind];
/* NOTE(review): a byte length held in pg_wchar; the branch above uses size_t */
267 pg_wchar u2len = unicode_utf8len(u2);
268
269 if (result_len + u2len <= dstsize)
270 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
271
272 result_len += u2len;
273 }
274 else
275 {
276 /* no mapping; copy bytes from src */
277 if (result_len + u1len <= dstsize)
278 memcpy(dst + result_len, src + srcoff, u1len);
279
280 result_len += u1len;
281 }
282
283 srcoff += u1len;
284 }
285
/* terminate only if there is room; callers compare the result to dstsize */
286 if (result_len < dstsize)
287 dst[result_len] = '\0';
288
289 return result_len;
290}
int16_t int16
Definition: c.h:483
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575
static int unicode_utf8len(pg_wchar c)
Definition: pg_wchar.h:607
pg_wchar simplemap[NCaseKind]
const pg_special_case * special_case
pg_wchar map[NCaseKind][MAX_CASE_EXPANSION]
static const pg_case_map * find_case_map(pg_wchar ucs)
Definition: unicode_case.c:370
static bool check_special_conditions(int conditions, const char *str, size_t len, size_t offset)
Definition: unicode_case.c:355
#define MAX_CASE_EXPANSION
@ CaseTitle
@ CaseLower
@ CaseUpper

References Assert, CaseLower, CaseTitle, CaseUpper, check_special_conditions(), pg_special_case::codepoint, pg_special_case::conditions, find_case_map(), i, pg_special_case::map, MAX_CASE_EXPANSION, pg_case_map::simplemap, pg_case_map::special_case, unicode_to_utf8(), unicode_utf8len(), and utf8_to_unicode().

Referenced by unicode_strfold(), unicode_strlower(), unicode_strtitle(), and unicode_strupper().

◆ find_case_map()

static const pg_case_map * find_case_map ( pg_wchar  ucs)
static

Definition at line 370 of file unicode_case.c.

371{
/*
 * Return the case_map entry for codepoint 'ucs', or NULL if the table has
 * no entry for it.
 *
 * Codepoints below 0x80 are looked up by direct index.  Everything else is
 * located by binary search over the entries from index 0x80 onward, which
 * relies on those entries being ordered by ascending codepoint.
 */
372 int min;
373 int mid;
374 int max;
375
376 /* all chars < 0x80 are stored in array for fast lookup */
377 Assert(lengthof(case_map) >= 0x80);
378 if (ucs < 0x80)
379 {
380 const pg_case_map *map = &case_map[ucs];
381
382 Assert(map->codepoint == ucs);
383 return map;
384 }
385
386 /* otherwise, binary search */
387 min = 0x80;
388 max = lengthof(case_map) - 1;
389 while (max >= min)
390 {
391 mid = (min + max) / 2;
392 if (ucs > case_map[mid].codepoint)
393 min = mid + 1;
394 else if (ucs < case_map[mid].codepoint)
395 max = mid - 1;
396 else
397 return &case_map[mid];
398 }
399
/* search range exhausted: no entry for this codepoint */
400 return NULL;
401}
#define lengthof(array)
Definition: c.h:745
pg_wchar codepoint
static const pg_case_map case_map[3003]

References Assert, case_map, pg_case_map::codepoint, and lengthof.

Referenced by convert_case(), unicode_casefold_simple(), unicode_lowercase_simple(), unicode_titlecase_simple(), and unicode_uppercase_simple().

◆ unicode_casefold_simple()

pg_wchar unicode_casefold_simple ( pg_wchar  code)

Definition at line 55 of file unicode_case.c.

56{
/*
 * Return the simple case-folded form of 'code'; a codepoint with no
 * case_map entry is returned unchanged.
 */
57 const pg_case_map *map = find_case_map(code);
58
59 return map ? map->simplemap[CaseFold] : code;
60}
@ CaseFold

References CaseFold, find_case_map(), and pg_case_map::simplemap.

◆ unicode_lowercase_simple()

pg_wchar unicode_lowercase_simple ( pg_wchar  code)

Definition at line 31 of file unicode_case.c.

32{
/*
 * Return the simple lowercase form of 'code'; a codepoint with no case_map
 * entry is returned unchanged.
 */
33 const pg_case_map *map = find_case_map(code);
34
35 return map ? map->simplemap[CaseLower] : code;
36}

References CaseLower, find_case_map(), and pg_case_map::simplemap.

Referenced by pg_wc_tolower().

◆ unicode_strfold()

size_t unicode_strfold ( char *  dst,
size_t  dstsize,
const char *  src,
ssize_t  srclen,
bool  full 
)

Definition at line 170 of file unicode_case.c.

172{
/*
 * Case-fold 'src' into 'dst' by delegating to convert_case() with CaseFold
 * and no word-boundary callback; returns convert_case()'s result.
 */
173 return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
174 NULL);
175}
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:194

References CaseFold, and convert_case().

Referenced by strfold_builtin(), and tfunc_fold().

◆ unicode_strlower()

size_t unicode_strlower ( char *  dst,
size_t  dstsize,
const char *  src,
ssize_t  srclen,
bool  full 
)

Definition at line 82 of file unicode_case.c.

84{
/*
 * Lowercase 'src' into 'dst' by delegating to convert_case() with CaseLower
 * and no word-boundary callback; returns convert_case()'s result.
 */
85 return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
86 NULL);
87}

References CaseLower, and convert_case().

Referenced by strlower_builtin(), and tfunc_lower().

◆ unicode_strtitle()

size_t unicode_strtitle ( char *  dst,
size_t  dstsize,
const char *  src,
ssize_t  srclen,
bool  full,
WordBoundaryNext  wbnext,
void *  wbstate 
)

Definition at line 119 of file unicode_case.c.

121{
/*
 * Titlecase 'src' into 'dst' by delegating to convert_case() with CaseTitle,
 * forwarding the caller's word-boundary callback 'wbnext' and its state
 * 'wbstate'; returns convert_case()'s result.
 */
122 return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
123 wbstate);
124}

References CaseTitle, and convert_case().

Referenced by strtitle_builtin(), and tfunc_title().

◆ unicode_strupper()

size_t unicode_strupper ( char *  dst,
size_t  dstsize,
const char *  src,
ssize_t  srclen,
bool  full 
)

Definition at line 146 of file unicode_case.c.

148{
/*
 * Uppercase 'src' into 'dst' by delegating to convert_case() with CaseUpper
 * and no word-boundary callback; returns convert_case()'s result.
 */
149 return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
150 NULL);
151}

References CaseUpper, and convert_case().

Referenced by strupper_builtin(), and tfunc_upper().

◆ unicode_titlecase_simple()

pg_wchar unicode_titlecase_simple ( pg_wchar  code)

Definition at line 39 of file unicode_case.c.

40{
/*
 * Return the simple titlecase form of 'code'; a codepoint with no case_map
 * entry is returned unchanged.
 */
41 const pg_case_map *map = find_case_map(code);
42
43 return map ? map->simplemap[CaseTitle] : code;
44}

References CaseTitle, find_case_map(), and pg_case_map::simplemap.

◆ unicode_uppercase_simple()

pg_wchar unicode_uppercase_simple ( pg_wchar  code)

Definition at line 47 of file unicode_case.c.

48{
/*
 * Return the simple uppercase form of 'code'; a codepoint with no case_map
 * entry is returned unchanged.
 */
49 const pg_case_map *map = find_case_map(code);
50
51 return map ? map->simplemap[CaseUpper] : code;
52}

References CaseUpper, find_case_map(), and pg_case_map::simplemap.

Referenced by pg_wc_toupper().