PostgreSQL Source Code  git master
unicode_case.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  * unicode_case.c
3  * Unicode case mapping and case conversion.
4  *
5  * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
6  *
7  * IDENTIFICATION
8  * src/common/unicode_case.c
9  *
10  *-------------------------------------------------------------------------
11  */
12 #ifndef FRONTEND
13 #include "postgres.h"
14 #else
15 #include "postgres_fe.h"
16 #endif
17 
18 #include "common/unicode_case.h"
21 #include "mb/pg_wchar.h"
22 
23 static const pg_case_map *find_case_map(pg_wchar ucs);
24 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
25  CaseKind str_casekind, WordBoundaryNext wbnext,
26  void *wbstate);
27 
30 {
31  const pg_case_map *map = find_case_map(code);
32 
33  return map ? map->simplemap[CaseLower] : code;
34 }
35 
38 {
39  const pg_case_map *map = find_case_map(code);
40 
41  return map ? map->simplemap[CaseTitle] : code;
42 }
43 
46 {
47  const pg_case_map *map = find_case_map(code);
48 
49  return map ? map->simplemap[CaseUpper] : code;
50 }
51 
52 /*
53  * unicode_strlower()
54  *
55  * Convert src to lowercase, and return the result length (not including
56  * terminating NUL).
57  *
58  * String src must be encoded in UTF-8. If srclen < 0, src must be
59  * NUL-terminated.
60  *
61  * Result string is stored in dst, truncating if larger than dstsize. If
62  * dstsize is greater than the result length, dst will be NUL-terminated;
63  * otherwise not.
64  *
65  * If dstsize is zero, dst may be NULL. This is useful for calculating the
66  * required buffer size before allocating.
67  */
68 size_t
69 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
70 {
71  return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
72 }
73 
74 /*
75  * unicode_strtitle()
76  *
77  * Convert src to titlecase, and return the result length (not including
78  * terminating NUL).
79  *
80  * String src must be encoded in UTF-8. If srclen < 0, src must be
81  * NUL-terminated.
82  *
83  * Result string is stored in dst, truncating if larger than dstsize. If
84  * dstsize is greater than the result length, dst will be NUL-terminated;
85  * otherwise not.
86  *
87  * If dstsize is zero, dst may be NULL. This is useful for calculating the
88  * required buffer size before allocating.
89  *
90  * Titlecasing requires knowledge about word boundaries, which is provided by
91  * the callback wbnext. A word boundary is the offset of the start of a word
92  * or the offset of the character immediately following a word.
93  *
94  * The caller is expected to initialize and free the callback state
95  * wbstate. The callback should first return offset 0 for the first boundary;
96  * then the offset of each subsequent word boundary; then the total length of
97  * the string to indicate the final boundary.
98  */
99 size_t
100 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
101  WordBoundaryNext wbnext, void *wbstate)
102 {
103  return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
104  wbstate);
105 }
106 
107 /*
108  * unicode_strupper()
109  *
110  * Convert src to uppercase, and return the result length (not including
111  * terminating NUL).
112  *
113  * String src must be encoded in UTF-8. If srclen < 0, src must be
114  * NUL-terminated.
115  *
116  * Result string is stored in dst, truncating if larger than dstsize. If
117  * dstsize is greater than the result length, dst will be NUL-terminated;
118  * otherwise not.
119  *
120  * If dstsize is zero, dst may be NULL. This is useful for calculating the
121  * required buffer size before allocating.
122  */
123 size_t
124 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
125 {
126  return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
127 }
128 
129 /*
130  * If str_casekind is CaseLower or CaseUpper, map each character in the string
131  * for which a mapping is available.
132  *
133  * If str_casekind is CaseTitle, maps characters found on a word boundary to
134  * uppercase and other characters to lowercase.
135  */
136 static size_t
137 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
138  CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
139 {
140  /* character CaseKind varies while titlecasing */
141  CaseKind chr_casekind = str_casekind;
142  size_t srcoff = 0;
143  size_t result_len = 0;
144  size_t boundary = 0;
145 
146  Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
147  (str_casekind != CaseTitle && !wbnext && !wbstate));
148 
149  if (str_casekind == CaseTitle)
150  {
151  boundary = wbnext(wbstate);
152  Assert(boundary == 0); /* start of text is always a boundary */
153  }
154 
155  while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
156  {
157  pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
158  int u1len = unicode_utf8len(u1);
159  const pg_case_map *casemap = find_case_map(u1);
160 
161  if (str_casekind == CaseTitle)
162  {
163  if (srcoff == boundary)
164  {
165  chr_casekind = CaseUpper;
166  boundary = wbnext(wbstate);
167  }
168  else
169  chr_casekind = CaseLower;
170  }
171 
172  /* perform mapping, update result_len, and write to dst */
173  if (casemap)
174  {
175  pg_wchar u2 = casemap->simplemap[chr_casekind];
176  pg_wchar u2len = unicode_utf8len(u2);
177 
178  if (result_len + u2len <= dstsize)
179  unicode_to_utf8(u2, (unsigned char *) dst + result_len);
180 
181  result_len += u2len;
182  }
183  else
184  {
185  /* no mapping; copy bytes from src */
186  if (result_len + u1len <= dstsize)
187  memcpy(dst + result_len, src + srcoff, u1len);
188 
189  result_len += u1len;
190  }
191 
192  srcoff += u1len;
193  }
194 
195  if (result_len < dstsize)
196  dst[result_len] = '\0';
197 
198  return result_len;
199 }
200 
201 /* find entry in simple case map, if any */
202 static const pg_case_map *
204 {
205  int min;
206  int mid;
207  int max;
208 
209  /* all chars <= 0x80 are stored in array for fast lookup */
210  Assert(lengthof(case_map) >= 0x80);
211  if (ucs < 0x80)
212  {
213  const pg_case_map *map = &case_map[ucs];
214 
215  Assert(map->codepoint == ucs);
216  return map;
217  }
218 
219  /* otherwise, binary search */
220  min = 0x80;
221  max = lengthof(case_map) - 1;
222  while (max >= min)
223  {
224  mid = (min + max) / 2;
225  if (ucs > case_map[mid].codepoint)
226  min = mid + 1;
227  else if (ucs < case_map[mid].codepoint)
228  max = mid - 1;
229  else
230  return &case_map[mid];
231  }
232 
233  return NULL;
234 }
#define Assert(condition)
Definition: c.h:858
#define lengthof(array)
Definition: c.h:788
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned int pg_wchar
Definition: mbprint.c:31
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575
static int unicode_utf8len(pg_wchar c)
Definition: pg_wchar.h:607
pg_wchar simplemap[NCaseKind]
pg_wchar codepoint
pg_wchar unicode_uppercase_simple(pg_wchar code)
Definition: unicode_case.c:45
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: unicode_case.c:69
pg_wchar unicode_titlecase_simple(pg_wchar code)
Definition: unicode_case.c:37
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
Definition: unicode_case.c:124
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:137
static const pg_case_map * find_case_map(pg_wchar ucs)
Definition: unicode_case.c:203
pg_wchar unicode_lowercase_simple(pg_wchar code)
Definition: unicode_case.c:29
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, WordBoundaryNext wbnext, void *wbstate)
Definition: unicode_case.c:100
size_t(* WordBoundaryNext)(void *wbstate)
Definition: unicode_case.h:19
static const pg_case_map case_map[2955]
@ CaseTitle
@ CaseLower
@ CaseUpper