PostgreSQL Source Code  git master
unicode_norm.c File Reference
Include dependency graph for unicode_norm.c:

Go to the source code of this file.

Macros

#define ALLOC(size)   palloc(size)
 
#define FREE(size)   pfree(size)
 
#define SBASE   0xAC00 /* U+AC00 */
 
#define LBASE   0x1100 /* U+1100 */
 
#define VBASE   0x1161 /* U+1161 */
 
#define TBASE   0x11A7 /* U+11A7 */
 
#define LCOUNT   19
 
#define VCOUNT   21
 
#define TCOUNT   28
 
#define NCOUNT   VCOUNT * TCOUNT
 
#define SCOUNT   LCOUNT * NCOUNT
 

Functions

static int conv_compare (const void *p1, const void *p2)
 
static pg_unicode_decomposition * get_code_entry (pg_wchar code)
 
static const pg_wchar * get_code_decomposition (pg_unicode_decomposition *entry, int *dec_size)
 
static int get_decomposed_size (pg_wchar code)
 
static bool recompose_code (uint32 start, uint32 code, uint32 *result)
 
static void decompose_code (pg_wchar code, pg_wchar **result, int *current)
 
pg_wchar * unicode_normalize_kc (const pg_wchar *input)
 

Macro Definition Documentation

◆ ALLOC

#define ALLOC (   size)    palloc(size)

Definition at line 25 of file unicode_norm.c.

Referenced by unicode_normalize_kc().

◆ FREE

#define FREE (   size)    pfree(size)

Definition at line 26 of file unicode_norm.c.

Referenced by unicode_normalize_kc().

◆ LBASE

#define LBASE   0x1100 /* U+1100 */

Definition at line 34 of file unicode_norm.c.

Referenced by decompose_code(), and recompose_code().

◆ LCOUNT

#define LCOUNT   19

Definition at line 37 of file unicode_norm.c.

Referenced by recompose_code().

◆ NCOUNT

#define NCOUNT   VCOUNT * TCOUNT

Definition at line 40 of file unicode_norm.c.

◆ SBASE

#define SBASE   0xAC00 /* U+AC00 */

Definition at line 33 of file unicode_norm.c.

Referenced by decompose_code(), get_decomposed_size(), and recompose_code().

◆ SCOUNT

#define SCOUNT   LCOUNT * NCOUNT

Definition at line 41 of file unicode_norm.c.

Referenced by decompose_code(), get_decomposed_size(), and recompose_code().

◆ TBASE

#define TBASE   0x11A7 /* U+11A7 */

Definition at line 36 of file unicode_norm.c.

Referenced by decompose_code(), and recompose_code().

◆ TCOUNT

#define TCOUNT   28

Definition at line 39 of file unicode_norm.c.

Referenced by decompose_code(), get_decomposed_size(), and recompose_code().

◆ VBASE

#define VBASE   0x1161 /* U+1161 */

Definition at line 35 of file unicode_norm.c.

Referenced by decompose_code(), and recompose_code().

◆ VCOUNT

#define VCOUNT   21

Definition at line 38 of file unicode_norm.c.

Referenced by decompose_code(), and recompose_code().

Function Documentation

◆ conv_compare()

static int conv_compare ( const void *  p1,
const void *  p2 
)
static

Definition at line 45 of file unicode_norm.c.

Referenced by get_code_entry().

46 {
47  uint32 v1,
48  v2;
49 
50  v1 = *(const uint32 *) p1;
51  v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
52  return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
53 }
unsigned int uint32
Definition: c.h:359

◆ decompose_code()

static void decompose_code ( pg_wchar  code,
pg_wchar **  result,
int *  current 
)
static

Definition at line 227 of file unicode_norm.c.

References DECOMPOSITION_SIZE, get_code_decomposition(), get_code_entry(), i, LBASE, SBASE, SCOUNT, TBASE, TCOUNT, VBASE, and VCOUNT.

Referenced by unicode_normalize_kc().

228 {
229  pg_unicode_decomposition *entry;
230  int i;
231  const uint32 *decomp;
232  int dec_size;
233 
234  /*
235  * Fast path for Hangul characters not stored in tables to save memory as
236  * decomposition is algorithmic. See
237  * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
238  * the matter.
239  */
240  if (code >= SBASE && code < SBASE + SCOUNT)
241  {
242  uint32 l,
243  v,
244  tindex,
245  sindex;
246  pg_wchar *res = *result;
247 
248  sindex = code - SBASE;
249  l = LBASE + sindex / (VCOUNT * TCOUNT);
250  v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
251  tindex = sindex % TCOUNT;
252 
253  res[*current] = l;
254  (*current)++;
255  res[*current] = v;
256  (*current)++;
257 
258  if (tindex != 0)
259  {
260  res[*current] = TBASE + tindex;
261  (*current)++;
262  }
263 
264  return;
265  }
266 
267  entry = get_code_entry(code);
268 
269  /*
270  * Just fill in with the current decomposition if there are no
271  * decomposition codes to recurse to. A NULL entry is equivalent to a
272  * character with class 0 and no decompositions, so just leave also in
273  * this case.
274  */
275  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
276  {
277  pg_wchar *res = *result;
278 
279  res[*current] = code;
280  (*current)++;
281  return;
282  }
283 
284  /*
285  * If this entry has other decomposition codes look at them as well.
286  */
287  decomp = get_code_decomposition(entry, &dec_size);
288  for (i = 0; i < dec_size; i++)
289  {
290  pg_wchar lcode = (pg_wchar) decomp[i];
291 
292  /* Leave if no more decompositions */
293  decompose_code(lcode, result, current);
294  }
295 }
#define DECOMPOSITION_SIZE(x)
#define SBASE
Definition: unicode_norm.c:33
#define VBASE
Definition: unicode_norm.c:35
#define TBASE
Definition: unicode_norm.c:36
unsigned int uint32
Definition: c.h:359
unsigned int pg_wchar
Definition: mbprint.c:31
#define LBASE
Definition: unicode_norm.c:34
#define SCOUNT
Definition: unicode_norm.c:41
static const pg_wchar * get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
Definition: unicode_norm.c:76
static pg_unicode_decomposition * get_code_entry(pg_wchar code)
Definition: unicode_norm.c:59
int i
#define TCOUNT
Definition: unicode_norm.c:39
static void decompose_code(pg_wchar code, pg_wchar **result, int *current)
Definition: unicode_norm.c:227
#define VCOUNT
Definition: unicode_norm.c:38

◆ get_code_decomposition()

static const pg_wchar* get_code_decomposition ( pg_unicode_decomposition *  entry,
int *  dec_size 
)
static

Definition at line 76 of file unicode_norm.c.

References Assert, pg_unicode_decomposition::dec_index, DECOMPOSITION_IS_INLINE, DECOMPOSITION_SIZE, and UnicodeDecomp_codepoints.

Referenced by decompose_code(), and get_decomposed_size().

77 {
78  static pg_wchar x;
79 
80  if (DECOMPOSITION_IS_INLINE(entry))
81  {
82  Assert(DECOMPOSITION_SIZE(entry) == 1);
83  x = (pg_wchar) entry->dec_index;
84  *dec_size = 1;
85  return &x;
86  }
87  else
88  {
89  *dec_size = DECOMPOSITION_SIZE(entry);
90  return &UnicodeDecomp_codepoints[entry->dec_index];
91  }
92 }
#define DECOMPOSITION_SIZE(x)
static const uint32 UnicodeDecomp_codepoints[5090]
#define DECOMPOSITION_IS_INLINE(x)
unsigned int pg_wchar
Definition: mbprint.c:31
#define Assert(condition)
Definition: c.h:739

◆ get_code_entry()

static pg_unicode_decomposition* get_code_entry ( pg_wchar  code)
static

Definition at line 59 of file unicode_norm.c.

References conv_compare(), lengthof, and UnicodeDecompMain.

Referenced by decompose_code(), get_decomposed_size(), and unicode_normalize_kc().

60 {
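61  return bsearch(&(code),
62  (void *) UnicodeDecompMain,
63  lengthof(UnicodeDecompMain),
64  sizeof(pg_unicode_decomposition),
65  conv_compare);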
66 }
#define lengthof(array)
Definition: c.h:669
static const pg_unicode_decomposition UnicodeDecompMain[6582]
static int conv_compare(const void *p1, const void *p2)
Definition: unicode_norm.c:45

◆ get_decomposed_size()

static int get_decomposed_size ( pg_wchar  code)
static

Definition at line 101 of file unicode_norm.c.

References DECOMPOSITION_SIZE, get_code_decomposition(), get_code_entry(), i, SBASE, SCOUNT, and TCOUNT.

Referenced by unicode_normalize_kc().

102 {
103  pg_unicode_decomposition *entry;
104  int size = 0;
105  int i;
106  const uint32 *decomp;
107  int dec_size;
108 
109  /*
110  * Fast path for Hangul characters not stored in tables to save memory as
111  * decomposition is algorithmic. See
112  * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
113  * the matter.
114  */
115  if (code >= SBASE && code < SBASE + SCOUNT)
116  {
117  uint32 tindex,
118  sindex;
119 
120  sindex = code - SBASE;
121  tindex = sindex % TCOUNT;
122 
123  if (tindex != 0)
124  return 3;
125  return 2;
126  }
127 
128  entry = get_code_entry(code);
129 
130  /*
131  * Just count current code if no other decompositions. A NULL entry is
132  * equivalent to a character with class 0 and no decompositions.
133  */
134  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
135  return 1;
136 
137  /*
138  * If this entry has other decomposition codes look at them as well. First
139  * get its decomposition in the list of tables available.
140  */
141  decomp = get_code_decomposition(entry, &dec_size);
142  for (i = 0; i < dec_size; i++)
143  {
144  uint32 lcode = decomp[i];
145 
146  size += get_decomposed_size(lcode);
147  }
148 
149  return size;
150 }
static int get_decomposed_size(pg_wchar code)
Definition: unicode_norm.c:101
#define DECOMPOSITION_SIZE(x)
#define SBASE
Definition: unicode_norm.c:33
unsigned int uint32
Definition: c.h:359
#define SCOUNT
Definition: unicode_norm.c:41
static const pg_wchar * get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
Definition: unicode_norm.c:76
static pg_unicode_decomposition * get_code_entry(pg_wchar code)
Definition: unicode_norm.c:59
int i
#define TCOUNT
Definition: unicode_norm.c:39

◆ recompose_code()

static bool recompose_code ( uint32  start,
uint32  code,
uint32 *  result 
)
static

Definition at line 159 of file unicode_norm.c.

References pg_unicode_decomposition::codepoint, pg_unicode_decomposition::dec_index, DECOMPOSITION_NO_COMPOSE, DECOMPOSITION_SIZE, i, LBASE, LCOUNT, lengthof, SBASE, SCOUNT, TBASE, TCOUNT, UnicodeDecomp_codepoints, UnicodeDecompMain, VBASE, and VCOUNT.

Referenced by unicode_normalize_kc().

160 {
161  /*
162  * Handle Hangul characters algorithmically, per the Unicode spec.
163  *
164  * Check if two current characters are L and V.
165  */
166  if (start >= LBASE && start < LBASE + LCOUNT &&
167  code >= VBASE && code < VBASE + VCOUNT)
168  {
169  /* make syllable of form LV */
170  uint32 lindex = start - LBASE;
171  uint32 vindex = code - VBASE;
172 
173  *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
174  return true;
175  }
176  /* Check if two current characters are LV and T */
177  else if (start >= SBASE && start < (SBASE + SCOUNT) &&
178  ((start - SBASE) % TCOUNT) == 0 &&
179  code >= TBASE && code < (TBASE + TCOUNT))
180  {
181  /* make syllable of form LVT */
182  uint32 tindex = code - TBASE;
183 
184  *result = start + tindex;
185  return true;
186  }
187  else
188  {
189  int i;
190 
191  /*
192  * Do an inverse lookup of the decomposition tables to see if anything
193  * matches. The comparison just needs to be a perfect match on the
194  * sub-table of size two, because the start character has already been
195  * recomposed partially.
196  */
197  for (i = 0; i < lengthof(UnicodeDecompMain); i++)
198  {
199  const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
200 
201  if (DECOMPOSITION_SIZE(entry) != 2)
202  continue;
203 
204  if (DECOMPOSITION_NO_COMPOSE(entry))
205  continue;
206 
207  if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
208  code == UnicodeDecomp_codepoints[entry->dec_index + 1])
209  {
210  *result = entry->codepoint;
211  return true;
212  }
213  }
214  }
215 
216  return false;
217 }
#define DECOMPOSITION_SIZE(x)
#define LCOUNT
Definition: unicode_norm.c:37
#define lengthof(array)
Definition: c.h:669
static const uint32 UnicodeDecomp_codepoints[5090]
#define SBASE
Definition: unicode_norm.c:33
#define VBASE
Definition: unicode_norm.c:35
#define TBASE
Definition: unicode_norm.c:36
unsigned int uint32
Definition: c.h:359
#define LBASE
Definition: unicode_norm.c:34
#define SCOUNT
Definition: unicode_norm.c:41
int i
#define TCOUNT
Definition: unicode_norm.c:39
static const pg_unicode_decomposition UnicodeDecompMain[6582]
#define VCOUNT
Definition: unicode_norm.c:38
#define DECOMPOSITION_NO_COMPOSE(x)

◆ unicode_normalize_kc()

pg_wchar* unicode_normalize_kc ( const pg_wchar *  input)

Definition at line 307 of file unicode_norm.c.

References ALLOC, Assert, pg_unicode_decomposition::comb_class, current_size, decompose_code(), FREE, get_code_entry(), get_decomposed_size(), next, and recompose_code().

Referenced by main(), and pg_saslprep().

308 {
309  pg_wchar *decomp_chars;
310  pg_wchar *recomp_chars;
311  int decomp_size,
312  current_size;
313  int count;
314  const pg_wchar *p;
315 
316  /* variables for recomposition */
317  int last_class;
318  int starter_pos;
319  int target_pos;
320  uint32 starter_ch;
321 
322  /* First, do character decomposition */
323 
324  /*
325  * Calculate how many characters long the decomposed version will be.
326  */
327  decomp_size = 0;
328  for (p = input; *p; p++)
329  decomp_size += get_decomposed_size(*p);
330 
331  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
332  if (decomp_chars == NULL)
333  return NULL;
334 
335  /*
336  * Now fill in each entry recursively. This needs a second pass on the
337  * decomposition table.
338  */
339  current_size = 0;
340  for (p = input; *p; p++)
341  decompose_code(*p, &decomp_chars, &current_size);
342  decomp_chars[decomp_size] = '\0';
343  Assert(decomp_size == current_size);
344 
345  /*
346  * Now apply canonical ordering.
347  */
348  for (count = 1; count < decomp_size; count++)
349  {
350  pg_wchar prev = decomp_chars[count - 1];
351  pg_wchar next = decomp_chars[count];
352  pg_wchar tmp;
353  pg_unicode_decomposition *prevEntry = get_code_entry(prev);
354  pg_unicode_decomposition *nextEntry = get_code_entry(next);
355 
356  /*
357  * If no entries are found, the character used is either an Hangul
358  * character or a character with a class of 0 and no decompositions,
359  * so move to next result.
360  */
361  if (prevEntry == NULL || nextEntry == NULL)
362  continue;
363 
364  /*
365  * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) annex 4,
366  * a sequence of two adjacent characters in a string is an
367  * exchangeable pair if the combining class (from the Unicode
368  * Character Database) for the first character is greater than the
369  * combining class for the second, and the second is not a starter. A
370  * character is a starter if its combining class is 0.
371  */
372  if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
373  continue;
374 
375  if (prevEntry->comb_class <= nextEntry->comb_class)
376  continue;
377 
378  /* exchange can happen */
379  tmp = decomp_chars[count - 1];
380  decomp_chars[count - 1] = decomp_chars[count];
381  decomp_chars[count] = tmp;
382 
383  /* backtrack to check again */
384  if (count > 1)
385  count -= 2;
386  }
387 
388  /*
389  * The last phase of NFKC is the recomposition of the reordered Unicode
390  * string using combining classes. The recomposed string cannot be longer
391  * than the decomposed one, so make the allocation of the output string
392  * based on that assumption.
393  */
394  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
395  if (!recomp_chars)
396  {
397  FREE(decomp_chars);
398  return NULL;
399  }
400 
401  last_class = -1; /* this eliminates a special check */
402  starter_pos = 0;
403  target_pos = 1;
404  starter_ch = recomp_chars[0] = decomp_chars[0];
405 
406  for (count = 1; count < decomp_size; count++)
407  {
408  pg_wchar ch = decomp_chars[count];
409  pg_unicode_decomposition *ch_entry = get_code_entry(ch);
410  int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
411  pg_wchar composite;
412 
413  if (last_class < ch_class &&
414  recompose_code(starter_ch, ch, &composite))
415  {
416  recomp_chars[starter_pos] = composite;
417  starter_ch = composite;
418  }
419  else if (ch_class == 0)
420  {
421  starter_pos = target_pos;
422  starter_ch = ch;
423  last_class = -1;
424  recomp_chars[target_pos++] = ch;
425  }
426  else
427  {
428  last_class = ch_class;
429  recomp_chars[target_pos++] = ch;
430  }
431  }
432  recomp_chars[target_pos] = (pg_wchar) '\0';
433 
434  FREE(decomp_chars);
435 
436  return recomp_chars;
437 }
static int32 next
Definition: blutils.c:217
static int get_decomposed_size(pg_wchar code)
Definition: unicode_norm.c:101
#define ALLOC(size)
Definition: unicode_norm.c:25
unsigned int uint32
Definition: c.h:359
unsigned int pg_wchar
Definition: mbprint.c:31
#define Assert(condition)
Definition: c.h:739
#define FREE(size)
Definition: unicode_norm.c:26
static pg_unicode_decomposition * get_code_entry(pg_wchar code)
Definition: unicode_norm.c:59
int64 current_size
Definition: pg_checksums.c:69
static void decompose_code(pg_wchar code, pg_wchar **result, int *current)
Definition: unicode_norm.c:227
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
Definition: unicode_norm.c:159