PostgreSQL Source Code  git master
unicode_norm.c File Reference
Include dependency graph for unicode_norm.c:

Go to the source code of this file.

Macros

#define ALLOC(size)   palloc(size)
 
#define FREE(size)   pfree(size)
 
#define SBASE   0xAC00 /* U+AC00 */
 
#define LBASE   0x1100 /* U+1100 */
 
#define VBASE   0x1161 /* U+1161 */
 
#define TBASE   0x11A7 /* U+11A7 */
 
#define LCOUNT   19
 
#define VCOUNT   21
 
#define TCOUNT   28
 
#define NCOUNT   VCOUNT * TCOUNT
 
#define SCOUNT   LCOUNT * NCOUNT
 

Functions

static const pg_unicode_decomposition * get_code_entry (pg_wchar code)
 
static uint8 get_canonical_class (pg_wchar code)
 
static const pg_wchar * get_code_decomposition (const pg_unicode_decomposition *entry, int *dec_size)
 
static int get_decomposed_size (pg_wchar code, bool compat)
 
static bool recompose_code (uint32 start, uint32 code, uint32 *result)
 
static void decompose_code (pg_wchar code, bool compat, pg_wchar **result, int *current)
 
pg_wchar * unicode_normalize (UnicodeNormalizationForm form, const pg_wchar *input)
 
static const pg_unicode_normprops * qc_hash_lookup (pg_wchar ch, const pg_unicode_norminfo *norminfo)
 
static UnicodeNormalizationQC qc_is_allowed (UnicodeNormalizationForm form, pg_wchar ch)
 
UnicodeNormalizationQC unicode_is_normalized_quickcheck (UnicodeNormalizationForm form, const pg_wchar *input)
 

Macro Definition Documentation

◆ ALLOC

#define ALLOC (   size)    palloc(size)

Definition at line 31 of file unicode_norm.c.

Referenced by unicode_normalize().

◆ FREE

#define FREE (   size)    pfree(size)

Definition at line 32 of file unicode_norm.c.

Referenced by unicode_normalize().

◆ LBASE

#define LBASE   0x1100 /* U+1100 */

Definition at line 40 of file unicode_norm.c.

Referenced by decompose_code(), and recompose_code().

◆ LCOUNT

#define LCOUNT   19

Definition at line 43 of file unicode_norm.c.

Referenced by recompose_code().

◆ NCOUNT

#define NCOUNT   VCOUNT * TCOUNT

Definition at line 46 of file unicode_norm.c.

◆ SBASE

#define SBASE   0xAC00 /* U+AC00 */

Definition at line 39 of file unicode_norm.c.

Referenced by decompose_code(), get_decomposed_size(), and recompose_code().

◆ SCOUNT

#define SCOUNT   LCOUNT * NCOUNT

Definition at line 47 of file unicode_norm.c.

Referenced by decompose_code(), get_decomposed_size(), and recompose_code().

◆ TBASE

#define TBASE   0x11A7 /* U+11A7 */

Definition at line 42 of file unicode_norm.c.

Referenced by decompose_code(), and recompose_code().

◆ TCOUNT

#define TCOUNT   28

Definition at line 45 of file unicode_norm.c.

Referenced by decompose_code(), get_decomposed_size(), and recompose_code().

◆ VBASE

#define VBASE   0x1161 /* U+1161 */

Definition at line 41 of file unicode_norm.c.

Referenced by decompose_code(), and recompose_code().

◆ VCOUNT

#define VCOUNT   21

Definition at line 44 of file unicode_norm.c.

Referenced by decompose_code(), and recompose_code().

Function Documentation

◆ decompose_code()

static void decompose_code ( pg_wchar  code,
bool  compat,
pg_wchar **  result,
int *  current 
)
static

Definition at line 321 of file unicode_norm.c.

References DECOMPOSITION_IS_COMPAT, DECOMPOSITION_SIZE, get_code_decomposition(), get_code_entry(), i, LBASE, SBASE, SCOUNT, TBASE, TCOUNT, VBASE, and VCOUNT.

Referenced by unicode_normalize().

322 {
323  const pg_unicode_decomposition *entry;
324  int i;
325  const uint32 *decomp;
326  int dec_size;
327 
328  /*
329  * Fast path for Hangul characters not stored in tables to save memory as
330  * decomposition is algorithmic. See
331  * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
332  * on the matter.
333  */
334  if (code >= SBASE && code < SBASE + SCOUNT)
335  {
336  uint32 l,
337  v,
338  tindex,
339  sindex;
340  pg_wchar *res = *result;
341 
342  sindex = code - SBASE;
343  l = LBASE + sindex / (VCOUNT * TCOUNT);
344  v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
345  tindex = sindex % TCOUNT;
346 
347  res[*current] = l;
348  (*current)++;
349  res[*current] = v;
350  (*current)++;
351 
352  if (tindex != 0)
353  {
354  res[*current] = TBASE + tindex;
355  (*current)++;
356  }
357 
358  return;
359  }
360 
361  entry = get_code_entry(code);
362 
363  /*
364  * Just fill in with the current decomposition if there are no
365  * decomposition codes to recurse to. A NULL entry is equivalent to a
366  * character with class 0 and no decompositions, so just leave also in
367  * this case.
368  */
369  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
370  (!compat && DECOMPOSITION_IS_COMPAT(entry)))
371  {
372  pg_wchar *res = *result;
373 
374  res[*current] = code;
375  (*current)++;
376  return;
377  }
378 
379  /*
380  * If this entry has other decomposition codes look at them as well.
381  */
382  decomp = get_code_decomposition(entry, &dec_size);
383  for (i = 0; i < dec_size; i++)
384  {
385  pg_wchar lcode = (pg_wchar) decomp[i];
386 
387  /* Leave if no more decompositions */
388  decompose_code(lcode, compat, result, current);
389  }
390 }
#define DECOMPOSITION_SIZE(x)
#define DECOMPOSITION_IS_COMPAT(x)
static const pg_unicode_decomposition * get_code_entry(pg_wchar code)
Definition: unicode_norm.c:72
#define SBASE
Definition: unicode_norm.c:39
#define VBASE
Definition: unicode_norm.c:41
#define TBASE
Definition: unicode_norm.c:42
unsigned int uint32
Definition: c.h:441
unsigned int pg_wchar
Definition: mbprint.c:31
enum COMPAT_MODE compat
Definition: ecpg.c:25
static const pg_wchar * get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
Definition: unicode_norm.c:134
#define LBASE
Definition: unicode_norm.c:40
#define SCOUNT
Definition: unicode_norm.c:47
int i
static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
Definition: unicode_norm.c:321
#define TCOUNT
Definition: unicode_norm.c:45
#define VCOUNT
Definition: unicode_norm.c:44

◆ get_canonical_class()

static uint8 get_canonical_class ( pg_wchar  code)
static

Definition at line 112 of file unicode_norm.c.

References pg_unicode_decomposition::comb_class, and get_code_entry().

Referenced by unicode_is_normalized_quickcheck(), and unicode_normalize().

113 {
114  const pg_unicode_decomposition *entry = get_code_entry(code);
115 
116  /*
117  * If no entries are found, the character used is either an Hangul
118  * character or a character with a class of 0 and no decompositions.
119  */
120  if (!entry)
121  return 0;
122  else
123  return entry->comb_class;
124 }
static const pg_unicode_decomposition * get_code_entry(pg_wchar code)
Definition: unicode_norm.c:72

◆ get_code_decomposition()

static const pg_wchar* get_code_decomposition ( const pg_unicode_decomposition *  entry,
int *  dec_size 
)
static

Definition at line 134 of file unicode_norm.c.

References Assert, pg_unicode_decomposition::dec_index, DECOMPOSITION_IS_INLINE, DECOMPOSITION_SIZE, and UnicodeDecomp_codepoints.

Referenced by decompose_code(), and get_decomposed_size().

135 {
136  static pg_wchar x;
137 
138  if (DECOMPOSITION_IS_INLINE(entry))
139  {
140  Assert(DECOMPOSITION_SIZE(entry) == 1);
141  x = (pg_wchar) entry->dec_index;
142  *dec_size = 1;
143  return &x;
144  }
145  else
146  {
147  *dec_size = DECOMPOSITION_SIZE(entry);
148  return &UnicodeDecomp_codepoints[entry->dec_index];
149  }
150 }
#define DECOMPOSITION_SIZE(x)
static const uint32 UnicodeDecomp_codepoints[5098]
#define DECOMPOSITION_IS_INLINE(x)
unsigned int pg_wchar
Definition: mbprint.c:31
#define Assert(condition)
Definition: c.h:804

◆ get_code_entry()

static const pg_unicode_decomposition* get_code_entry ( pg_wchar  code)
static

Definition at line 72 of file unicode_norm.c.

References pg_unicode_decomposition::codepoint, pg_unicode_decompinfo::decomps, pg_unicode_decompinfo::hash, lengthof, pg_unicode_decompinfo::num_decomps, pg_hton32, UnicodeDecompInfo, and UnicodeDecompMain.

Referenced by decompose_code(), get_canonical_class(), and get_decomposed_size().

73 {
74 #ifndef FRONTEND
75  int h;
76  uint32 hashkey;
77  pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
78 
79  /*
80  * Compute the hash function. The hash key is the codepoint with the bytes
81  * in network order.
82  */
83  hashkey = pg_hton32(code);
84  h = decompinfo.hash(&hashkey);
85 
86  /* An out-of-range result implies no match */
87  if (h < 0 || h >= decompinfo.num_decomps)
88  return NULL;
89 
90  /*
91  * Since it's a perfect hash, we need only match to the specific codepoint
92  * it identifies.
93  */
94  if (code != decompinfo.decomps[h].codepoint)
95  return NULL;
96 
97  /* Success! */
98  return &decompinfo.decomps[h];
99 #else
100  return bsearch(&(code),
101  UnicodeDecompMain,
102  lengthof(UnicodeDecompMain),
103  sizeof(pg_unicode_decomposition),
104  conv_compare);
105 #endif
106 }
const pg_unicode_decomposition * decomps
#define lengthof(array)
Definition: c.h:734
#define pg_hton32(x)
Definition: pg_bswap.h:121
unsigned int uint32
Definition: c.h:441
static const pg_unicode_decompinfo UnicodeDecompInfo
static const pg_unicode_decomposition UnicodeDecompMain[6703]

◆ get_decomposed_size()

static int get_decomposed_size ( pg_wchar  code,
bool  compat 
)
static

Definition at line 159 of file unicode_norm.c.

References DECOMPOSITION_IS_COMPAT, DECOMPOSITION_SIZE, get_code_decomposition(), get_code_entry(), i, SBASE, SCOUNT, and TCOUNT.

Referenced by unicode_normalize().

160 {
161  const pg_unicode_decomposition *entry;
162  int size = 0;
163  int i;
164  const uint32 *decomp;
165  int dec_size;
166 
167  /*
168  * Fast path for Hangul characters not stored in tables to save memory as
169  * decomposition is algorithmic. See
170  * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
171  * on the matter.
172  */
173  if (code >= SBASE && code < SBASE + SCOUNT)
174  {
175  uint32 tindex,
176  sindex;
177 
178  sindex = code - SBASE;
179  tindex = sindex % TCOUNT;
180 
181  if (tindex != 0)
182  return 3;
183  return 2;
184  }
185 
186  entry = get_code_entry(code);
187 
188  /*
189  * Just count current code if no other decompositions. A NULL entry is
190  * equivalent to a character with class 0 and no decompositions.
191  */
192  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
193  (!compat && DECOMPOSITION_IS_COMPAT(entry)))
194  return 1;
195 
196  /*
197  * If this entry has other decomposition codes look at them as well. First
198  * get its decomposition in the list of tables available.
199  */
200  decomp = get_code_decomposition(entry, &dec_size);
201  for (i = 0; i < dec_size; i++)
202  {
203  uint32 lcode = decomp[i];
204 
205  size += get_decomposed_size(lcode, compat);
206  }
207 
208  return size;
209 }
#define DECOMPOSITION_SIZE(x)
#define DECOMPOSITION_IS_COMPAT(x)
static const pg_unicode_decomposition * get_code_entry(pg_wchar code)
Definition: unicode_norm.c:72
#define SBASE
Definition: unicode_norm.c:39
static int get_decomposed_size(pg_wchar code, bool compat)
Definition: unicode_norm.c:159
unsigned int uint32
Definition: c.h:441
enum COMPAT_MODE compat
Definition: ecpg.c:25
static const pg_wchar * get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
Definition: unicode_norm.c:134
#define SCOUNT
Definition: unicode_norm.c:47
int i
#define TCOUNT
Definition: unicode_norm.c:45

◆ qc_hash_lookup()

static const pg_unicode_normprops* qc_hash_lookup ( pg_wchar  ch,
const pg_unicode_norminfo *  norminfo 
)
static

Definition at line 539 of file unicode_norm.c.

References pg_unicode_normprops::codepoint, pg_unicode_norminfo::hash, pg_unicode_norminfo::normprops, pg_unicode_norminfo::num_normprops, and pg_hton32.

Referenced by qc_is_allowed().

540 {
541  int h;
542  uint32 hashkey;
543 
544  /*
545  * Compute the hash function. The hash key is the codepoint with the bytes
546  * in network order.
547  */
548  hashkey = pg_hton32(ch);
549  h = norminfo->hash(&hashkey);
550 
551  /* An out-of-range result implies no match */
552  if (h < 0 || h >= norminfo->num_normprops)
553  return NULL;
554 
555  /*
556  * Since it's a perfect hash, we need only match to the specific codepoint
557  * it identifies.
558  */
559  if (ch != norminfo->normprops[h].codepoint)
560  return NULL;
561 
562  /* Success! */
563  return &norminfo->normprops[h];
564 }
const pg_unicode_normprops * normprops
#define pg_hton32(x)
Definition: pg_bswap.h:121
unsigned int uint32
Definition: c.h:441

◆ qc_is_allowed()

static UnicodeNormalizationQC qc_is_allowed ( UnicodeNormalizationForm  form,
pg_wchar  ch 
)
static

Definition at line 570 of file unicode_norm.c.

References Assert, qc_hash_lookup(), pg_unicode_normprops::quickcheck, UNICODE_NFC, UNICODE_NFKC, UNICODE_NORM_QC_YES, UnicodeNormInfo_NFC_QC, and UnicodeNormInfo_NFKC_QC.

Referenced by unicode_is_normalized_quickcheck().

571 {
572  const pg_unicode_normprops *found = NULL;
573 
574  switch (form)
575  {
576  case UNICODE_NFC:
577  found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
578  break;
579  case UNICODE_NFKC:
580  found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
581  break;
582  default:
583  Assert(false);
584  break;
585  }
586 
587  if (found)
588  return found->quickcheck;
589  else
590  return UNICODE_NORM_QC_YES;
591 }
static const pg_unicode_normprops * qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
Definition: unicode_norm.c:539
static const pg_unicode_norminfo UnicodeNormInfo_NFKC_QC
static const pg_unicode_norminfo UnicodeNormInfo_NFC_QC
#define Assert(condition)
Definition: c.h:804

◆ recompose_code()

static bool recompose_code ( uint32  start,
uint32  code,
uint32 *  result 
)
static

Definition at line 218 of file unicode_norm.c.

References pg_unicode_decomposition::codepoint, pg_unicode_decomposition::dec_index, DECOMPOSITION_NO_COMPOSE, DECOMPOSITION_SIZE, pg_unicode_recompinfo::hash, i, pg_unicode_recompinfo::inverse_lookup, LBASE, LCOUNT, lengthof, pg_unicode_recompinfo::num_recomps, pg_hton64, SBASE, SCOUNT, TBASE, TCOUNT, UnicodeDecomp_codepoints, UnicodeDecompMain, UnicodeRecompInfo, VBASE, and VCOUNT.

Referenced by unicode_normalize().

219 {
220  /*
221  * Handle Hangul characters algorithmically, per the Unicode spec.
222  *
223  * Check if two current characters are L and V.
224  */
225  if (start >= LBASE && start < LBASE + LCOUNT &&
226  code >= VBASE && code < VBASE + VCOUNT)
227  {
228  /* make syllable of form LV */
229  uint32 lindex = start - LBASE;
230  uint32 vindex = code - VBASE;
231 
232  *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
233  return true;
234  }
235  /* Check if two current characters are LV and T */
236  else if (start >= SBASE && start < (SBASE + SCOUNT) &&
237  ((start - SBASE) % TCOUNT) == 0 &&
238  code >= TBASE && code < (TBASE + TCOUNT))
239  {
240  /* make syllable of form LVT */
241  uint32 tindex = code - TBASE;
242 
243  *result = start + tindex;
244  return true;
245  }
246  else
247  {
248  const pg_unicode_decomposition *entry;
249 
250  /*
251  * Do an inverse lookup of the decomposition tables to see if anything
252  * matches. The comparison just needs to be a perfect match on the
253  * sub-table of size two, because the start character has already been
254  * recomposed partially. This lookup uses a perfect hash function for
255  * the backend code.
256  */
257 #ifndef FRONTEND
258 
259  int h,
260  inv_lookup_index;
261  uint64 hashkey;
262  pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
263 
264  /*
265  * Compute the hash function. The hash key is formed by concatenating
266  * bytes of the two codepoints in network order. See also
267  * src/common/unicode/generate-unicode_norm_table.pl.
268  */
269  hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
270  h = recompinfo.hash(&hashkey);
271 
272  /* An out-of-range result implies no match */
273  if (h < 0 || h >= recompinfo.num_recomps)
274  return false;
275 
276  inv_lookup_index = recompinfo.inverse_lookup[h];
277  entry = &UnicodeDecompMain[inv_lookup_index];
278 
279  if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
280  code == UnicodeDecomp_codepoints[entry->dec_index + 1])
281  {
282  *result = entry->codepoint;
283  return true;
284  }
285 
286 #else
287 
288  int i;
289 
290  for (i = 0; i < lengthof(UnicodeDecompMain); i++)
291  {
292  entry = &UnicodeDecompMain[i];
293 
294  if (DECOMPOSITION_SIZE(entry) != 2)
295  continue;
296 
297  if (DECOMPOSITION_NO_COMPOSE(entry))
298  continue;
299 
300  if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
301  code == UnicodeDecomp_codepoints[entry->dec_index + 1])
302  {
303  *result = entry->codepoint;
304  return true;
305  }
306  }
307 #endif /* !FRONTEND */
308  }
309 
310  return false;
311 }
static const pg_unicode_recompinfo UnicodeRecompInfo
#define pg_hton64(x)
Definition: pg_bswap.h:122
#define DECOMPOSITION_SIZE(x)
#define LCOUNT
Definition: unicode_norm.c:43
#define lengthof(array)
Definition: c.h:734
static const uint32 UnicodeDecomp_codepoints[5098]
#define SBASE
Definition: unicode_norm.c:39
#define VBASE
Definition: unicode_norm.c:41
#define TBASE
Definition: unicode_norm.c:42
unsigned int uint32
Definition: c.h:441
#define LBASE
Definition: unicode_norm.c:40
#define SCOUNT
Definition: unicode_norm.c:47
static const pg_unicode_decomposition UnicodeDecompMain[6703]
int i
#define TCOUNT
Definition: unicode_norm.c:45
#define VCOUNT
Definition: unicode_norm.c:44
#define DECOMPOSITION_NO_COMPOSE(x)

◆ unicode_is_normalized_quickcheck()

UnicodeNormalizationQC unicode_is_normalized_quickcheck ( UnicodeNormalizationForm  form,
const pg_wchar *  input 
)

Definition at line 594 of file unicode_norm.c.

References get_canonical_class(), qc_is_allowed(), UNICODE_NFD, UNICODE_NFKD, UNICODE_NORM_QC_MAYBE, UNICODE_NORM_QC_NO, and UNICODE_NORM_QC_YES.

Referenced by unicode_is_normalized().

595 {
596  uint8 lastCanonicalClass = 0;
597  UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
598 
599  /*
600  * For the "D" forms, we don't run the quickcheck. We don't include the
601  * lookup tables for those because they are huge, checking for these
602  * particular forms is less common, and running the slow path is faster
603  * for the "D" forms than the "C" forms because you don't need to
604  * recompose, which is slow.
605  */
606  if (form == UNICODE_NFD || form == UNICODE_NFKD)
607  return UNICODE_NORM_QC_MAYBE;
608 
609  for (const pg_wchar *p = input; *p; p++)
610  {
611  pg_wchar ch = *p;
612  uint8 canonicalClass;
613  UnicodeNormalizationQC check;
614 
615  canonicalClass = get_canonical_class(ch);
616  if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
617  return UNICODE_NORM_QC_NO;
618 
619  check = qc_is_allowed(form, ch);
620  if (check == UNICODE_NORM_QC_NO)
621  return UNICODE_NORM_QC_NO;
622  else if (check == UNICODE_NORM_QC_MAYBE)
623  result = UNICODE_NORM_QC_MAYBE;
624 
625  lastCanonicalClass = canonicalClass;
626  }
627  return result;
628 }
unsigned char uint8
Definition: c.h:439
UnicodeNormalizationQC
Definition: unicode_norm.h:28
unsigned int pg_wchar
Definition: mbprint.c:31
static uint8 get_canonical_class(pg_wchar code)
Definition: unicode_norm.c:112
static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
Definition: unicode_norm.c:570

◆ unicode_normalize()

pg_wchar* unicode_normalize ( UnicodeNormalizationForm  form,
const pg_wchar *  input 
)

Definition at line 402 of file unicode_norm.c.

References ALLOC, Assert, compat, current_size, decompose_code(), FREE, get_canonical_class(), get_decomposed_size(), next, recompose_code(), UNICODE_NFC, UNICODE_NFKC, and UNICODE_NFKD.

Referenced by main(), pg_saslprep(), unicode_is_normalized(), and unicode_normalize_func().

403 {
404  bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
405  bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
406  pg_wchar *decomp_chars;
407  pg_wchar *recomp_chars;
408  int decomp_size,
409  current_size;
410  int count;
411  const pg_wchar *p;
412 
413  /* variables for recomposition */
414  int last_class;
415  int starter_pos;
416  int target_pos;
417  uint32 starter_ch;
418 
419  /* First, do character decomposition */
420 
421  /*
422  * Calculate how many characters long the decomposed version will be.
423  */
424  decomp_size = 0;
425  for (p = input; *p; p++)
426  decomp_size += get_decomposed_size(*p, compat);
427 
428  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
429  if (decomp_chars == NULL)
430  return NULL;
431 
432  /*
433  * Now fill in each entry recursively. This needs a second pass on the
434  * decomposition table.
435  */
436  current_size = 0;
437  for (p = input; *p; p++)
438  decompose_code(*p, compat, &decomp_chars, &current_size);
439  decomp_chars[decomp_size] = '\0';
440  Assert(decomp_size == current_size);
441 
442  /*
443  * Now apply canonical ordering.
444  */
445  for (count = 1; count < decomp_size; count++)
446  {
447  pg_wchar prev = decomp_chars[count - 1];
448  pg_wchar next = decomp_chars[count];
449  pg_wchar tmp;
450  const uint8 prevClass = get_canonical_class(prev);
451  const uint8 nextClass = get_canonical_class(next);
452 
453  /*
454  * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
455  * annex 4, a sequence of two adjacent characters in a string is an
456  * exchangeable pair if the combining class (from the Unicode
457  * Character Database) for the first character is greater than the
458  * combining class for the second, and the second is not a starter. A
459  * character is a starter if its combining class is 0.
460  */
461  if (prevClass == 0 || nextClass == 0)
462  continue;
463 
464  if (prevClass <= nextClass)
465  continue;
466 
467  /* exchange can happen */
468  tmp = decomp_chars[count - 1];
469  decomp_chars[count - 1] = decomp_chars[count];
470  decomp_chars[count] = tmp;
471 
472  /* backtrack to check again */
473  if (count > 1)
474  count -= 2;
475  }
476 
477  if (!recompose)
478  return decomp_chars;
479 
480  /*
481  * The last phase of NFC and NFKC is the recomposition of the reordered
482  * Unicode string using combining classes. The recomposed string cannot be
483  * longer than the decomposed one, so make the allocation of the output
484  * string based on that assumption.
485  */
486  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
487  if (!recomp_chars)
488  {
489  FREE(decomp_chars);
490  return NULL;
491  }
492 
493  last_class = -1; /* this eliminates a special check */
494  starter_pos = 0;
495  target_pos = 1;
496  starter_ch = recomp_chars[0] = decomp_chars[0];
497 
498  for (count = 1; count < decomp_size; count++)
499  {
500  pg_wchar ch = decomp_chars[count];
501  int ch_class = get_canonical_class(ch);
502  pg_wchar composite;
503 
504  if (last_class < ch_class &&
505  recompose_code(starter_ch, ch, &composite))
506  {
507  recomp_chars[starter_pos] = composite;
508  starter_ch = composite;
509  }
510  else if (ch_class == 0)
511  {
512  starter_pos = target_pos;
513  starter_ch = ch;
514  last_class = -1;
515  recomp_chars[target_pos++] = ch;
516  }
517  else
518  {
519  last_class = ch_class;
520  recomp_chars[target_pos++] = ch;
521  }
522  }
523  recomp_chars[target_pos] = (pg_wchar) '\0';
524 
525  FREE(decomp_chars);
526 
527  return recomp_chars;
528 }
static int32 next
Definition: blutils.c:219
unsigned char uint8
Definition: c.h:439
#define ALLOC(size)
Definition: unicode_norm.c:31
static int get_decomposed_size(pg_wchar code, bool compat)
Definition: unicode_norm.c:159
unsigned int uint32
Definition: c.h:441
unsigned int pg_wchar
Definition: mbprint.c:31
enum COMPAT_MODE compat
Definition: ecpg.c:25
static uint8 get_canonical_class(pg_wchar code)
Definition: unicode_norm.c:112
#define Assert(condition)
Definition: c.h:804
#define FREE(size)
Definition: unicode_norm.c:32
static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
Definition: unicode_norm.c:321
int64 current_size
Definition: pg_checksums.c:73
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
Definition: unicode_norm.c:218