PostgreSQL Source Code  git master
unicode_norm.h File Reference
#include "mb/pg_wchar.h"
Include dependency graph for unicode_norm.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Functions

pg_wchar * unicode_normalize_kc (const pg_wchar *input)
 

Function Documentation

◆ unicode_normalize_kc()

pg_wchar* unicode_normalize_kc ( const pg_wchar * input)

Definition at line 307 of file unicode_norm.c.

References ALLOC, Assert, pg_unicode_decomposition::comb_class, current_size, decompose_code(), FREE, get_code_entry(), get_decomposed_size(), next, and recompose_code().

Referenced by main(), and pg_saslprep().

308 {
309  pg_wchar *decomp_chars;
310  pg_wchar *recomp_chars;
311  int decomp_size,
312  current_size;
313  int count;
314  const pg_wchar *p;
315 
316  /* variables for recomposition */
317  int last_class;
318  int starter_pos;
319  int target_pos;
320  uint32 starter_ch;
321 
322  /* First, do character decomposition */
323 
324  /*
325  * Calculate how many characters long the decomposed version will be.
326  */
327  decomp_size = 0;
328  for (p = input; *p; p++)
329  decomp_size += get_decomposed_size(*p);
330 
331  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
332  if (decomp_chars == NULL)
333  return NULL;
334 
335  /*
336  * Now fill in each entry recursively. This needs a second pass on the
337  * decomposition table.
338  */
339  current_size = 0;
340  for (p = input; *p; p++)
341  decompose_code(*p, &decomp_chars, &current_size);
342  decomp_chars[decomp_size] = '\0';
343  Assert(decomp_size == current_size);
344 
345  /*
346  * Now apply canonical ordering.
347  */
348  for (count = 1; count < decomp_size; count++)
349  {
350  pg_wchar prev = decomp_chars[count - 1];
351  pg_wchar next = decomp_chars[count];
352  pg_wchar tmp;
353  pg_unicode_decomposition *prevEntry = get_code_entry(prev);
354  pg_unicode_decomposition *nextEntry = get_code_entry(next);
355 
356  /*
357  * If no entries are found, the character used is either a Hangul
358  * character or a character with a class of 0 and no decompositions,
359  * so move to next result.
360  */
361  if (prevEntry == NULL || nextEntry == NULL)
362  continue;
363 
364  /*
365  * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) annex 4,
366  * a sequence of two adjacent characters in a string is an
367  * exchangeable pair if the combining class (from the Unicode
368  * Character Database) for the first character is greater than the
369  * combining class for the second, and the second is not a starter. A
370  * character is a starter if its combining class is 0.
371  */
372  if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
373  continue;
374 
375  if (prevEntry->comb_class <= nextEntry->comb_class)
376  continue;
377 
378  /* exchange can happen */
379  tmp = decomp_chars[count - 1];
380  decomp_chars[count - 1] = decomp_chars[count];
381  decomp_chars[count] = tmp;
382 
383  /* backtrack to check again */
384  if (count > 1)
385  count -= 2;
386  }
387 
388  /*
389  * The last phase of NFKC is the recomposition of the reordered Unicode
390  * string using combining classes. The recomposed string cannot be longer
391  * than the decomposed one, so make the allocation of the output string
392  * based on that assumption.
393  */
394  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
395  if (!recomp_chars)
396  {
397  FREE(decomp_chars);
398  return NULL;
399  }
400 
401  last_class = -1; /* this eliminates a special check */
402  starter_pos = 0;
403  target_pos = 1;
404  starter_ch = recomp_chars[0] = decomp_chars[0];
405 
406  for (count = 1; count < decomp_size; count++)
407  {
408  pg_wchar ch = decomp_chars[count];
409  pg_unicode_decomposition *ch_entry = get_code_entry(ch);
410  int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
411  pg_wchar composite;
412 
413  if (last_class < ch_class &&
414  recompose_code(starter_ch, ch, &composite))
415  {
416  recomp_chars[starter_pos] = composite;
417  starter_ch = composite;
418  }
419  else if (ch_class == 0)
420  {
421  starter_pos = target_pos;
422  starter_ch = ch;
423  last_class = -1;
424  recomp_chars[target_pos++] = ch;
425  }
426  else
427  {
428  last_class = ch_class;
429  recomp_chars[target_pos++] = ch;
430  }
431  }
432  recomp_chars[target_pos] = (pg_wchar) '\0';
433 
434  FREE(decomp_chars);
435 
436  return recomp_chars;
437 }
static int32 next
Definition: blutils.c:213
static int get_decomposed_size(pg_wchar code)
Definition: unicode_norm.c:101
#define ALLOC(size)
Definition: unicode_norm.c:25
unsigned int uint32
Definition: c.h:359
unsigned int pg_wchar
Definition: mbprint.c:31
#define Assert(condition)
Definition: c.h:739
#define FREE(size)
Definition: unicode_norm.c:26
static pg_unicode_decomposition * get_code_entry(pg_wchar code)
Definition: unicode_norm.c:59
int64 current_size
Definition: pg_checksums.c:69
static void decompose_code(pg_wchar code, pg_wchar **result, int *current)
Definition: unicode_norm.c:227
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
Definition: unicode_norm.c:159