PostgreSQL Source Code git master
unicode_norm.h File Reference
#include "mb/pg_wchar.h"
Include dependency graph for unicode_norm.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Enumerations

enum  UnicodeNormalizationForm { UNICODE_NFC = 0 , UNICODE_NFD = 1 , UNICODE_NFKC = 2 , UNICODE_NFKD = 3 }
 
enum  UnicodeNormalizationQC { UNICODE_NORM_QC_NO = 0 , UNICODE_NORM_QC_YES = 1 , UNICODE_NORM_QC_MAYBE = -1 }
 

Functions

pg_wchar * unicode_normalize (UnicodeNormalizationForm form, const pg_wchar *input)
 
UnicodeNormalizationQC unicode_is_normalized_quickcheck (UnicodeNormalizationForm form, const pg_wchar *input)
 

Enumeration Type Documentation

◆ UnicodeNormalizationForm

Enumerator
UNICODE_NFC 
UNICODE_NFD 
UNICODE_NFKC 
UNICODE_NFKD 

Definition at line 19 of file unicode_norm.h.

20{
21 UNICODE_NFC = 0,
22 UNICODE_NFD = 1,
23 UNICODE_NFKC = 2,
24 UNICODE_NFKD = 3,
25} UnicodeNormalizationForm;
UnicodeNormalizationForm
Definition: unicode_norm.h:20
@ UNICODE_NFKD
Definition: unicode_norm.h:24
@ UNICODE_NFD
Definition: unicode_norm.h:22
@ UNICODE_NFC
Definition: unicode_norm.h:21
@ UNICODE_NFKC
Definition: unicode_norm.h:23

◆ UnicodeNormalizationQC

Enumerator
UNICODE_NORM_QC_NO 
UNICODE_NORM_QC_YES 
UNICODE_NORM_QC_MAYBE 

Definition at line 28 of file unicode_norm.h.

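29{
30 UNICODE_NORM_QC_NO = 0,
31 UNICODE_NORM_QC_YES = 1,
32 UNICODE_NORM_QC_MAYBE = -1,
33} UnicodeNormalizationQC;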
UnicodeNormalizationQC
Definition: unicode_norm.h:29
@ UNICODE_NORM_QC_YES
Definition: unicode_norm.h:31
@ UNICODE_NORM_QC_NO
Definition: unicode_norm.h:30
@ UNICODE_NORM_QC_MAYBE
Definition: unicode_norm.h:32

Function Documentation

◆ unicode_is_normalized_quickcheck()

UnicodeNormalizationQC unicode_is_normalized_quickcheck ( UnicodeNormalizationForm  form,
const pg_wchar * input 
)

Definition at line 598 of file unicode_norm.c.

599{
600 uint8 lastCanonicalClass = 0;
601 UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
602
603 /*
604 * For the "D" forms, we don't run the quickcheck. We don't include the
605 * lookup tables for those because they are huge, checking for these
606 * particular forms is less common, and running the slow path is faster
607 * for the "D" forms than the "C" forms because you don't need to
608 * recompose, which is slow.
609 */
610 if (form == UNICODE_NFD || form == UNICODE_NFKD)
611 return UNICODE_NORM_QC_MAYBE;
612
613 for (const pg_wchar *p = input; *p; p++)
614 {
615 pg_wchar ch = *p;
616 uint8 canonicalClass;
617 UnicodeNormalizationQC check;
618
619 canonicalClass = get_canonical_class(ch);
620 if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
621 return UNICODE_NORM_QC_NO;
622
623 check = qc_is_allowed(form, ch);
624 if (check == UNICODE_NORM_QC_NO)
625 return UNICODE_NORM_QC_NO;
626 else if (check == UNICODE_NORM_QC_MAYBE)
627 result = UNICODE_NORM_QC_MAYBE;
628
629 lastCanonicalClass = canonicalClass;
630 }
631 return result;
632}
uint8_t uint8
Definition: c.h:486
FILE * input
unsigned int pg_wchar
Definition: mbprint.c:31
static uint8 get_canonical_class(pg_wchar code)
Definition: unicode_norm.c:112
static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
Definition: unicode_norm.c:574

References get_canonical_class(), input, qc_is_allowed(), UNICODE_NFD, UNICODE_NFKD, UNICODE_NORM_QC_MAYBE, UNICODE_NORM_QC_NO, and UNICODE_NORM_QC_YES.

Referenced by unicode_is_normalized().

◆ unicode_normalize()

pg_wchar * unicode_normalize ( UnicodeNormalizationForm  form,
const pg_wchar * input 
)

Definition at line 402 of file unicode_norm.c.

403{
404 bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
405 bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
406 pg_wchar *decomp_chars;
407 pg_wchar *recomp_chars;
408 int decomp_size,
409 current_size;
410 int count;
411 const pg_wchar *p;
412
413 /* variables for recomposition */
414 int last_class;
415 int starter_pos;
416 int target_pos;
417 uint32 starter_ch;
418
419 /* First, do character decomposition */
420
421 /*
422 * Calculate how many characters long the decomposed version will be.
423 */
424 decomp_size = 0;
425 for (p = input; *p; p++)
426 decomp_size += get_decomposed_size(*p, compat);
427
428 decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
429 if (decomp_chars == NULL)
430 return NULL;
431
432 /*
433 * Now fill in each entry recursively. This needs a second pass on the
434 * decomposition table.
435 */
436 current_size = 0;
437 for (p = input; *p; p++)
438 decompose_code(*p, compat, &decomp_chars, &current_size);
439 decomp_chars[decomp_size] = '\0';
440 Assert(decomp_size == current_size);
441
442 /* Leave if there is nothing to decompose */
443 if (decomp_size == 0)
444 return decomp_chars;
445
446 /*
447 * Now apply canonical ordering.
448 */
449 for (count = 1; count < decomp_size; count++)
450 {
451 pg_wchar prev = decomp_chars[count - 1];
452 pg_wchar next = decomp_chars[count];
453 pg_wchar tmp;
454 const uint8 prevClass = get_canonical_class(prev);
455 const uint8 nextClass = get_canonical_class(next);
456
457 /*
458 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
459 * annex 4, a sequence of two adjacent characters in a string is an
460 * exchangeable pair if the combining class (from the Unicode
461 * Character Database) for the first character is greater than the
462 * combining class for the second, and the second is not a starter. A
463 * character is a starter if its combining class is 0.
464 */
465 if (prevClass == 0 || nextClass == 0)
466 continue;
467
468 if (prevClass <= nextClass)
469 continue;
470
471 /* exchange can happen */
472 tmp = decomp_chars[count - 1];
473 decomp_chars[count - 1] = decomp_chars[count];
474 decomp_chars[count] = tmp;
475
476 /* backtrack to check again */
477 if (count > 1)
478 count -= 2;
479 }
480
481 if (!recompose)
482 return decomp_chars;
483
484 /*
485 * The last phase of NFC and NFKC is the recomposition of the reordered
486 * Unicode string using combining classes. The recomposed string cannot be
487 * longer than the decomposed one, so make the allocation of the output
488 * string based on that assumption.
489 */
490 recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
491 if (!recomp_chars)
492 {
493 FREE(decomp_chars);
494 return NULL;
495 }
496
497 last_class = -1; /* this eliminates a special check */
498 starter_pos = 0;
499 target_pos = 1;
500 starter_ch = recomp_chars[0] = decomp_chars[0];
501
502 for (count = 1; count < decomp_size; count++)
503 {
504 pg_wchar ch = decomp_chars[count];
505 int ch_class = get_canonical_class(ch);
506 pg_wchar composite;
507
508 if (last_class < ch_class &&
509 recompose_code(starter_ch, ch, &composite))
510 {
511 recomp_chars[starter_pos] = composite;
512 starter_ch = composite;
513 }
514 else if (ch_class == 0)
515 {
516 starter_pos = target_pos;
517 starter_ch = ch;
518 last_class = -1;
519 recomp_chars[target_pos++] = ch;
520 }
521 else
522 {
523 last_class = ch_class;
524 recomp_chars[target_pos++] = ch;
525 }
526 }
527 recomp_chars[target_pos] = (pg_wchar) '\0';
528
529 FREE(decomp_chars);
530
531 return recomp_chars;
532}
static int32 next
Definition: blutils.c:219
#define Assert(condition)
Definition: c.h:815
uint32_t uint32
Definition: c.h:488
enum COMPAT_MODE compat
Definition: ecpg.c:26
static int64 current_size
Definition: pg_checksums.c:63
static void decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
Definition: unicode_norm.c:321
#define ALLOC(size)
Definition: unicode_norm.c:31
#define FREE(size)
Definition: unicode_norm.c:32
static bool recompose_code(uint32 start, uint32 code, uint32 *result)
Definition: unicode_norm.c:218
static int get_decomposed_size(pg_wchar code, bool compat)
Definition: unicode_norm.c:159

References ALLOC, Assert, compat, current_size, decompose_code(), FREE, get_canonical_class(), get_decomposed_size(), input, next, recompose_code(), UNICODE_NFC, UNICODE_NFKC, and UNICODE_NFKD.

Referenced by main(), pg_saslprep(), unicode_is_normalized(), and unicode_normalize_func().