PostgreSQL Source Code git master
Loading...
Searching...
No Matches
unicode_norm.h File Reference
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Enumerations

enum  UnicodeNormalizationForm { UNICODE_NFC = 0 , UNICODE_NFD = 1 , UNICODE_NFKC = 2 , UNICODE_NFKD = 3 }
 
enum  UnicodeNormalizationQC { UNICODE_NORM_QC_NO = 0 , UNICODE_NORM_QC_YES = 1 , UNICODE_NORM_QC_MAYBE = -1 }
 

Functions

char32_t * unicode_normalize (UnicodeNormalizationForm form, const char32_t *input)
 
UnicodeNormalizationQC unicode_is_normalized_quickcheck (UnicodeNormalizationForm form, const char32_t *input)
 

Enumeration Type Documentation

◆ UnicodeNormalizationForm

Enumerator
UNICODE_NFC 
UNICODE_NFD 
UNICODE_NFKC 
UNICODE_NFKD 

Definition at line 17 of file unicode_norm.h.

18{
19 UNICODE_NFC = 0,
20 UNICODE_NFD = 1,
21 UNICODE_NFKC = 2,
22 UNICODE_NFKD = 3,
UnicodeNormalizationForm
@ UNICODE_NFKD
@ UNICODE_NFD
@ UNICODE_NFC
@ UNICODE_NFKC

◆ UnicodeNormalizationQC

Enumerator
UNICODE_NORM_QC_NO 
UNICODE_NORM_QC_YES 
UNICODE_NORM_QC_MAYBE 

Definition at line 26 of file unicode_norm.h.

27{
UnicodeNormalizationQC
@ UNICODE_NORM_QC_YES
@ UNICODE_NORM_QC_NO
@ UNICODE_NORM_QC_MAYBE

Function Documentation

◆ unicode_is_normalized_quickcheck()

UnicodeNormalizationQC unicode_is_normalized_quickcheck ( UnicodeNormalizationForm  form,
const char32_t * input 
)
extern

Definition at line 617 of file unicode_norm.c.

618{
621
622 /*
623 * For the "D" forms, we don't run the quickcheck. We don't include the
624 * lookup tables for those because they are huge, checking for these
625 * particular forms is less common, and running the slow path is faster
626 * for the "D" forms than the "C" forms because you don't need to
627 * recompose, which is slow.
628 */
629 if (form == UNICODE_NFD || form == UNICODE_NFKD)
631
632 for (const char32_t *p = input; *p; p++)
633 {
634 char32_t ch = *p;
637
640 return UNICODE_NORM_QC_NO;
641
642 check = qc_is_allowed(form, ch);
643 if (check == UNICODE_NORM_QC_NO)
644 return UNICODE_NORM_QC_NO;
645 else if (check == UNICODE_NORM_QC_MAYBE)
647
649 }
650 return result;
651}
uint8_t uint8
Definition c.h:622
uint32 result
FILE * input
static int fb(int x)
static uint8 get_canonical_class(char32_t code)
static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)

References fb(), get_canonical_class(), input, qc_is_allowed(), result, UNICODE_NFD, UNICODE_NFKD, UNICODE_NORM_QC_MAYBE, UNICODE_NORM_QC_NO, and UNICODE_NORM_QC_YES.

Referenced by unicode_is_normalized().

◆ unicode_normalize()

char32_t * unicode_normalize ( UnicodeNormalizationForm  form,
const char32_t * input 
)
extern

Definition at line 403 of file unicode_norm.c.

404{
405 bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
406 bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
407 char32_t *decomp_chars;
408 char32_t *recomp_chars;
409 int decomp_size,
411 int count;
412 const char32_t *p;
413
414 /* variables for recomposition */
415 int last_class;
416 int starter_pos;
417 int target_pos;
419
420 /* First, do character decomposition */
421
422 /*
423 * Calculate how many characters long the decomposed version will be.
424 *
425 * Some characters decompose to quite a few code points, so that the
426 * decomposed version's size could overrun MaxAllocSize, and even 32-bit
427 * size_t, even though the input string presumably fits in that. In
428 * frontend we want to just return NULL in that case, so monitor the sum
429 * and exit early once we'd need more than MaxAllocSize bytes.
430 */
431 decomp_size = 0;
432 for (p = input; *p; p++)
433 {
435 if (unlikely(decomp_size > MaxAllocSize / sizeof(char32_t)))
436 {
437#ifndef FRONTEND
438 /* Exit loop and let palloc() throw error below */
439 break;
440#else
441 /* Just return NULL with no explicit error */
442 return NULL;
443#endif
444 }
445 }
446
447 decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
448 if (decomp_chars == NULL)
449 return NULL;
450
451 /*
452 * Now fill in each entry recursively. This needs a second pass on the
453 * decomposition table.
454 */
455 current_size = 0;
456 for (p = input; *p; p++)
460
461 /* Leave if there is nothing to decompose */
462 if (decomp_size == 0)
463 return decomp_chars;
464
465 /*
466 * Now apply canonical ordering.
467 */
468 for (count = 1; count < decomp_size; count++)
469 {
470 char32_t prev = decomp_chars[count - 1];
471 char32_t next = decomp_chars[count];
472 char32_t tmp;
473 const uint8 prevClass = get_canonical_class(prev);
475
476 /*
477 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
478 * annex 4, a sequence of two adjacent characters in a string is an
479 * exchangeable pair if the combining class (from the Unicode
480 * Character Database) for the first character is greater than the
481 * combining class for the second, and the second is not a starter. A
482 * character is a starter if its combining class is 0.
483 */
484 if (prevClass == 0 || nextClass == 0)
485 continue;
486
487 if (prevClass <= nextClass)
488 continue;
489
490 /* exchange can happen */
491 tmp = decomp_chars[count - 1];
492 decomp_chars[count - 1] = decomp_chars[count];
493 decomp_chars[count] = tmp;
494
495 /* backtrack to check again */
496 if (count > 1)
497 count -= 2;
498 }
499
500 if (!recompose)
501 return decomp_chars;
502
503 /*
504 * The last phase of NFC and NFKC is the recomposition of the reordered
505 * Unicode string using combining classes. The recomposed string cannot be
506 * longer than the decomposed one, so make the allocation of the output
507 * string based on that assumption.
508 */
509 recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
510 if (!recomp_chars)
511 {
513 return NULL;
514 }
515
516 last_class = -1; /* this eliminates a special check */
517 starter_pos = 0;
518 target_pos = 1;
520
521 for (count = 1; count < decomp_size; count++)
522 {
523 char32_t ch = decomp_chars[count];
525 char32_t composite;
526
527 if (last_class < ch_class &&
528 recompose_code(starter_ch, ch, &composite))
529 {
530 recomp_chars[starter_pos] = composite;
531 starter_ch = composite;
532 }
533 else if (ch_class == 0)
534 {
536 starter_ch = ch;
537 last_class = -1;
539 }
540 else
541 {
544 }
545 }
547
549
550 return recomp_chars;
551}
static int32 next
Definition blutils.c:225
#define Assert(condition)
Definition c.h:943
#define unlikely(x)
Definition c.h:438
uint32_t uint32
Definition c.h:624
uint32_t char32_t
Definition c.h:1504
enum COMPAT_MODE compat
Definition ecpg.c:26
#define MaxAllocSize
Definition fe_memutils.h:22
static int64 current_size
static void decompose_code(char32_t code, bool compat, char32_t **result, int *current)
#define ALLOC(size)
#define FREE(size)
static int get_decomposed_size(char32_t code, bool compat)
static bool recompose_code(uint32 start, uint32 code, uint32 *result)

References ALLOC, Assert, compat, current_size, decompose_code(), fb(), FREE, get_canonical_class(), get_decomposed_size(), input, MaxAllocSize, next, recompose_code(), UNICODE_NFC, UNICODE_NFKC, UNICODE_NFKD, and unlikely.

Referenced by main(), pg_saslprep(), unicode_is_normalized(), and unicode_normalize_func().