src/common/unicode_norm.c

   1 /*-------------------------------------------------------------------------
   2  * unicode_norm.c
   3  *              Normalize a Unicode string
   4  *
   5  * This implements Unicode normalization, per the documentation at
   6  * https://www.unicode.org/reports/tr15/.
   7  *
   8  * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
   9  *
  10  * IDENTIFICATION
  11  *        src/common/unicode_norm.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #ifndef FRONTEND
  16 #include "postgres.h"
  17 #else
  18 #include "postgres_fe.h"
  19 #endif
  20
  21 #include "common/unicode_norm.h"
  22 #ifndef FRONTEND
  23 #include "common/unicode_norm_hashfunc.h"
  24 #include "common/unicode_normprops_table.h"
  25 #include "port/pg_bswap.h"
  26 #else
  27 #include "common/unicode_norm_table.h"
  28 #endif
  29
  30 #ifndef FRONTEND
  31 #define ALLOC(size) palloc(size)
  32 #define FREE(size) pfree(size)
  33 #else
  34 #define ALLOC(size) malloc(size)
  35 #define FREE(size) free(size)
  36 #endif
  37
  38 /* Constants for calculations with Hangul characters */
  39 #define SBASE           0xAC00          /* U+AC00 */
  40 #define LBASE           0x1100          /* U+1100 */
  41 #define VBASE           0x1161          /* U+1161 */
  42 #define TBASE           0x11A7          /* U+11A7 */
  43 #define LCOUNT          19
  44 #define VCOUNT          21
  45 #define TCOUNT          28
  46 #define NCOUNT          VCOUNT * TCOUNT
  47 #define SCOUNT          LCOUNT * NCOUNT
  48
  49 #ifdef FRONTEND
  50 /* comparison routine for bsearch() of decomposition lookup table. */
  51 static int
  52 conv_compare(const void *p1, const void *p2)
  53 {
  54         uint32          v1,
  55                                 v2;
  56
  57         v1 = *(const uint32 *) p1;
  58         v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
  59         return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
  60 }
  61
  62 #endif
  63
  64 /*
  65  * get_code_entry
  66  *
  67  * Get the entry corresponding to code in the decomposition lookup table.
  68  * The backend version of this code uses a perfect hash function for the
  69  * lookup, while the frontend version uses a binary search.
  70  */
  71 static const pg_unicode_decomposition *
  72 get_code_entry(pg_wchar code)
  73 {
  74 #ifndef FRONTEND
  75         int                     h;
  76         uint32          hashkey;
  77         pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
  78
  79         /*
  80          * Compute the hash function. The hash key is the codepoint with the bytes
  81          * in network order.
  82          */
  83         hashkey = pg_hton32(code);
  84         h = decompinfo.hash(&hashkey);
  85
  86         /* An out-of-range result implies no match */
  87         if (h < 0 || h >= decompinfo.num_decomps)
  88                 return NULL;
  89
  90         /*
  91          * Since it's a perfect hash, we need only match to the specific codepoint
  92          * it identifies.
  93          */
  94         if (code != decompinfo.decomps[h].codepoint)
  95                 return NULL;
  96
  97         /* Success! */
  98         return &decompinfo.decomps[h];
  99 #else
 100         return bsearch(&(code),
 101                                    UnicodeDecompMain,
 102                                    lengthof(UnicodeDecompMain),
 103                                    sizeof(pg_unicode_decomposition),
 104                                    conv_compare);
 105 #endif
 106 }
 107
 108 /*
 109  * Get the combining class of the given codepoint.
 110  */
 111 static uint8
 112 get_canonical_class(pg_wchar code)
 113 {
 114         const pg_unicode_decomposition *entry = get_code_entry(code);
 115
 116         /*
 117          * If no entries are found, the character used is either an Hangul
 118          * character or a character with a class of 0 and no decompositions.
 119          */
 120         if (!entry)
 121                 return 0;
 122         else
 123                 return entry->comb_class;
 124 }
 125
 126 /*
 127  * Given a decomposition entry looked up earlier, get the decomposed
 128  * characters.
 129  *
 130  * Note: the returned pointer can point to statically allocated buffer, and
 131  * is only valid until next call to this function!
 132  */
 133 static const pg_wchar *
 134 get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
 135 {
 136         static pg_wchar x;
 137
 138         if (DECOMPOSITION_IS_INLINE(entry))
 139         {
 140                 Assert(DECOMPOSITION_SIZE(entry) == 1);
 141                 x = (pg_wchar) entry->dec_index;
 142                 *dec_size = 1;
 143                 return &x;
 144         }
 145         else
 146         {
 147                 *dec_size = DECOMPOSITION_SIZE(entry);
 148                 return &UnicodeDecomp_codepoints[entry->dec_index];
 149         }
 150 }
 151
 152 /*
 153  * Calculate how many characters a given character will decompose to.
 154  *
 155  * This needs to recurse, if the character decomposes into characters that
 156  * are, in turn, decomposable.
 157  */
 158 static int
 159 get_decomposed_size(pg_wchar code, bool compat)
 160 {
 161         const pg_unicode_decomposition *entry;
 162         int                     size = 0;
 163         int                     i;
 164         const uint32 *decomp;
 165         int                     dec_size;
 166
 167         /*
 168          * Fast path for Hangul characters not stored in tables to save memory as
 169          * decomposition is algorithmic. See
 170          * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
 171          * on the matter.
 172          */
 173         if (code >= SBASE && code < SBASE + SCOUNT)
 174         {
 175                 uint32          tindex,
 176                                         sindex;
 177
 178                 sindex = code - SBASE;
 179                 tindex = sindex % TCOUNT;
 180
 181                 if (tindex != 0)
 182                         return 3;
 183                 return 2;
 184         }
 185
 186         entry = get_code_entry(code);
 187
 188         /*
 189          * Just count current code if no other decompositions.  A NULL entry is
 190          * equivalent to a character with class 0 and no decompositions.
 191          */
 192         if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
 193                 (!compat && DECOMPOSITION_IS_COMPAT(entry)))
 194                 return 1;
 195
 196         /*
 197          * If this entry has other decomposition codes look at them as well. First
 198          * get its decomposition in the list of tables available.
 199          */
 200         decomp = get_code_decomposition(entry, &dec_size);
 201         for (i = 0; i < dec_size; i++)
 202         {
 203                 uint32          lcode = decomp[i];
 204
 205                 size += get_decomposed_size(lcode, compat);
 206         }
 207
 208         return size;
 209 }
 210
 211 /*
 212  * Recompose a set of characters. For hangul characters, the calculation
 213  * is algorithmic. For others, an inverse lookup at the decomposition
 214  * table is necessary. Returns true if a recomposition can be done, and
 215  * false otherwise.
 216  */
 217 static bool
 218 recompose_code(uint32 start, uint32 code, uint32 *result)
 219 {
 220         /*
 221          * Handle Hangul characters algorithmically, per the Unicode spec.
 222          *
 223          * Check if two current characters are L and V.
 224          */
 225         if (start >= LBASE && start < LBASE + LCOUNT &&
 226                 code >= VBASE && code < VBASE + VCOUNT)
 227         {
 228                 /* make syllable of form LV */
 229                 uint32          lindex = start - LBASE;
 230                 uint32          vindex = code - VBASE;
 231
 232                 *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
 233                 return true;
 234         }
 235         /* Check if two current characters are LV and T */
 236         else if (start >= SBASE && start < (SBASE + SCOUNT) &&
 237                          ((start - SBASE) % TCOUNT) == 0 &&
 238                          code >= TBASE && code < (TBASE + TCOUNT))
 239         {
 240                 /* make syllable of form LVT */
 241                 uint32          tindex = code - TBASE;
 242
 243                 *result = start + tindex;
 244                 return true;
 245         }
 246         else
 247         {
 248                 const pg_unicode_decomposition *entry;
 249
 250                 /*
 251                  * Do an inverse lookup of the decomposition tables to see if anything
 252                  * matches. The comparison just needs to be a perfect match on the
 253                  * sub-table of size two, because the start character has already been
 254                  * recomposed partially.  This lookup uses a perfect hash function for
 255                  * the backend code.
 256                  */
 257 #ifndef FRONTEND
 258
 259                 int                     h,
 260                                         inv_lookup_index;
 261                 uint64          hashkey;
 262                 pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
 263
 264                 /*
 265                  * Compute the hash function. The hash key is formed by concatenating
 266                  * bytes of the two codepoints in network order. See also
 267                  * src/common/unicode/generate-unicode_norm_table.pl.
 268                  */
 269                 hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
 270                 h = recompinfo.hash(&hashkey);
 271
 272                 /* An out-of-range result implies no match */
 273                 if (h < 0 || h >= recompinfo.num_recomps)
 274                         return false;
 275
 276                 inv_lookup_index = recompinfo.inverse_lookup[h];
 277                 entry = &UnicodeDecompMain[inv_lookup_index];
 278
 279                 if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
 280                         code == UnicodeDecomp_codepoints[entry->dec_index + 1])
 281                 {
 282                         *result = entry->codepoint;
 283                         return true;
 284                 }
 285
 286 #else
 287
 288                 int                     i;
 289
 290                 for (i = 0; i < lengthof(UnicodeDecompMain); i++)
 291                 {
 292                         entry = &UnicodeDecompMain[i];
 293
 294                         if (DECOMPOSITION_SIZE(entry) != 2)
 295                                 continue;
 296
 297                         if (DECOMPOSITION_NO_COMPOSE(entry))
 298                                 continue;
 299
 300                         if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
 301                                 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
 302                         {
 303                                 *result = entry->codepoint;
 304                                 return true;
 305                         }
 306                 }
 307 #endif                                                  /* !FRONTEND */
 308         }
 309
 310         return false;
 311 }
 312
 313 /*
 314  * Decompose the given code into the array given by caller. The
 315  * decomposition begins at the position given by caller, saving one
 316  * lookup on the decomposition table. The current position needs to be
 317  * updated here to let the caller know from where to continue filling
 318  * in the array result.
 319  */
 320 static void
 321 decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
 322 {
 323         const pg_unicode_decomposition *entry;
 324         int                     i;
 325         const uint32 *decomp;
 326         int                     dec_size;
 327
 328         /*
 329          * Fast path for Hangul characters not stored in tables to save memory as
 330          * decomposition is algorithmic. See
 331          * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
 332          * on the matter.
 333          */
 334         if (code >= SBASE && code < SBASE + SCOUNT)
 335         {
 336                 uint32          l,
 337                                         v,
 338                                         tindex,
 339                                         sindex;
 340                 pg_wchar   *res = *result;
 341
 342                 sindex = code - SBASE;
 343                 l = LBASE + sindex / (VCOUNT * TCOUNT);
 344                 v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
 345                 tindex = sindex % TCOUNT;
 346
 347                 res[*current] = l;
 348                 (*current)++;
 349                 res[*current] = v;
 350                 (*current)++;
 351
 352                 if (tindex != 0)
 353                 {
 354                         res[*current] = TBASE + tindex;
 355                         (*current)++;
 356                 }
 357
 358                 return;
 359         }
 360
 361         entry = get_code_entry(code);
 362
 363         /*
 364          * Just fill in with the current decomposition if there are no
 365          * decomposition codes to recurse to.  A NULL entry is equivalent to a
 366          * character with class 0 and no decompositions, so just leave also in
 367          * this case.
 368          */
 369         if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
 370                 (!compat && DECOMPOSITION_IS_COMPAT(entry)))
 371         {
 372                 pg_wchar   *res = *result;
 373
 374                 res[*current] = code;
 375                 (*current)++;
 376                 return;
 377         }
 378
 379         /*
 380          * If this entry has other decomposition codes look at them as well.
 381          */
 382         decomp = get_code_decomposition(entry, &dec_size);
 383         for (i = 0; i < dec_size; i++)
 384         {
 385                 pg_wchar        lcode = (pg_wchar) decomp[i];
 386
 387                 /* Leave if no more decompositions */
 388                 decompose_code(lcode, compat, result, current);
 389         }
 390 }
 391
 392 /*
 393  * unicode_normalize - Normalize a Unicode string to the specified form.
 394  *
 395  * The input is a 0-terminated array of codepoints.
 396  *
 397  * In frontend, returns a 0-terminated array of codepoints, allocated with
 398  * malloc. Or NULL if we run out of memory. In backend, the returned
 399  * string is palloc'd instead, and OOM is reported with ereport().
 400  */
 401 pg_wchar *
 402 unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 403 {
 404         bool            compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
 405         bool            recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
 406         pg_wchar   *decomp_chars;
 407         pg_wchar   *recomp_chars;
 408         int                     decomp_size,
 409                                 current_size;
 410         int                     count;
 411         const pg_wchar *p;
 412
 413         /* variables for recomposition */
 414         int                     last_class;
 415         int                     starter_pos;
 416         int                     target_pos;
 417         uint32          starter_ch;
 418
 419         /* First, do character decomposition */
 420
 421         /*
 422          * Calculate how many characters long the decomposed version will be.
 423          */
 424         decomp_size = 0;
 425         for (p = input; *p; p++)
 426                 decomp_size += get_decomposed_size(*p, compat);
 427
 428         decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
 429         if (decomp_chars == NULL)
 430                 return NULL;
 431
 432         /*
 433          * Now fill in each entry recursively. This needs a second pass on the
 434          * decomposition table.
 435          */
 436         current_size = 0;
 437         for (p = input; *p; p++)
 438                 decompose_code(*p, compat, &decomp_chars, &current_size);
 439         decomp_chars[decomp_size] = '\0';
 440         Assert(decomp_size == current_size);
 441
 442         /* Leave if there is nothing to decompose */
 443         if (decomp_size == 0)
 444                 return decomp_chars;
 445
 446         /*
 447          * Now apply canonical ordering.
 448          */
 449         for (count = 1; count < decomp_size; count++)
 450         {
 451                 pg_wchar        prev = decomp_chars[count - 1];
 452                 pg_wchar        next = decomp_chars[count];
 453                 pg_wchar        tmp;
 454                 const uint8 prevClass = get_canonical_class(prev);
 455                 const uint8 nextClass = get_canonical_class(next);
 456
 457                 /*
 458                  * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
 459                  * annex 4, a sequence of two adjacent characters in a string is an
 460                  * exchangeable pair if the combining class (from the Unicode
 461                  * Character Database) for the first character is greater than the
 462                  * combining class for the second, and the second is not a starter.  A
 463                  * character is a starter if its combining class is 0.
 464                  */
 465                 if (prevClass == 0 || nextClass == 0)
 466                         continue;
 467
 468                 if (prevClass <= nextClass)
 469                         continue;
 470
 471                 /* exchange can happen */
 472                 tmp = decomp_chars[count - 1];
 473                 decomp_chars[count - 1] = decomp_chars[count];
 474                 decomp_chars[count] = tmp;
 475
 476                 /* backtrack to check again */
 477                 if (count > 1)
 478                         count -= 2;
 479         }
 480
 481         if (!recompose)
 482                 return decomp_chars;
 483
 484         /*
 485          * The last phase of NFC and NFKC is the recomposition of the reordered
 486          * Unicode string using combining classes. The recomposed string cannot be
 487          * longer than the decomposed one, so make the allocation of the output
 488          * string based on that assumption.
 489          */
 490         recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
 491         if (!recomp_chars)
 492         {
 493                 FREE(decomp_chars);
 494                 return NULL;
 495         }
 496
 497         last_class = -1;                        /* this eliminates a special check */
 498         starter_pos = 0;
 499         target_pos = 1;
 500         starter_ch = recomp_chars[0] = decomp_chars[0];
 501
 502         for (count = 1; count < decomp_size; count++)
 503         {
 504                 pg_wchar        ch = decomp_chars[count];
 505                 int                     ch_class = get_canonical_class(ch);
 506                 pg_wchar        composite;
 507
 508                 if (last_class < ch_class &&
 509                         recompose_code(starter_ch, ch, &composite))
 510                 {
 511                         recomp_chars[starter_pos] = composite;
 512                         starter_ch = composite;
 513                 }
 514                 else if (ch_class == 0)
 515                 {
 516                         starter_pos = target_pos;
 517                         starter_ch = ch;
 518                         last_class = -1;
 519                         recomp_chars[target_pos++] = ch;
 520                 }
 521                 else
 522                 {
 523                         last_class = ch_class;
 524                         recomp_chars[target_pos++] = ch;
 525                 }
 526         }
 527         recomp_chars[target_pos] = (pg_wchar) '\0';
 528
 529         FREE(decomp_chars);
 530
 531         return recomp_chars;
 532 }
 533
 534 /*
 535  * Normalization "quick check" algorithm; see
 536  * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
 537  */
 538
 539 /* We only need this in the backend. */
 540 #ifndef FRONTEND
 541
 542 static const pg_unicode_normprops *
 543 qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
 544 {
 545         int                     h;
 546         uint32          hashkey;
 547
 548         /*
 549          * Compute the hash function. The hash key is the codepoint with the bytes
 550          * in network order.
 551          */
 552         hashkey = pg_hton32(ch);
 553         h = norminfo->hash(&hashkey);
 554
 555         /* An out-of-range result implies no match */
 556         if (h < 0 || h >= norminfo->num_normprops)
 557                 return NULL;
 558
 559         /*
 560          * Since it's a perfect hash, we need only match to the specific codepoint
 561          * it identifies.
 562          */
 563         if (ch != norminfo->normprops[h].codepoint)
 564                 return NULL;
 565
 566         /* Success! */
 567         return &norminfo->normprops[h];
 568 }
 569
 570 /*
 571  * Look up the normalization quick check character property
 572  */
 573 static UnicodeNormalizationQC
 574 qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
 575 {
 576         const pg_unicode_normprops *found = NULL;
 577
 578         switch (form)
 579         {
 580                 case UNICODE_NFC:
 581                         found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
 582                         break;
 583                 case UNICODE_NFKC:
 584                         found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
 585                         break;
 586                 default:
 587                         Assert(false);
 588                         break;
 589         }
 590
 591         if (found)
 592                 return found->quickcheck;
 593         else
 594                 return UNICODE_NORM_QC_YES;
 595 }
 596
 597 UnicodeNormalizationQC
 598 unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
 599 {
 600         uint8           lastCanonicalClass = 0;
 601         UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
 602
 603         /*
 604          * For the "D" forms, we don't run the quickcheck.  We don't include the
 605          * lookup tables for those because they are huge, checking for these
 606          * particular forms is less common, and running the slow path is faster
 607          * for the "D" forms than the "C" forms because you don't need to
 608          * recompose, which is slow.
 609          */
 610         if (form == UNICODE_NFD || form == UNICODE_NFKD)
 611                 return UNICODE_NORM_QC_MAYBE;
 612
 613         for (const pg_wchar *p = input; *p; p++)
 614         {
 615                 pg_wchar        ch = *p;
 616                 uint8           canonicalClass;
 617                 UnicodeNormalizationQC check;
 618
 619                 canonicalClass = get_canonical_class(ch);
 620                 if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
 621                         return UNICODE_NORM_QC_NO;
 622
 623                 check = qc_is_allowed(form, ch);
 624                 if (check == UNICODE_NORM_QC_NO)
 625                         return UNICODE_NORM_QC_NO;
 626                 else if (check == UNICODE_NORM_QC_MAYBE)
 627                         result = UNICODE_NORM_QC_MAYBE;
 628
 629                 lastCanonicalClass = canonicalClass;
 630         }
 631         return result;
 632 }
 633
 634 #endif                                                  /* !FRONTEND */