doc PG 17 relnotes: add FETCH_COUNT item
[pgsql.git] / src / common / unicode_norm.c
blobff8a03b84939f6f9be98f072ae90308aa0efe1f0
1 /*-------------------------------------------------------------------------
2 * unicode_norm.c
3 * Normalize a Unicode string
5 * This implements Unicode normalization, per the documentation at
6 * https://www.unicode.org/reports/tr15/.
8 * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
10 * IDENTIFICATION
11 * src/common/unicode_norm.c
13 *-------------------------------------------------------------------------
15 #ifndef FRONTEND
16 #include "postgres.h"
17 #else
18 #include "postgres_fe.h"
19 #endif
21 #include "common/unicode_norm.h"
22 #ifndef FRONTEND
23 #include "common/unicode_norm_hashfunc.h"
24 #include "common/unicode_normprops_table.h"
25 #include "port/pg_bswap.h"
26 #else
27 #include "common/unicode_norm_table.h"
28 #endif
30 #ifndef FRONTEND
31 #define ALLOC(size) palloc(size)
32 #define FREE(size) pfree(size)
33 #else
34 #define ALLOC(size) malloc(size)
35 #define FREE(size) free(size)
36 #endif
38 /* Constants for calculations with Hangul characters */
39 #define SBASE 0xAC00 /* U+AC00 */
40 #define LBASE 0x1100 /* U+1100 */
41 #define VBASE 0x1161 /* U+1161 */
42 #define TBASE 0x11A7 /* U+11A7 */
43 #define LCOUNT 19
44 #define VCOUNT 21
45 #define TCOUNT 28
46 #define NCOUNT VCOUNT * TCOUNT
47 #define SCOUNT LCOUNT * NCOUNT
49 #ifdef FRONTEND
50 /* comparison routine for bsearch() of decomposition lookup table. */
51 static int
52 conv_compare(const void *p1, const void *p2)
54 uint32 v1,
55 v2;
57 v1 = *(const uint32 *) p1;
58 v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
59 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
62 #endif
65 * get_code_entry
67 * Get the entry corresponding to code in the decomposition lookup table.
68 * The backend version of this code uses a perfect hash function for the
69 * lookup, while the frontend version uses a binary search.
71 static const pg_unicode_decomposition *
72 get_code_entry(pg_wchar code)
74 #ifndef FRONTEND
75 int h;
76 uint32 hashkey;
77 pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
80 * Compute the hash function. The hash key is the codepoint with the bytes
81 * in network order.
83 hashkey = pg_hton32(code);
84 h = decompinfo.hash(&hashkey);
86 /* An out-of-range result implies no match */
87 if (h < 0 || h >= decompinfo.num_decomps)
88 return NULL;
91 * Since it's a perfect hash, we need only match to the specific codepoint
92 * it identifies.
94 if (code != decompinfo.decomps[h].codepoint)
95 return NULL;
97 /* Success! */
98 return &decompinfo.decomps[h];
99 #else
100 return bsearch(&(code),
101 UnicodeDecompMain,
102 lengthof(UnicodeDecompMain),
103 sizeof(pg_unicode_decomposition),
104 conv_compare);
105 #endif
109 * Get the combining class of the given codepoint.
111 static uint8
112 get_canonical_class(pg_wchar code)
114 const pg_unicode_decomposition *entry = get_code_entry(code);
117 * If no entries are found, the character used is either an Hangul
118 * character or a character with a class of 0 and no decompositions.
120 if (!entry)
121 return 0;
122 else
123 return entry->comb_class;
127 * Given a decomposition entry looked up earlier, get the decomposed
128 * characters.
130 * Note: the returned pointer can point to statically allocated buffer, and
131 * is only valid until next call to this function!
133 static const pg_wchar *
134 get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
136 static pg_wchar x;
138 if (DECOMPOSITION_IS_INLINE(entry))
140 Assert(DECOMPOSITION_SIZE(entry) == 1);
141 x = (pg_wchar) entry->dec_index;
142 *dec_size = 1;
143 return &x;
145 else
147 *dec_size = DECOMPOSITION_SIZE(entry);
148 return &UnicodeDecomp_codepoints[entry->dec_index];
153 * Calculate how many characters a given character will decompose to.
155 * This needs to recurse, if the character decomposes into characters that
156 * are, in turn, decomposable.
158 static int
159 get_decomposed_size(pg_wchar code, bool compat)
161 const pg_unicode_decomposition *entry;
162 int size = 0;
163 int i;
164 const uint32 *decomp;
165 int dec_size;
168 * Fast path for Hangul characters not stored in tables to save memory as
169 * decomposition is algorithmic. See
170 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
171 * on the matter.
173 if (code >= SBASE && code < SBASE + SCOUNT)
175 uint32 tindex,
176 sindex;
178 sindex = code - SBASE;
179 tindex = sindex % TCOUNT;
181 if (tindex != 0)
182 return 3;
183 return 2;
186 entry = get_code_entry(code);
189 * Just count current code if no other decompositions. A NULL entry is
190 * equivalent to a character with class 0 and no decompositions.
192 if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
193 (!compat && DECOMPOSITION_IS_COMPAT(entry)))
194 return 1;
197 * If this entry has other decomposition codes look at them as well. First
198 * get its decomposition in the list of tables available.
200 decomp = get_code_decomposition(entry, &dec_size);
201 for (i = 0; i < dec_size; i++)
203 uint32 lcode = decomp[i];
205 size += get_decomposed_size(lcode, compat);
208 return size;
212 * Recompose a set of characters. For hangul characters, the calculation
213 * is algorithmic. For others, an inverse lookup at the decomposition
214 * table is necessary. Returns true if a recomposition can be done, and
215 * false otherwise.
217 static bool
218 recompose_code(uint32 start, uint32 code, uint32 *result)
221 * Handle Hangul characters algorithmically, per the Unicode spec.
223 * Check if two current characters are L and V.
225 if (start >= LBASE && start < LBASE + LCOUNT &&
226 code >= VBASE && code < VBASE + VCOUNT)
228 /* make syllable of form LV */
229 uint32 lindex = start - LBASE;
230 uint32 vindex = code - VBASE;
232 *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
233 return true;
235 /* Check if two current characters are LV and T */
236 else if (start >= SBASE && start < (SBASE + SCOUNT) &&
237 ((start - SBASE) % TCOUNT) == 0 &&
238 code >= TBASE && code < (TBASE + TCOUNT))
240 /* make syllable of form LVT */
241 uint32 tindex = code - TBASE;
243 *result = start + tindex;
244 return true;
246 else
248 const pg_unicode_decomposition *entry;
251 * Do an inverse lookup of the decomposition tables to see if anything
252 * matches. The comparison just needs to be a perfect match on the
253 * sub-table of size two, because the start character has already been
254 * recomposed partially. This lookup uses a perfect hash function for
255 * the backend code.
257 #ifndef FRONTEND
259 int h,
260 inv_lookup_index;
261 uint64 hashkey;
262 pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
265 * Compute the hash function. The hash key is formed by concatenating
266 * bytes of the two codepoints in network order. See also
267 * src/common/unicode/generate-unicode_norm_table.pl.
269 hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
270 h = recompinfo.hash(&hashkey);
272 /* An out-of-range result implies no match */
273 if (h < 0 || h >= recompinfo.num_recomps)
274 return false;
276 inv_lookup_index = recompinfo.inverse_lookup[h];
277 entry = &UnicodeDecompMain[inv_lookup_index];
279 if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
280 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
282 *result = entry->codepoint;
283 return true;
286 #else
288 int i;
290 for (i = 0; i < lengthof(UnicodeDecompMain); i++)
292 entry = &UnicodeDecompMain[i];
294 if (DECOMPOSITION_SIZE(entry) != 2)
295 continue;
297 if (DECOMPOSITION_NO_COMPOSE(entry))
298 continue;
300 if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
301 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
303 *result = entry->codepoint;
304 return true;
307 #endif /* !FRONTEND */
310 return false;
314 * Decompose the given code into the array given by caller. The
315 * decomposition begins at the position given by caller, saving one
316 * lookup on the decomposition table. The current position needs to be
317 * updated here to let the caller know from where to continue filling
318 * in the array result.
320 static void
321 decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
323 const pg_unicode_decomposition *entry;
324 int i;
325 const uint32 *decomp;
326 int dec_size;
329 * Fast path for Hangul characters not stored in tables to save memory as
330 * decomposition is algorithmic. See
331 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
332 * on the matter.
334 if (code >= SBASE && code < SBASE + SCOUNT)
336 uint32 l,
338 tindex,
339 sindex;
340 pg_wchar *res = *result;
342 sindex = code - SBASE;
343 l = LBASE + sindex / (VCOUNT * TCOUNT);
344 v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
345 tindex = sindex % TCOUNT;
347 res[*current] = l;
348 (*current)++;
349 res[*current] = v;
350 (*current)++;
352 if (tindex != 0)
354 res[*current] = TBASE + tindex;
355 (*current)++;
358 return;
361 entry = get_code_entry(code);
364 * Just fill in with the current decomposition if there are no
365 * decomposition codes to recurse to. A NULL entry is equivalent to a
366 * character with class 0 and no decompositions, so just leave also in
367 * this case.
369 if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
370 (!compat && DECOMPOSITION_IS_COMPAT(entry)))
372 pg_wchar *res = *result;
374 res[*current] = code;
375 (*current)++;
376 return;
380 * If this entry has other decomposition codes look at them as well.
382 decomp = get_code_decomposition(entry, &dec_size);
383 for (i = 0; i < dec_size; i++)
385 pg_wchar lcode = (pg_wchar) decomp[i];
387 /* Leave if no more decompositions */
388 decompose_code(lcode, compat, result, current);
393 * unicode_normalize - Normalize a Unicode string to the specified form.
395 * The input is a 0-terminated array of codepoints.
397 * In frontend, returns a 0-terminated array of codepoints, allocated with
398 * malloc. Or NULL if we run out of memory. In backend, the returned
399 * string is palloc'd instead, and OOM is reported with ereport().
401 pg_wchar *
402 unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
404 bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
405 bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
406 pg_wchar *decomp_chars;
407 pg_wchar *recomp_chars;
408 int decomp_size,
409 current_size;
410 int count;
411 const pg_wchar *p;
413 /* variables for recomposition */
414 int last_class;
415 int starter_pos;
416 int target_pos;
417 uint32 starter_ch;
419 /* First, do character decomposition */
422 * Calculate how many characters long the decomposed version will be.
424 decomp_size = 0;
425 for (p = input; *p; p++)
426 decomp_size += get_decomposed_size(*p, compat);
428 decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
429 if (decomp_chars == NULL)
430 return NULL;
433 * Now fill in each entry recursively. This needs a second pass on the
434 * decomposition table.
436 current_size = 0;
437 for (p = input; *p; p++)
438 decompose_code(*p, compat, &decomp_chars, &current_size);
439 decomp_chars[decomp_size] = '\0';
440 Assert(decomp_size == current_size);
442 /* Leave if there is nothing to decompose */
443 if (decomp_size == 0)
444 return decomp_chars;
447 * Now apply canonical ordering.
449 for (count = 1; count < decomp_size; count++)
451 pg_wchar prev = decomp_chars[count - 1];
452 pg_wchar next = decomp_chars[count];
453 pg_wchar tmp;
454 const uint8 prevClass = get_canonical_class(prev);
455 const uint8 nextClass = get_canonical_class(next);
458 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
459 * annex 4, a sequence of two adjacent characters in a string is an
460 * exchangeable pair if the combining class (from the Unicode
461 * Character Database) for the first character is greater than the
462 * combining class for the second, and the second is not a starter. A
463 * character is a starter if its combining class is 0.
465 if (prevClass == 0 || nextClass == 0)
466 continue;
468 if (prevClass <= nextClass)
469 continue;
471 /* exchange can happen */
472 tmp = decomp_chars[count - 1];
473 decomp_chars[count - 1] = decomp_chars[count];
474 decomp_chars[count] = tmp;
476 /* backtrack to check again */
477 if (count > 1)
478 count -= 2;
481 if (!recompose)
482 return decomp_chars;
485 * The last phase of NFC and NFKC is the recomposition of the reordered
486 * Unicode string using combining classes. The recomposed string cannot be
487 * longer than the decomposed one, so make the allocation of the output
488 * string based on that assumption.
490 recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
491 if (!recomp_chars)
493 FREE(decomp_chars);
494 return NULL;
497 last_class = -1; /* this eliminates a special check */
498 starter_pos = 0;
499 target_pos = 1;
500 starter_ch = recomp_chars[0] = decomp_chars[0];
502 for (count = 1; count < decomp_size; count++)
504 pg_wchar ch = decomp_chars[count];
505 int ch_class = get_canonical_class(ch);
506 pg_wchar composite;
508 if (last_class < ch_class &&
509 recompose_code(starter_ch, ch, &composite))
511 recomp_chars[starter_pos] = composite;
512 starter_ch = composite;
514 else if (ch_class == 0)
516 starter_pos = target_pos;
517 starter_ch = ch;
518 last_class = -1;
519 recomp_chars[target_pos++] = ch;
521 else
523 last_class = ch_class;
524 recomp_chars[target_pos++] = ch;
527 recomp_chars[target_pos] = (pg_wchar) '\0';
529 FREE(decomp_chars);
531 return recomp_chars;
535 * Normalization "quick check" algorithm; see
536 * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
539 /* We only need this in the backend. */
540 #ifndef FRONTEND
542 static const pg_unicode_normprops *
543 qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
545 int h;
546 uint32 hashkey;
549 * Compute the hash function. The hash key is the codepoint with the bytes
550 * in network order.
552 hashkey = pg_hton32(ch);
553 h = norminfo->hash(&hashkey);
555 /* An out-of-range result implies no match */
556 if (h < 0 || h >= norminfo->num_normprops)
557 return NULL;
560 * Since it's a perfect hash, we need only match to the specific codepoint
561 * it identifies.
563 if (ch != norminfo->normprops[h].codepoint)
564 return NULL;
566 /* Success! */
567 return &norminfo->normprops[h];
571 * Look up the normalization quick check character property
573 static UnicodeNormalizationQC
574 qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
576 const pg_unicode_normprops *found = NULL;
578 switch (form)
580 case UNICODE_NFC:
581 found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
582 break;
583 case UNICODE_NFKC:
584 found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
585 break;
586 default:
587 Assert(false);
588 break;
591 if (found)
592 return found->quickcheck;
593 else
594 return UNICODE_NORM_QC_YES;
597 UnicodeNormalizationQC
598 unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
600 uint8 lastCanonicalClass = 0;
601 UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
604 * For the "D" forms, we don't run the quickcheck. We don't include the
605 * lookup tables for those because they are huge, checking for these
606 * particular forms is less common, and running the slow path is faster
607 * for the "D" forms than the "C" forms because you don't need to
608 * recompose, which is slow.
610 if (form == UNICODE_NFD || form == UNICODE_NFKD)
611 return UNICODE_NORM_QC_MAYBE;
613 for (const pg_wchar *p = input; *p; p++)
615 pg_wchar ch = *p;
616 uint8 canonicalClass;
617 UnicodeNormalizationQC check;
619 canonicalClass = get_canonical_class(ch);
620 if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
621 return UNICODE_NORM_QC_NO;
623 check = qc_is_allowed(form, ch);
624 if (check == UNICODE_NORM_QC_NO)
625 return UNICODE_NORM_QC_NO;
626 else if (check == UNICODE_NORM_QC_MAYBE)
627 result = UNICODE_NORM_QC_MAYBE;
629 lastCanonicalClass = canonicalClass;
631 return result;
634 #endif /* !FRONTEND */