1 /*-------------------------------------------------------------------------
3 * Determine general category and character properties of Unicode
4 * characters. Encoding must be UTF8, where we assume that the pg_wchar
5 * representation is a code point.
7 * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
10 * src/common/unicode_category.c
12 *-------------------------------------------------------------------------
17 #include "postgres_fe.h"
20 #include "common/unicode_category.h"
21 #include "common/unicode_category_table.h"
24 * Create bitmasks from pg_unicode_category values for efficient comparison of
25 * multiple categories. For instance, PG_U_MN_MASK is a bitmask representing
26 * the general category Mn; and PG_U_M_MASK represents general categories Mn,
29 * The number of Unicode General Categories should never grow, so a 32-bit
/*
 * Build the single-bit mask for general category X.
 *
 * The literal is converted to uint32 before shifting so the shift is carried
 * out on an unsigned 32-bit value. Shifting a plain (signed) int literal
 * into bit 31 would be undefined behavior, which would contradict the
 * "32-bit mask" assumption if a category ever used that bit. The result
 * type and the values for all smaller shift counts are unchanged.
 */
#define PG_U_CATEGORY_MASK(X) ((uint32) 1 << (X))
/*
 * Letter categories. Each single-category mask has exactly the bit for its
 * pg_unicode_category value set; PG_U_LC_MASK is the union of the
 * uppercase, lowercase and titlecase letter masks.
 */
34 #define PG_U_LU_MASK PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER)
35 #define PG_U_LL_MASK PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER)
36 #define PG_U_LT_MASK PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER)
37 #define PG_U_LC_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK)
38 #define PG_U_LM_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER)
39 #define PG_U_LO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER)
40 #define PG_U_L_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK|PG_U_LM_MASK|\
/* Mark categories; PG_U_M_MASK is the union of the three mark masks. */
42 #define PG_U_MN_MASK PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK)
43 #define PG_U_ME_MASK PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK)
44 #define PG_U_MC_MASK PG_U_CATEGORY_MASK(PG_U_SPACING_MARK)
45 #define PG_U_M_MASK (PG_U_MN_MASK|PG_U_MC_MASK|PG_U_ME_MASK)
/* Number categories; PG_U_N_MASK is the union of the three number masks. */
46 #define PG_U_ND_MASK PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER)
47 #define PG_U_NL_MASK PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER)
48 #define PG_U_NO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER)
49 #define PG_U_N_MASK (PG_U_ND_MASK|PG_U_NL_MASK|PG_U_NO_MASK)
/*
 * Punctuation categories; PG_U_P_MASK is the union of all seven punctuation
 * masks.
 */
50 #define PG_U_PC_MASK PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION)
51 #define PG_U_PD_MASK PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION)
52 #define PG_U_PS_MASK PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION)
53 #define PG_U_PE_MASK PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION)
54 #define PG_U_PI_MASK PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION)
55 #define PG_U_PF_MASK PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION)
56 #define PG_U_PO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION)
57 #define PG_U_P_MASK (PG_U_PC_MASK|PG_U_PD_MASK|PG_U_PS_MASK|PG_U_PE_MASK|\
58 PG_U_PI_MASK|PG_U_PF_MASK|PG_U_PO_MASK)
/* Symbol categories; PG_U_S_MASK is the union of the four symbol masks. */
59 #define PG_U_SM_MASK PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL)
60 #define PG_U_SC_MASK PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL)
61 #define PG_U_SK_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL)
62 #define PG_U_SO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL)
63 #define PG_U_S_MASK (PG_U_SM_MASK|PG_U_SC_MASK|PG_U_SK_MASK|PG_U_SO_MASK)
/* Separator categories; PG_U_Z_MASK is the union of the three separator masks. */
64 #define PG_U_ZS_MASK PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR)
65 #define PG_U_ZL_MASK PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR)
66 #define PG_U_ZP_MASK PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR)
67 #define PG_U_Z_MASK (PG_U_ZS_MASK|PG_U_ZL_MASK|PG_U_ZP_MASK)
/* Control, format, surrogate, private-use and unassigned category masks. */
68 #define PG_U_CC_MASK PG_U_CATEGORY_MASK(PG_U_CONTROL)
69 #define PG_U_CF_MASK PG_U_CATEGORY_MASK(PG_U_FORMAT)
70 #define PG_U_CS_MASK PG_U_CATEGORY_MASK(PG_U_SURROGATE)
71 #define PG_U_CO_MASK PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE)
72 #define PG_U_CN_MASK PG_U_CATEGORY_MASK(PG_U_UNASSIGNED)
73 #define PG_U_C_MASK (PG_U_CC_MASK|PG_U_CF_MASK|PG_U_CS_MASK|PG_U_CO_MASK|\
/*
 * Code point of the horizontal tab character (U+0009). pg_u_isblank()
 * compares against it in addition to checking for the space-separator
 * category.
 */
76 #define PG_U_CHARACTER_TAB 0x09
78 static bool range_search(const pg_unicode_range
* tbl
, size_t size
,
82 * Unicode general category for the given codepoint.
85 unicode_category(pg_wchar code
)
89 int max
= lengthof(unicode_categories
) - 1;
91 Assert(code
<= 0x10ffff);
94 return unicode_opt_ascii
[code
].category
;
98 mid
= (min
+ max
) / 2;
99 if (code
> unicode_categories
[mid
].last
)
101 else if (code
< unicode_categories
[mid
].first
)
104 return unicode_categories
[mid
].category
;
107 return PG_U_UNASSIGNED
;
111 pg_u_prop_alphabetic(pg_wchar code
)
114 return unicode_opt_ascii
[code
].properties
& PG_U_PROP_ALPHABETIC
;
116 return range_search(unicode_alphabetic
,
117 lengthof(unicode_alphabetic
),
122 pg_u_prop_lowercase(pg_wchar code
)
125 return unicode_opt_ascii
[code
].properties
& PG_U_PROP_LOWERCASE
;
127 return range_search(unicode_lowercase
,
128 lengthof(unicode_lowercase
),
133 pg_u_prop_uppercase(pg_wchar code
)
136 return unicode_opt_ascii
[code
].properties
& PG_U_PROP_UPPERCASE
;
138 return range_search(unicode_uppercase
,
139 lengthof(unicode_uppercase
),
144 pg_u_prop_cased(pg_wchar code
)
146 uint32 category_mask
;
149 return unicode_opt_ascii
[code
].properties
& PG_U_PROP_CASED
;
151 category_mask
= PG_U_CATEGORY_MASK(unicode_category(code
));
153 return category_mask
& PG_U_LT_MASK
||
154 pg_u_prop_lowercase(code
) ||
155 pg_u_prop_uppercase(code
);
159 pg_u_prop_case_ignorable(pg_wchar code
)
162 return unicode_opt_ascii
[code
].properties
& PG_U_PROP_CASE_IGNORABLE
;
164 return range_search(unicode_case_ignorable
,
165 lengthof(unicode_case_ignorable
),
170 pg_u_prop_white_space(pg_wchar code
)
173 return unicode_opt_ascii
[code
].properties
& PG_U_PROP_WHITE_SPACE
;
175 return range_search(unicode_white_space
,
176 lengthof(unicode_white_space
),
181 pg_u_prop_hex_digit(pg_wchar code
)
184 return unicode_opt_ascii
[code
].properties
& PG_U_PROP_HEX_DIGIT
;
186 return range_search(unicode_hex_digit
,
187 lengthof(unicode_hex_digit
),
192 pg_u_prop_join_control(pg_wchar code
)
195 return unicode_opt_ascii
[code
].properties
& PG_U_PROP_JOIN_CONTROL
;
197 return range_search(unicode_join_control
,
198 lengthof(unicode_join_control
),
203 * The following functions implement the Compatibility Properties described
204 * at: http://www.unicode.org/reports/tr18/#Compatibility_Properties
206 * If 'posix' is true, implements the "POSIX Compatible" variant, otherwise
207 * the "Standard" variant.
211 pg_u_isdigit(pg_wchar code
, bool posix
)
214 return ('0' <= code
&& code
<= '9');
216 return unicode_category(code
) == PG_U_DECIMAL_NUMBER
;
220 pg_u_isalpha(pg_wchar code
)
222 return pg_u_prop_alphabetic(code
);
226 pg_u_isalnum(pg_wchar code
, bool posix
)
228 return pg_u_isalpha(code
) || pg_u_isdigit(code
, posix
);
232 pg_u_isword(pg_wchar code
)
234 uint32 category_mask
= PG_U_CATEGORY_MASK(unicode_category(code
));
237 category_mask
& (PG_U_M_MASK
| PG_U_ND_MASK
| PG_U_PC_MASK
) ||
238 pg_u_isalpha(code
) ||
239 pg_u_prop_join_control(code
);
243 pg_u_isupper(pg_wchar code
)
245 return pg_u_prop_uppercase(code
);
249 pg_u_islower(pg_wchar code
)
251 return pg_u_prop_lowercase(code
);
255 pg_u_isblank(pg_wchar code
)
257 return code
== PG_U_CHARACTER_TAB
||
258 unicode_category(code
) == PG_U_SPACE_SEPARATOR
;
262 pg_u_iscntrl(pg_wchar code
)
264 return unicode_category(code
) == PG_U_CONTROL
;
268 pg_u_isgraph(pg_wchar code
)
270 uint32 category_mask
= PG_U_CATEGORY_MASK(unicode_category(code
));
272 if (category_mask
& (PG_U_CC_MASK
| PG_U_CS_MASK
| PG_U_CN_MASK
) ||
279 pg_u_isprint(pg_wchar code
)
281 pg_unicode_category category
= unicode_category(code
);
283 if (category
== PG_U_CONTROL
)
286 return pg_u_isgraph(code
) || pg_u_isblank(code
);
290 pg_u_ispunct(pg_wchar code
, bool posix
)
292 uint32 category_mask
;
296 if (pg_u_isalpha(code
))
299 category_mask
= PG_U_CATEGORY_MASK(unicode_category(code
));
300 return category_mask
& (PG_U_P_MASK
| PG_U_S_MASK
);
304 category_mask
= PG_U_CATEGORY_MASK(unicode_category(code
));
306 return category_mask
& PG_U_P_MASK
;
311 pg_u_isspace(pg_wchar code
)
313 return pg_u_prop_white_space(code
);
317 pg_u_isxdigit(pg_wchar code
, bool posix
)
320 return (('0' <= code
&& code
<= '9') ||
321 ('A' <= code
&& code
<= 'F') ||
322 ('a' <= code
&& code
<= 'f'));
324 return unicode_category(code
) == PG_U_DECIMAL_NUMBER
||
325 pg_u_prop_hex_digit(code
);
329 * Description of Unicode general category.
332 unicode_category_string(pg_unicode_category category
)
336 case PG_U_UNASSIGNED
:
338 case PG_U_UPPERCASE_LETTER
:
339 return "Uppercase_Letter";
340 case PG_U_LOWERCASE_LETTER
:
341 return "Lowercase_Letter";
342 case PG_U_TITLECASE_LETTER
:
343 return "Titlecase_Letter";
344 case PG_U_MODIFIER_LETTER
:
345 return "Modifier_Letter";
346 case PG_U_OTHER_LETTER
:
347 return "Other_Letter";
348 case PG_U_NONSPACING_MARK
:
349 return "Nonspacing_Mark";
350 case PG_U_ENCLOSING_MARK
:
351 return "Enclosing_Mark";
352 case PG_U_SPACING_MARK
:
353 return "Spacing_Mark";
354 case PG_U_DECIMAL_NUMBER
:
355 return "Decimal_Number";
356 case PG_U_LETTER_NUMBER
:
357 return "Letter_Number";
358 case PG_U_OTHER_NUMBER
:
359 return "Other_Number";
360 case PG_U_SPACE_SEPARATOR
:
361 return "Space_Separator";
362 case PG_U_LINE_SEPARATOR
:
363 return "Line_Separator";
364 case PG_U_PARAGRAPH_SEPARATOR
:
365 return "Paragraph_Separator";
370 case PG_U_PRIVATE_USE
:
371 return "Private_Use";
374 case PG_U_DASH_PUNCTUATION
:
375 return "Dash_Punctuation";
376 case PG_U_OPEN_PUNCTUATION
:
377 return "Open_Punctuation";
378 case PG_U_CLOSE_PUNCTUATION
:
379 return "Close_Punctuation";
380 case PG_U_CONNECTOR_PUNCTUATION
:
381 return "Connector_Punctuation";
382 case PG_U_OTHER_PUNCTUATION
:
383 return "Other_Punctuation";
384 case PG_U_MATH_SYMBOL
:
385 return "Math_Symbol";
386 case PG_U_CURRENCY_SYMBOL
:
387 return "Currency_Symbol";
388 case PG_U_MODIFIER_SYMBOL
:
389 return "Modifier_Symbol";
390 case PG_U_OTHER_SYMBOL
:
391 return "Other_Symbol";
392 case PG_U_INITIAL_PUNCTUATION
:
393 return "Initial_Punctuation";
394 case PG_U_FINAL_PUNCTUATION
:
395 return "Final_Punctuation";
399 return "Unrecognized"; /* keep compiler quiet */
403 * Short code for Unicode general category.
406 unicode_category_abbrev(pg_unicode_category category
)
410 case PG_U_UNASSIGNED
:
412 case PG_U_UPPERCASE_LETTER
:
414 case PG_U_LOWERCASE_LETTER
:
416 case PG_U_TITLECASE_LETTER
:
418 case PG_U_MODIFIER_LETTER
:
420 case PG_U_OTHER_LETTER
:
422 case PG_U_NONSPACING_MARK
:
424 case PG_U_ENCLOSING_MARK
:
426 case PG_U_SPACING_MARK
:
428 case PG_U_DECIMAL_NUMBER
:
430 case PG_U_LETTER_NUMBER
:
432 case PG_U_OTHER_NUMBER
:
434 case PG_U_SPACE_SEPARATOR
:
436 case PG_U_LINE_SEPARATOR
:
438 case PG_U_PARAGRAPH_SEPARATOR
:
444 case PG_U_PRIVATE_USE
:
448 case PG_U_DASH_PUNCTUATION
:
450 case PG_U_OPEN_PUNCTUATION
:
452 case PG_U_CLOSE_PUNCTUATION
:
454 case PG_U_CONNECTOR_PUNCTUATION
:
456 case PG_U_OTHER_PUNCTUATION
:
458 case PG_U_MATH_SYMBOL
:
460 case PG_U_CURRENCY_SYMBOL
:
462 case PG_U_MODIFIER_SYMBOL
:
464 case PG_U_OTHER_SYMBOL
:
466 case PG_U_INITIAL_PUNCTUATION
:
468 case PG_U_FINAL_PUNCTUATION
:
473 return "??"; /* keep compiler quiet */
477 * Binary search to test if given codepoint exists in one of the ranges in the
481 range_search(const pg_unicode_range
* tbl
, size_t size
, pg_wchar code
)
487 Assert(code
<= 0x10ffff);
491 mid
= (min
+ max
) / 2;
492 if (code
> tbl
[mid
].last
)
494 else if (code
< tbl
[mid
].first
)