src/common/unicode_category.c

   1 /*-------------------------------------------------------------------------
   2  * unicode_category.c
   3  *              Determine general category and character properties of Unicode
   4  *              characters. Encoding must be UTF8, where we assume that the pg_wchar
   5  *              representation is a code point.
   6  *
   7  * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
   8  *
   9  * IDENTIFICATION
  10  *        src/common/unicode_category.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #ifndef FRONTEND
  15 #include "postgres.h"
  16 #else
  17 #include "postgres_fe.h"
  18 #endif
  19
  20 #include "common/unicode_category.h"
  21 #include "common/unicode_category_table.h"
  22
/*
 * Create bitmasks from pg_unicode_category values for efficient comparison of
 * multiple categories. For instance, PG_U_MN_MASK is a bitmask representing
 * the general category Mn; and PG_U_M_MASK represents general categories Mn,
 * Me, and Mc.
 *
 * The number of Unicode General Categories should never grow, so a 32-bit
 * mask is fine.
 */
  32 #define PG_U_CATEGORY_MASK(X) ((uint32)(1 << (X)))
  33
  34 #define PG_U_LU_MASK PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER)
  35 #define PG_U_LL_MASK PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER)
  36 #define PG_U_LT_MASK PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER)
  37 #define PG_U_LC_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK)
  38 #define PG_U_LM_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER)
  39 #define PG_U_LO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER)
  40 #define PG_U_L_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK|PG_U_LM_MASK|\
  41                                          PG_U_LO_MASK)
  42 #define PG_U_MN_MASK PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK)
  43 #define PG_U_ME_MASK PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK)
  44 #define PG_U_MC_MASK PG_U_CATEGORY_MASK(PG_U_SPACING_MARK)
  45 #define PG_U_M_MASK (PG_U_MN_MASK|PG_U_MC_MASK|PG_U_ME_MASK)
  46 #define PG_U_ND_MASK PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER)
  47 #define PG_U_NL_MASK PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER)
  48 #define PG_U_NO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER)
  49 #define PG_U_N_MASK (PG_U_ND_MASK|PG_U_NL_MASK|PG_U_NO_MASK)
  50 #define PG_U_PC_MASK PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION)
  51 #define PG_U_PD_MASK PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION)
  52 #define PG_U_PS_MASK PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION)
  53 #define PG_U_PE_MASK PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION)
  54 #define PG_U_PI_MASK PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION)
  55 #define PG_U_PF_MASK PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION)
  56 #define PG_U_PO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION)
  57 #define PG_U_P_MASK (PG_U_PC_MASK|PG_U_PD_MASK|PG_U_PS_MASK|PG_U_PE_MASK|\
  58                                          PG_U_PI_MASK|PG_U_PF_MASK|PG_U_PO_MASK)
  59 #define PG_U_SM_MASK PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL)
  60 #define PG_U_SC_MASK PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL)
  61 #define PG_U_SK_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL)
  62 #define PG_U_SO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL)
  63 #define PG_U_S_MASK (PG_U_SM_MASK|PG_U_SC_MASK|PG_U_SK_MASK|PG_U_SO_MASK)
  64 #define PG_U_ZS_MASK PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR)
  65 #define PG_U_ZL_MASK PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR)
  66 #define PG_U_ZP_MASK PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR)
  67 #define PG_U_Z_MASK (PG_U_ZS_MASK|PG_U_ZL_MASK|PG_U_ZP_MASK)
  68 #define PG_U_CC_MASK PG_U_CATEGORY_MASK(PG_U_CONTROL)
  69 #define PG_U_CF_MASK PG_U_CATEGORY_MASK(PG_U_FORMAT)
  70 #define PG_U_CS_MASK PG_U_CATEGORY_MASK(PG_U_SURROGATE)
  71 #define PG_U_CO_MASK PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE)
  72 #define PG_U_CN_MASK PG_U_CATEGORY_MASK(PG_U_UNASSIGNED)
  73 #define PG_U_C_MASK (PG_U_CC_MASK|PG_U_CF_MASK|PG_U_CS_MASK|PG_U_CO_MASK|\
  74                                          PG_U_CN_MASK)
  75
  76 #define PG_U_CHARACTER_TAB      0x09
  77
/*
 * Forward declaration of the range-table search helper used by the
 * pg_u_prop_*() functions; it is defined further down in this file.
 */
static bool range_search(const pg_unicode_range * tbl, size_t size,
						 pg_wchar code);
  80
  81 /*
  82  * Unicode general category for the given codepoint.
  83  */
  84 pg_unicode_category
  85 unicode_category(pg_wchar code)
  86 {
  87         int                     min = 0;
  88         int                     mid;
  89         int                     max = lengthof(unicode_categories) - 1;
  90
  91         Assert(code <= 0x10ffff);
  92
  93         if (code < 0x80)
  94                 return unicode_opt_ascii[code].category;
  95
  96         while (max >= min)
  97         {
  98                 mid = (min + max) / 2;
  99                 if (code > unicode_categories[mid].last)
 100                         min = mid + 1;
 101                 else if (code < unicode_categories[mid].first)
 102                         max = mid - 1;
 103                 else
 104                         return unicode_categories[mid].category;
 105         }
 106
 107         return PG_U_UNASSIGNED;
 108 }
 109
 110 bool
 111 pg_u_prop_alphabetic(pg_wchar code)
 112 {
 113         if (code < 0x80)
 114                 return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
 115
 116         return range_search(unicode_alphabetic,
 117                                                 lengthof(unicode_alphabetic),
 118                                                 code);
 119 }
 120
 121 bool
 122 pg_u_prop_lowercase(pg_wchar code)
 123 {
 124         if (code < 0x80)
 125                 return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
 126
 127         return range_search(unicode_lowercase,
 128                                                 lengthof(unicode_lowercase),
 129                                                 code);
 130 }
 131
 132 bool
 133 pg_u_prop_uppercase(pg_wchar code)
 134 {
 135         if (code < 0x80)
 136                 return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
 137
 138         return range_search(unicode_uppercase,
 139                                                 lengthof(unicode_uppercase),
 140                                                 code);
 141 }
 142
 143 bool
 144 pg_u_prop_cased(pg_wchar code)
 145 {
 146         uint32          category_mask;
 147
 148         if (code < 0x80)
 149                 return unicode_opt_ascii[code].properties & PG_U_PROP_CASED;
 150
 151         category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
 152
 153         return category_mask & PG_U_LT_MASK ||
 154                 pg_u_prop_lowercase(code) ||
 155                 pg_u_prop_uppercase(code);
 156 }
 157
 158 bool
 159 pg_u_prop_case_ignorable(pg_wchar code)
 160 {
 161         if (code < 0x80)
 162                 return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
 163
 164         return range_search(unicode_case_ignorable,
 165                                                 lengthof(unicode_case_ignorable),
 166                                                 code);
 167 }
 168
 169 bool
 170 pg_u_prop_white_space(pg_wchar code)
 171 {
 172         if (code < 0x80)
 173                 return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
 174
 175         return range_search(unicode_white_space,
 176                                                 lengthof(unicode_white_space),
 177                                                 code);
 178 }
 179
 180 bool
 181 pg_u_prop_hex_digit(pg_wchar code)
 182 {
 183         if (code < 0x80)
 184                 return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
 185
 186         return range_search(unicode_hex_digit,
 187                                                 lengthof(unicode_hex_digit),
 188                                                 code);
 189 }
 190
 191 bool
 192 pg_u_prop_join_control(pg_wchar code)
 193 {
 194         if (code < 0x80)
 195                 return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
 196
 197         return range_search(unicode_join_control,
 198                                                 lengthof(unicode_join_control),
 199                                                 code);
 200 }
 201
 202 /*
 203  * The following functions implement the Compatibility Properties described
 204  * at: http://www.unicode.org/reports/tr18/#Compatibility_Properties
 205  *
 206  * If 'posix' is true, implements the "POSIX Compatible" variant, otherwise
 207  * the "Standard" variant.
 208  */
 209
 210 bool
 211 pg_u_isdigit(pg_wchar code, bool posix)
 212 {
 213         if (posix)
 214                 return ('0' <= code && code <= '9');
 215         else
 216                 return unicode_category(code) == PG_U_DECIMAL_NUMBER;
 217 }
 218
 219 bool
 220 pg_u_isalpha(pg_wchar code)
 221 {
 222         return pg_u_prop_alphabetic(code);
 223 }
 224
 225 bool
 226 pg_u_isalnum(pg_wchar code, bool posix)
 227 {
 228         return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
 229 }
 230
 231 bool
 232 pg_u_isword(pg_wchar code)
 233 {
 234         uint32          category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
 235
 236         return
 237                 category_mask & (PG_U_M_MASK | PG_U_ND_MASK | PG_U_PC_MASK) ||
 238                 pg_u_isalpha(code) ||
 239                 pg_u_prop_join_control(code);
 240 }
 241
 242 bool
 243 pg_u_isupper(pg_wchar code)
 244 {
 245         return pg_u_prop_uppercase(code);
 246 }
 247
 248 bool
 249 pg_u_islower(pg_wchar code)
 250 {
 251         return pg_u_prop_lowercase(code);
 252 }
 253
 254 bool
 255 pg_u_isblank(pg_wchar code)
 256 {
 257         return code == PG_U_CHARACTER_TAB ||
 258                 unicode_category(code) == PG_U_SPACE_SEPARATOR;
 259 }
 260
 261 bool
 262 pg_u_iscntrl(pg_wchar code)
 263 {
 264         return unicode_category(code) == PG_U_CONTROL;
 265 }
 266
 267 bool
 268 pg_u_isgraph(pg_wchar code)
 269 {
 270         uint32          category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
 271
 272         if (category_mask & (PG_U_CC_MASK | PG_U_CS_MASK | PG_U_CN_MASK) ||
 273                 pg_u_isspace(code))
 274                 return false;
 275         return true;
 276 }
 277
 278 bool
 279 pg_u_isprint(pg_wchar code)
 280 {
 281         pg_unicode_category category = unicode_category(code);
 282
 283         if (category == PG_U_CONTROL)
 284                 return false;
 285
 286         return pg_u_isgraph(code) || pg_u_isblank(code);
 287 }
 288
 289 bool
 290 pg_u_ispunct(pg_wchar code, bool posix)
 291 {
 292         uint32          category_mask;
 293
 294         if (posix)
 295         {
 296                 if (pg_u_isalpha(code))
 297                         return false;
 298
 299                 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
 300                 return category_mask & (PG_U_P_MASK | PG_U_S_MASK);
 301         }
 302         else
 303         {
 304                 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
 305
 306                 return category_mask & PG_U_P_MASK;
 307         }
 308 }
 309
 310 bool
 311 pg_u_isspace(pg_wchar code)
 312 {
 313         return pg_u_prop_white_space(code);
 314 }
 315
 316 bool
 317 pg_u_isxdigit(pg_wchar code, bool posix)
 318 {
 319         if (posix)
 320                 return (('0' <= code && code <= '9') ||
 321                                 ('A' <= code && code <= 'F') ||
 322                                 ('a' <= code && code <= 'f'));
 323         else
 324                 return unicode_category(code) == PG_U_DECIMAL_NUMBER ||
 325                         pg_u_prop_hex_digit(code);
 326 }
 327
 328 /*
 329  * Description of Unicode general category.
 330  */
 331 const char *
 332 unicode_category_string(pg_unicode_category category)
 333 {
 334         switch (category)
 335         {
 336                 case PG_U_UNASSIGNED:
 337                         return "Unassigned";
 338                 case PG_U_UPPERCASE_LETTER:
 339                         return "Uppercase_Letter";
 340                 case PG_U_LOWERCASE_LETTER:
 341                         return "Lowercase_Letter";
 342                 case PG_U_TITLECASE_LETTER:
 343                         return "Titlecase_Letter";
 344                 case PG_U_MODIFIER_LETTER:
 345                         return "Modifier_Letter";
 346                 case PG_U_OTHER_LETTER:
 347                         return "Other_Letter";
 348                 case PG_U_NONSPACING_MARK:
 349                         return "Nonspacing_Mark";
 350                 case PG_U_ENCLOSING_MARK:
 351                         return "Enclosing_Mark";
 352                 case PG_U_SPACING_MARK:
 353                         return "Spacing_Mark";
 354                 case PG_U_DECIMAL_NUMBER:
 355                         return "Decimal_Number";
 356                 case PG_U_LETTER_NUMBER:
 357                         return "Letter_Number";
 358                 case PG_U_OTHER_NUMBER:
 359                         return "Other_Number";
 360                 case PG_U_SPACE_SEPARATOR:
 361                         return "Space_Separator";
 362                 case PG_U_LINE_SEPARATOR:
 363                         return "Line_Separator";
 364                 case PG_U_PARAGRAPH_SEPARATOR:
 365                         return "Paragraph_Separator";
 366                 case PG_U_CONTROL:
 367                         return "Control";
 368                 case PG_U_FORMAT:
 369                         return "Format";
 370                 case PG_U_PRIVATE_USE:
 371                         return "Private_Use";
 372                 case PG_U_SURROGATE:
 373                         return "Surrogate";
 374                 case PG_U_DASH_PUNCTUATION:
 375                         return "Dash_Punctuation";
 376                 case PG_U_OPEN_PUNCTUATION:
 377                         return "Open_Punctuation";
 378                 case PG_U_CLOSE_PUNCTUATION:
 379                         return "Close_Punctuation";
 380                 case PG_U_CONNECTOR_PUNCTUATION:
 381                         return "Connector_Punctuation";
 382                 case PG_U_OTHER_PUNCTUATION:
 383                         return "Other_Punctuation";
 384                 case PG_U_MATH_SYMBOL:
 385                         return "Math_Symbol";
 386                 case PG_U_CURRENCY_SYMBOL:
 387                         return "Currency_Symbol";
 388                 case PG_U_MODIFIER_SYMBOL:
 389                         return "Modifier_Symbol";
 390                 case PG_U_OTHER_SYMBOL:
 391                         return "Other_Symbol";
 392                 case PG_U_INITIAL_PUNCTUATION:
 393                         return "Initial_Punctuation";
 394                 case PG_U_FINAL_PUNCTUATION:
 395                         return "Final_Punctuation";
 396         }
 397
 398         Assert(false);
 399         return "Unrecognized";          /* keep compiler quiet */
 400 }
 401
 402 /*
 403  * Short code for Unicode general category.
 404  */
 405 const char *
 406 unicode_category_abbrev(pg_unicode_category category)
 407 {
 408         switch (category)
 409         {
 410                 case PG_U_UNASSIGNED:
 411                         return "Cn";
 412                 case PG_U_UPPERCASE_LETTER:
 413                         return "Lu";
 414                 case PG_U_LOWERCASE_LETTER:
 415                         return "Ll";
 416                 case PG_U_TITLECASE_LETTER:
 417                         return "Lt";
 418                 case PG_U_MODIFIER_LETTER:
 419                         return "Lm";
 420                 case PG_U_OTHER_LETTER:
 421                         return "Lo";
 422                 case PG_U_NONSPACING_MARK:
 423                         return "Mn";
 424                 case PG_U_ENCLOSING_MARK:
 425                         return "Me";
 426                 case PG_U_SPACING_MARK:
 427                         return "Mc";
 428                 case PG_U_DECIMAL_NUMBER:
 429                         return "Nd";
 430                 case PG_U_LETTER_NUMBER:
 431                         return "Nl";
 432                 case PG_U_OTHER_NUMBER:
 433                         return "No";
 434                 case PG_U_SPACE_SEPARATOR:
 435                         return "Zs";
 436                 case PG_U_LINE_SEPARATOR:
 437                         return "Zl";
 438                 case PG_U_PARAGRAPH_SEPARATOR:
 439                         return "Zp";
 440                 case PG_U_CONTROL:
 441                         return "Cc";
 442                 case PG_U_FORMAT:
 443                         return "Cf";
 444                 case PG_U_PRIVATE_USE:
 445                         return "Co";
 446                 case PG_U_SURROGATE:
 447                         return "Cs";
 448                 case PG_U_DASH_PUNCTUATION:
 449                         return "Pd";
 450                 case PG_U_OPEN_PUNCTUATION:
 451                         return "Ps";
 452                 case PG_U_CLOSE_PUNCTUATION:
 453                         return "Pe";
 454                 case PG_U_CONNECTOR_PUNCTUATION:
 455                         return "Pc";
 456                 case PG_U_OTHER_PUNCTUATION:
 457                         return "Po";
 458                 case PG_U_MATH_SYMBOL:
 459                         return "Sm";
 460                 case PG_U_CURRENCY_SYMBOL:
 461                         return "Sc";
 462                 case PG_U_MODIFIER_SYMBOL:
 463                         return "Sk";
 464                 case PG_U_OTHER_SYMBOL:
 465                         return "So";
 466                 case PG_U_INITIAL_PUNCTUATION:
 467                         return "Pi";
 468                 case PG_U_FINAL_PUNCTUATION:
 469                         return "Pf";
 470         }
 471
 472         Assert(false);
 473         return "??";                            /* keep compiler quiet */
 474 }
 475
 476 /*
 477  * Binary search to test if given codepoint exists in one of the ranges in the
 478  * given table.
 479  */
 480 static bool
 481 range_search(const pg_unicode_range * tbl, size_t size, pg_wchar code)
 482 {
 483         int                     min = 0;
 484         int                     mid;
 485         int                     max = size - 1;
 486
 487         Assert(code <= 0x10ffff);
 488
 489         while (max >= min)
 490         {
 491                 mid = (min + max) / 2;
 492                 if (code > tbl[mid].last)
 493                         min = mid + 1;
 494                 else if (code < tbl[mid].first)
 495                         max = mid - 1;
 496                 else
 497                         return true;
 498         }
 499
 500         return false;
 501 }