gcc/rust/util/rust-unicode.cc

   1 // Copyright (C) 2020-2024 Free Software Foundation, Inc.
   2
   3 // This file is part of GCC.
   4
   5 // GCC is free software; you can redistribute it and/or modify it under
   6 // the terms of the GNU General Public License as published by the Free
   7 // Software Foundation; either version 3, or (at your option) any later
   8 // version.
   9
  10 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13 // for more details.
  14
  15 // You should have received a copy of the GNU General Public License
  16 // along with GCC; see the file COPYING3.  If not see
  17 // <http://www.gnu.org/licenses/>.
  18
  19 #include "rust-input-source.h"
  20 #include "rust-system.h"
  21 #include "optional.h"
  22 #include "selftest.h"
  23 #include "rust-lex.h"
  24 #include "rust-unicode.h"
  25
  26 #include "rust-unicode-data.h"
  27
  28 namespace Rust {
  29
  30 typedef Codepoint codepoint_t;
  31 typedef std::vector<codepoint_t> string_t;
  32
  33 // These constants are used to compose and decompose of Hangul syllables.
  34 // See `Sample Code for Hangul Algorithms` in 3.1.2
  35 // unicode.org/versions/Unicode15.0.0/ch03.pdf
  36 const uint32_t S_BASE = 0xAC00;
  37 const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
  38 const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
  39 const uint32_t N_COUNT = V_COUNT * T_COUNT;
  40 const uint32_t S_COUNT = L_COUNT * N_COUNT;
  41
  42 // Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
  43 template <std::size_t SIZE>
  44 bool
  45 binary_search_ranges (
  46   const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
  47   uint32_t target_cp)
  48 {
  49   auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
  50                               [] (const std::pair<uint32_t, uint32_t> &a,
  51                                   uint32_t b) { return a.second <= b; });
  52   if (it == ranges.end ())
  53     return false;
  54   else
  55     return it->first <= target_cp && target_cp < it->second;
  56 }
  57
  58 int
  59 lookup_cc (codepoint_t c)
  60 {
  61   auto it = CCC_TABLE.find (c.value);
  62   if (it != CCC_TABLE.end ())
  63     return it->second;
  64   else
  65     // Starter. Returns zero.
  66     return 0;
  67 }
  68
  69 tl::optional<codepoint_t>
  70 lookup_recomp (codepoint_t starter, codepoint_t c)
  71 {
  72   auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
  73   if (it != Rust::RECOMPOSITION_MAP.end ())
  74     return {it->second};
  75
  76   it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
  77   if (it != Rust::RECOMPOSITION_MAP.end ())
  78     return {it->second};
  79
  80   return tl::nullopt;
  81 }
  82
  83 void
  84 recursive_decomp_cano (codepoint_t c, string_t &buf)
  85 {
  86   auto it = Rust::DECOMPOSITION_MAP.find (c.value);
  87   if (it != Rust::DECOMPOSITION_MAP.end ())
  88     {
  89       std::vector<uint32_t> decomped = it->second;
  90       for (uint32_t cp : decomped)
  91         recursive_decomp_cano (cp, buf);
  92     }
  93   else
  94     buf.push_back (c);
  95 }
  96
  97 string_t
  98 decomp_cano (string_t s)
  99 {
 100   string_t buf;
 101   for (codepoint_t c : s)
 102     {
 103       int64_t s_index = c.value - S_BASE;
 104       if (0 <= s_index && s_index < S_COUNT)
 105         {
 106           // decompose Hangul argorithmically
 107           uint32_t l = L_BASE + s_index / N_COUNT;
 108           uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
 109           uint32_t t = T_BASE + s_index % T_COUNT;
 110           buf.push_back (l);
 111           buf.push_back (v);
 112           if (t != T_BASE)
 113             buf.push_back (t);
 114           continue;
 115         }
 116
 117       // Current character is not hangul
 118       recursive_decomp_cano (c, buf);
 119     }
 120   return buf;
 121 }
 122
 123 void
 124 sort_cano (string_t &s)
 125 {
 126   int cc_here, cc_prev;
 127   if (s.size () == 1)
 128     return;
 129   for (unsigned int i = 1; i < s.size (); i++)
 130     {
 131       cc_here = lookup_cc (s[i]);
 132       cc_prev = lookup_cc (s[i - 1]);
 133       if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
 134         {
 135           // swap
 136           codepoint_t tmp = s[i];
 137           s[i] = s[i - 1];
 138           s[i - 1] = tmp;
 139           if (i > 1)
 140             i -= 2;
 141         }
 142     }
 143 }
 144
 145 string_t
 146 compose_hangul (string_t s)
 147 {
 148   string_t buf;
 149   if (s.size () < 2)
 150     return s;
 151
 152   codepoint_t last = s[0];
 153   buf.push_back (last);
 154   for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
 155     {
 156       codepoint_t ch = s[src_pos];
 157
 158       // L V => LV
 159       int64_t l_index = last.value - L_BASE;
 160       if (0 <= l_index && l_index < L_COUNT)
 161         {
 162           int64_t v_index = ch.value - V_BASE;
 163           if (0 <= v_index && v_index < V_COUNT)
 164             {
 165               last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
 166               // pop L
 167               buf.pop_back ();
 168               buf.push_back (last);
 169               continue;
 170             }
 171         }
 172
 173       // LV T => LVT
 174       int64_t s_index = last.value - S_BASE;
 175       if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
 176         {
 177           int64_t t_index = ch.value - T_BASE;
 178           if (0 < t_index && t_index < T_COUNT)
 179             {
 180               last.value += t_index;
 181               // pop LV
 182               buf.pop_back ();
 183               buf.push_back (last);
 184               continue;
 185             }
 186         }
 187       last = ch;
 188       buf.push_back (last);
 189     }
 190   return buf;
 191 }
 192
 193 string_t
 194 recomp (string_t s)
 195 {
 196   // compose hangul first
 197   s = compose_hangul (s);
 198
 199   string_t buf;
 200   if (s.size () < 2)
 201     return s;
 202
 203   int last_class = -1;
 204   // int starter_pos = 0; // Assume the first character is Starter. Correct?
 205   // int target_pos = 1;
 206   codepoint_t starter_ch = s[0];
 207
 208   for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
 209     {
 210       // get current character
 211       codepoint_t ch = s[src_pos];
 212
 213       int ch_class = lookup_cc (ch);
 214       tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
 215       if (composite.has_value () && last_class < ch_class)
 216         {
 217           // ch can be composed
 218           buf.push_back (composite.value ());
 219           starter_ch = composite.value ();
 220         }
 221       else if (ch_class == 0)
 222         {
 223           // ch is Starter and cannot be composed.
 224           if (src_pos == 1)
 225             // FIXME: buggy?
 226             buf.push_back (starter_ch);
 227           starter_ch = ch;
 228           last_class = -1;
 229           buf.push_back (ch);
 230         }
 231       else
 232         {
 233           if (src_pos == 1)
 234             // FIXME: buggy?
 235             buf.push_back (starter_ch);
 236           // ch is not Starter.
 237           last_class = ch_class;
 238           buf.push_back (ch);
 239         }
 240     }
 241   return buf;
 242 }
 243
 244 // see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
 245 QuickCheckResult
 246 nfc_quick_check (const string_t &s)
 247 {
 248   int last_canonical_class = 0;
 249   QuickCheckResult res = QuickCheckResult::YES;
 250
 251   for (unsigned long i = 0; i < s.size (); i++)
 252     {
 253       codepoint_t c = s[i];
 254
 255       if (c.is_supplementary_character ())
 256         i++;
 257
 258       int canonical_class = lookup_cc (c);
 259       if (last_canonical_class > canonical_class && canonical_class != 0)
 260         return QuickCheckResult::NO;
 261
 262       if (is_nfc_qc_no (c.value))
 263         return QuickCheckResult::NO;
 264
 265       if (is_nfc_qc_maybe (c.value))
 266         res = QuickCheckResult::MAYBE;
 267
 268       last_canonical_class = canonical_class;
 269     }
 270   return res;
 271 }
 272
 273 string_t
 274 nfc_normalize (const string_t &s)
 275 {
 276   if (nfc_quick_check (s) == QuickCheckResult::YES)
 277     return s;
 278
 279   // TODO: optimize normalization.
 280   // i.e. only normalize a limited area around MAYBE character, instead of
 281   // performing complete normlization of the entire string
 282
 283   // decompose
 284   string_t d = decomp_cano (s);
 285   sort_cano (d);
 286
 287   // recompose
 288   string_t r = recomp (d);
 289   return r;
 290 }
 291
 292 Utf8String
 293 Utf8String::nfc_normalize () const
 294 {
 295   return Utf8String (Rust::nfc_normalize (chars));
 296 }
 297
 298 bool
 299 is_alphabetic (uint32_t codepoint)
 300 {
 301   return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
 302 }
 303
 304 bool
 305 is_numeric (uint32_t codepoint)
 306 {
 307   return std::binary_search (NUMERIC_CODEPOINTS.begin (),
 308                              NUMERIC_CODEPOINTS.end (), codepoint);
 309 }
 310
 311 bool
 312 is_nfc_qc_maybe (uint32_t codepoint)
 313 {
 314   return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
 315 }
 316
 317 bool
 318 is_nfc_qc_no (uint32_t codepoint)
 319 {
 320   return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
 321 }
 322
 323 bool
 324 is_ascii_only (const std::string &str)
 325 {
 326   for (char c : str)
 327     if (static_cast<uint32_t> (c) > MAX_ASCII_CODEPOINT)
 328       return false;
 329   return true;
 330 }
 331
 332 } // namespace Rust
 333
 334 #if CHECKING_P
 335
 336 namespace selftest {
 337
// Check that Rust::nfc_quick_check produces each of its three results.
void
rust_nfc_qc_test ()
{
  // A lone code point that is neither NO nor MAYBE yields YES.
  ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
	     Rust::QuickCheckResult::YES);
  // A MAYBE code point anywhere in the sequence downgrades YES to MAYBE.
  ASSERT_EQ (Rust::nfc_quick_check (
	       {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
	     Rust::QuickCheckResult::MAYBE);
  // A NO code point yields NO.
  ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
	     Rust::QuickCheckResult::NO);
}
 349
 350 void
 351 assert_normalize (const std::vector<Rust::Codepoint> origin,
 352                   const std::vector<Rust::Codepoint> expected)
 353 {
 354   std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);
 355
 356   ASSERT_EQ (actual.size (), expected.size ());
 357   for (unsigned int i = 0; i < actual.size (); i++)
 358     {
 359       ASSERT_EQ (actual[i], expected[i]);
 360     }
 361 }
 362
 363 void
 364 rust_utf8_normalize_test ()
 365 {
 366   // ASCII
 367   assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
 368   // ASCII
 369   assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});
 370
 371   // testcases retrieved from Part0 of
 372   // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
 373   assert_normalize ({0x1e0a}, {0x1e0a});
 374   assert_normalize ({0x1e0c}, {0x1e0c});
 375   assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
 376   assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
 377   assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});
 378
 379   // testcases for Hangul from Part0
 380   assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
 381   assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
 382   // testcases for Hangul from Part1
 383   assert_normalize ({0x3131}, {0x3131});
 384   assert_normalize ({0x3132}, {0x3132});
 385   // testcases for Hangul from Part3
 386   assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
 387   assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
 388
 389   // TODO: add more testcases in
 390   // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
 391 }
 392
// Spot-check Rust::is_alphabetic and Rust::is_numeric against hand-picked
// code points, covering ASCII as well as code points above U+FFFF.
void
rust_utf8_property_test ()
{
  // Code points that must be reported as alphabetic.
  ASSERT_TRUE (Rust::is_alphabetic ('A'));
  ASSERT_TRUE (Rust::is_alphabetic ('B'));
  ASSERT_TRUE (Rust::is_alphabetic ('x'));
  ASSERT_TRUE (Rust::is_alphabetic ('z'));
  ASSERT_TRUE (Rust::is_alphabetic (0x00b5));  // µ
  ASSERT_TRUE (Rust::is_alphabetic (0x3093));  // ん
  ASSERT_TRUE (Rust::is_alphabetic (0xa8f2));  // ꣲ
  ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃

  // Code points that must not be reported as alphabetic: control
  // characters, punctuation, digits and two non-ASCII code points.
  ASSERT_FALSE (Rust::is_alphabetic ('\v'));
  ASSERT_FALSE (Rust::is_alphabetic ('-'));
  ASSERT_FALSE (Rust::is_alphabetic ('_'));
  ASSERT_FALSE (Rust::is_alphabetic ('+'));
  ASSERT_FALSE (Rust::is_alphabetic ('0'));
  ASSERT_FALSE (Rust::is_alphabetic ('1'));
  ASSERT_FALSE (Rust::is_alphabetic ('2'));
  ASSERT_FALSE (Rust::is_alphabetic ('9'));
  ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
  ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁

  // Code points that must be reported as numeric, grouped by the labels
  // `Nd`, `Nl` and `No` below.
  // `Nd`s
  ASSERT_TRUE (Rust::is_numeric ('0'));
  ASSERT_TRUE (Rust::is_numeric ('1'));
  ASSERT_TRUE (Rust::is_numeric ('7'));
  ASSERT_TRUE (Rust::is_numeric ('9'));
  ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
  ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
  // `Nl`s
  ASSERT_TRUE (Rust::is_numeric (0x16e6));  // ᛮ
  ASSERT_TRUE (Rust::is_numeric (0xa6e6));  // ꛦ
  ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
  ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
  // `No`s
  ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
  ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱

  // Code points that must not be reported as numeric.
  ASSERT_FALSE (Rust::is_numeric ('\n'));
  ASSERT_FALSE (Rust::is_numeric ('-'));
  ASSERT_FALSE (Rust::is_numeric ('_'));
  ASSERT_FALSE (Rust::is_numeric ('('));
  ASSERT_FALSE (Rust::is_numeric ('z'));
  ASSERT_FALSE (Rust::is_numeric (';'));
  ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
  ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
  ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
  ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
  ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
}
 444
 445 } // namespace selftest
 446
 447 #endif // CHECKING_P