Daily bump.
[official-gcc.git] / gcc / rust / util / rust-unicode.cc
blob3dd1c1960d447c3e19c15eb277737397ce73a2ee
1 // Copyright (C) 2020-2024 Free Software Foundation, Inc.
3 // This file is part of GCC.
5 // GCC is free software; you can redistribute it and/or modify it under
6 // the terms of the GNU General Public License as published by the Free
7 // Software Foundation; either version 3, or (at your option) any later
8 // version.
10 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 // for more details.
15 // You should have received a copy of the GNU General Public License
16 // along with GCC; see the file COPYING3. If not see
17 // <http://www.gnu.org/licenses/>.
19 #include "rust-input-source.h"
20 #include "rust-system.h"
21 #include "optional.h"
22 #include "selftest.h"
23 #include "rust-lex.h"
24 #include "rust-unicode.h"
26 #include "rust-unicode-data.h"
28 namespace Rust {
30 typedef Codepoint codepoint_t;
31 typedef std::vector<codepoint_t> string_t;
// Constants used to compose and decompose Hangul syllables arithmetically.
// See `Sample Code for Hangul Algorithms` in section 3.12 of
// unicode.org/versions/Unicode15.0.0/ch03.pdf

// First code point of the precomposed syllable block.
const uint32_t S_BASE = 0xAC00;

// Bases of the leading consonant (L), vowel (V) and trailing consonant (T)
// jamo.  A T index of zero (T_BASE itself) stands for "no trailing consonant".
const uint32_t L_BASE = 0x1100;
const uint32_t V_BASE = 0x1161;
const uint32_t T_BASE = 0x11A7;

// Number of L, V and T indices.
const uint32_t L_COUNT = 19;
const uint32_t V_COUNT = 21;
const uint32_t T_COUNT = 28;

// Syllables per leading consonant, and syllables in total.
const uint32_t N_COUNT = V_COUNT * T_COUNT;
const uint32_t S_COUNT = L_COUNT * N_COUNT;
// Return true if TARGET_CP lies inside one of RANGES.  Every entry is a
// half-open interval [first, second).  The lookup is a binary search, so it
// assumes RANGES is sorted and its intervals do not overlap.
template <std::size_t SIZE>
bool
binary_search_ranges (
  const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
  uint32_t target_cp)
{
  // True for intervals that end at or before the codepoint being looked up.
  const auto ends_at_or_before
    = [] (const std::pair<uint32_t, uint32_t> &range, uint32_t cp) {
        return range.second <= cp;
      };

  // First interval whose exclusive upper bound lies above TARGET_CP; it is
  // the only one that can contain it.
  const auto candidate = std::lower_bound (ranges.begin (), ranges.end (),
                                           target_cp, ends_at_or_before);

  return candidate != ranges.end () && candidate->first <= target_cp
         && target_cp < candidate->second;
}
58 int
59 lookup_cc (codepoint_t c)
61 auto it = CCC_TABLE.find (c.value);
62 if (it != CCC_TABLE.end ())
63 return it->second;
64 else
65 // Starter. Returns zero.
66 return 0;
69 tl::optional<codepoint_t>
70 lookup_recomp (codepoint_t starter, codepoint_t c)
72 auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
73 if (it != Rust::RECOMPOSITION_MAP.end ())
74 return {it->second};
76 it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
77 if (it != Rust::RECOMPOSITION_MAP.end ())
78 return {it->second};
80 return tl::nullopt;
83 void
84 recursive_decomp_cano (codepoint_t c, string_t &buf)
86 auto it = Rust::DECOMPOSITION_MAP.find (c.value);
87 if (it != Rust::DECOMPOSITION_MAP.end ())
89 std::vector<uint32_t> decomped = it->second;
90 for (uint32_t cp : decomped)
91 recursive_decomp_cano (cp, buf);
93 else
94 buf.push_back (c);
97 string_t
98 decomp_cano (string_t s)
100 string_t buf;
101 for (codepoint_t c : s)
103 int64_t s_index = c.value - S_BASE;
104 if (0 <= s_index && s_index < S_COUNT)
106 // decompose Hangul argorithmically
107 uint32_t l = L_BASE + s_index / N_COUNT;
108 uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
109 uint32_t t = T_BASE + s_index % T_COUNT;
110 buf.push_back (l);
111 buf.push_back (v);
112 if (t != T_BASE)
113 buf.push_back (t);
114 continue;
117 // Current character is not hangul
118 recursive_decomp_cano (c, buf);
120 return buf;
123 void
124 sort_cano (string_t &s)
126 int cc_here, cc_prev;
127 if (s.size () == 1)
128 return;
129 for (unsigned int i = 1; i < s.size (); i++)
131 cc_here = lookup_cc (s[i]);
132 cc_prev = lookup_cc (s[i - 1]);
133 if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
135 // swap
136 codepoint_t tmp = s[i];
137 s[i] = s[i - 1];
138 s[i - 1] = tmp;
139 if (i > 1)
140 i -= 2;
145 string_t
146 compose_hangul (string_t s)
148 string_t buf;
149 if (s.size () < 2)
150 return s;
152 codepoint_t last = s[0];
153 buf.push_back (last);
154 for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
156 codepoint_t ch = s[src_pos];
158 // L V => LV
159 int64_t l_index = last.value - L_BASE;
160 if (0 <= l_index && l_index < L_COUNT)
162 int64_t v_index = ch.value - V_BASE;
163 if (0 <= v_index && v_index < V_COUNT)
165 last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
166 // pop L
167 buf.pop_back ();
168 buf.push_back (last);
169 continue;
173 // LV T => LVT
174 int64_t s_index = last.value - S_BASE;
175 if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
177 int64_t t_index = ch.value - T_BASE;
178 if (0 < t_index && t_index < T_COUNT)
180 last.value += t_index;
181 // pop LV
182 buf.pop_back ();
183 buf.push_back (last);
184 continue;
187 last = ch;
188 buf.push_back (last);
190 return buf;
193 string_t
194 recomp (string_t s)
196 // compose hangul first
197 s = compose_hangul (s);
199 string_t buf;
200 if (s.size () < 2)
201 return s;
203 int last_class = -1;
204 // int starter_pos = 0; // Assume the first character is Starter. Correct?
205 // int target_pos = 1;
206 codepoint_t starter_ch = s[0];
208 for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
210 // get current character
211 codepoint_t ch = s[src_pos];
213 int ch_class = lookup_cc (ch);
214 tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
215 if (composite.has_value () && last_class < ch_class)
217 // ch can be composed
218 buf.push_back (composite.value ());
219 starter_ch = composite.value ();
221 else if (ch_class == 0)
223 // ch is Starter and cannot be composed.
224 if (src_pos == 1)
225 // FIXME: buggy?
226 buf.push_back (starter_ch);
227 starter_ch = ch;
228 last_class = -1;
229 buf.push_back (ch);
231 else
233 if (src_pos == 1)
234 // FIXME: buggy?
235 buf.push_back (starter_ch);
236 // ch is not Starter.
237 last_class = ch_class;
238 buf.push_back (ch);
241 return buf;
244 // see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
245 QuickCheckResult
246 nfc_quick_check (const string_t &s)
248 int last_canonical_class = 0;
249 QuickCheckResult res = QuickCheckResult::YES;
251 for (unsigned long i = 0; i < s.size (); i++)
253 codepoint_t c = s[i];
255 if (c.is_supplementary_character ())
256 i++;
258 int canonical_class = lookup_cc (c);
259 if (last_canonical_class > canonical_class && canonical_class != 0)
260 return QuickCheckResult::NO;
262 if (is_nfc_qc_no (c.value))
263 return QuickCheckResult::NO;
265 if (is_nfc_qc_maybe (c.value))
266 res = QuickCheckResult::MAYBE;
268 last_canonical_class = canonical_class;
270 return res;
273 string_t
274 nfc_normalize (const string_t &s)
276 if (nfc_quick_check (s) == QuickCheckResult::YES)
277 return s;
279 // TODO: optimize normalization.
280 // i.e. only normalize a limited area around MAYBE character, instead of
281 // performing complete normlization of the entire string
283 // decompose
284 string_t d = decomp_cano (s);
285 sort_cano (d);
287 // recompose
288 string_t r = recomp (d);
289 return r;
292 Utf8String
293 Utf8String::nfc_normalize () const
295 return Utf8String (Rust::nfc_normalize (chars));
298 bool
299 is_alphabetic (uint32_t codepoint)
301 return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
304 bool
305 is_numeric (uint32_t codepoint)
307 return std::binary_search (NUMERIC_CODEPOINTS.begin (),
308 NUMERIC_CODEPOINTS.end (), codepoint);
311 bool
312 is_nfc_qc_maybe (uint32_t codepoint)
314 return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
317 bool
318 is_nfc_qc_no (uint32_t codepoint)
320 return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
323 bool
324 is_ascii_only (const std::string &str)
326 for (char c : str)
327 if (static_cast<uint32_t> (c) > MAX_ASCII_CODEPOINT)
328 return false;
329 return true;
332 } // namespace Rust
334 #if CHECKING_P
336 namespace selftest {
// Selftest for Rust::nfc_quick_check: one input for each of the three
// QuickCheckResult values (YES, MAYBE and NO).
338 void
339 rust_nfc_qc_test ()
341 ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
342 Rust::QuickCheckResult::YES);
343 ASSERT_EQ (Rust::nfc_quick_check (
344 {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
345 Rust::QuickCheckResult::MAYBE);
346 ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
347 Rust::QuickCheckResult::NO);
// Selftest helper: NFC-normalize ORIGIN with Rust::nfc_normalize and require
// the result to equal EXPECTED, first in length and then codepoint by
// codepoint.
350 void
351 assert_normalize (const std::vector<Rust::Codepoint> origin,
352 const std::vector<Rust::Codepoint> expected)
354 std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);
// The length is compared first, before the element-wise loop indexes EXPECTED.
356 ASSERT_EQ (actual.size (), expected.size ());
357 for (unsigned int i = 0; i < actual.size (); i++)
359 ASSERT_EQ (actual[i], expected[i]);
// Selftest for Rust::nfc_normalize.  Each assert_normalize call gives an
// input codepoint sequence followed by the sequence expected after
// normalization.
363 void
364 rust_utf8_normalize_test ()
366 // ASCII
367 assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
368 // ASCII
369 assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});
371 // testcases retrieved from Part0 of
372 // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
373 assert_normalize ({0x1e0a}, {0x1e0a});
374 assert_normalize ({0x1e0c}, {0x1e0c});
375 assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
376 assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
377 assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});
379 // testcases for Hangul from Part0
380 assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
381 assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
382 // testcases for Hangul from Part1
383 assert_normalize ({0x3131}, {0x3131});
384 assert_normalize ({0x3132}, {0x3132});
385 // testcases for Hangul from Part3
// Both inputs have U+0334 between the two Hangul codepoints and are expected
// to come back unchanged.
386 assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
387 assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
389 // TODO: add more testcases in
390 // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
// Selftest for the codepoint property predicates Rust::is_alphabetic and
// Rust::is_numeric, with accepted and rejected samples for each.
393 void
394 rust_utf8_property_test ()
// Codepoints that must be reported as alphabetic.
396 ASSERT_TRUE (Rust::is_alphabetic ('A'));
397 ASSERT_TRUE (Rust::is_alphabetic ('B'));
398 ASSERT_TRUE (Rust::is_alphabetic ('x'));
399 ASSERT_TRUE (Rust::is_alphabetic ('z'));
400 ASSERT_TRUE (Rust::is_alphabetic (0x00b5)); // µ
401 ASSERT_TRUE (Rust::is_alphabetic (0x3093)); // ん
402 ASSERT_TRUE (Rust::is_alphabetic (0xa8f2)); // ꣲ
403 ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃
// Codepoints that must not be reported as alphabetic.
405 ASSERT_FALSE (Rust::is_alphabetic ('\v'));
406 ASSERT_FALSE (Rust::is_alphabetic ('-'));
407 ASSERT_FALSE (Rust::is_alphabetic ('_'));
408 ASSERT_FALSE (Rust::is_alphabetic ('+'));
409 ASSERT_FALSE (Rust::is_alphabetic ('0'));
410 ASSERT_FALSE (Rust::is_alphabetic ('1'));
411 ASSERT_FALSE (Rust::is_alphabetic ('2'));
412 ASSERT_FALSE (Rust::is_alphabetic ('9'));
413 ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
414 ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁
// Codepoints that must be reported as numeric, grouped by general category.
416 // `Nd`s
417 ASSERT_TRUE (Rust::is_numeric ('0'));
418 ASSERT_TRUE (Rust::is_numeric ('1'));
419 ASSERT_TRUE (Rust::is_numeric ('7'));
420 ASSERT_TRUE (Rust::is_numeric ('9'));
421 ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
422 ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
423 // `Nl`s
424 ASSERT_TRUE (Rust::is_numeric (0x16e6)); // ᛮ
425 ASSERT_TRUE (Rust::is_numeric (0xa6e6)); // ꛦ
426 ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
427 ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
428 // `No`s
429 ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
430 ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱
// Codepoints that must not be reported as numeric.
432 ASSERT_FALSE (Rust::is_numeric ('\n'));
433 ASSERT_FALSE (Rust::is_numeric ('-'));
434 ASSERT_FALSE (Rust::is_numeric ('_'));
435 ASSERT_FALSE (Rust::is_numeric ('('));
436 ASSERT_FALSE (Rust::is_numeric ('z'));
437 ASSERT_FALSE (Rust::is_numeric (';'));
438 ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
439 ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
440 ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
441 ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
442 ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
445 } // namespace selftest
447 #endif // CHECKING_P