c++/coroutines: correct passing *this to promise type [PR104981]
[official-gcc.git] / libstdc++-v3 / include / bits / unicode.h
bloba14a17c5dfcf547e10dc3db948d150ed56237f65
1 // Unicode utilities -*- C++ -*-
3 // Copyright The GNU Toolchain Authors.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
25 /** @file include/bits/unicode.h
26 * This is an internal header file, included by other library headers.
27 * Do not attempt to use it directly. @headername{format}
28 */
30 #ifndef _GLIBCXX_UNICODE_H
31 #define _GLIBCXX_UNICODE_H 1
33 #if __cplusplus >= 202002L
34 #include <array>
35 #include <bit> // bit_width
36 #include <charconv> // __detail::__from_chars_alnum_to_val_table
37 #include <string_view>
38 #include <cstdint>
39 #include <bits/stl_algo.h>
40 #include <bits/stl_iterator.h>
41 #include <bits/ranges_base.h> // iterator_t, sentinel_t, input_range, etc.
42 #include <bits/ranges_util.h> // view_interface
44 namespace std _GLIBCXX_VISIBILITY(default)
46 _GLIBCXX_BEGIN_NAMESPACE_VERSION
47 namespace __unicode
// Test whether __c is a Unicode scalar value: a code point no greater
// than 0x10FFFF that is neither a high nor a low surrogate
// (0xD800..0xDFFF).
constexpr bool
__is_scalar_value(char32_t __c)
{
  // Everything below the surrogate block qualifies; this is the common case.
  if (__c < 0xD800) [[likely]]
    return true;

  // Otherwise it must lie above the surrogates and inside the codespace.
  const bool __past_surrogates = __c > 0xDFFF;
  const bool __inside_codespace = __c <= 0x10FFFF;
  return __past_surrogates && __inside_codespace;
}
58 // A code point that can be encoded in a single code unit of type _CharT.
59 template<typename _CharT>
60 constexpr bool
61 __is_single_code_unit(char32_t __c)
63 if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
64 return __c < 0x7F; // ASCII character
65 else
66 return __c < __gnu_cxx::__int_traits<_CharT>::__max
67 && __is_scalar_value(__c);
70 // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
// Default error handler for _Utf_iterator.  Calling it yields U+FFFD,
// the value substituted for ill-formed input.
struct _Repl
{
  constexpr char32_t
  operator()() const noexcept
  {
    constexpr char32_t __replacement = char32_t{0xFFFD};
    return __replacement;
  }
};
79 struct _Null_sentinel_t
81 template<input_iterator _It>
82 requires default_initializable<iter_value_t<_It>>
83 && equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
84 friend constexpr auto
85 operator==(_It __it, _Null_sentinel_t)
86 { return *__it == iter_value_t<_It>{}; }
// Iterator adaptor that transcodes between UTF encodings on the fly.
//
// It reads code units of type _FromFmt from the underlying range
// [_Iter, _Sent), decodes one code point at a time (_M_read), re-encodes
// that code point as code units of type _ToFmt into the small buffer
// _M_buf (_M_update), and hands those code units out one by one through
// operator*.  The source and target encodings are selected purely by
// sizeof(_FromFmt) / sizeof(_ToFmt): 1 -> UTF-8, 2 -> UTF-16, 4 -> UTF-32.
// Ill-formed input is replaced by whatever _ErrorHandler returns.
89 template<typename _FromFmt, typename _ToFmt,
90 input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
91 typename _ErrorHandler = _Repl>
92 requires convertible_to<iter_value_t<_Iter>, _FromFmt>
93 class _Utf_iterator
// An underlying iterator that is not a forward iterator is only accepted
// when invoking the error handler is noexcept.
95 static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));
97 public:
// Elements are produced by value, so `reference` is not a reference type.
98 using value_type = _ToFmt;
99 using difference_type = iter_difference_t<_Iter>;
100 using reference = value_type;
// Presumably the category of _Iter limited to at most bidirectional
// (semantics of __clamp_iter_cat are defined elsewhere -- confirm).
101 using iterator_concept
102 = std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
103 bidirectional_iterator_tag>;
105 constexpr _Utf_iterator() = default;
// Bidirectional case: remember the start of the range (__first) as well as
// the current position (__it).  Decodes the code point at __it unless the
// range is already exhausted, in which case the buffer is zeroed.
107 constexpr
108 _Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
109 requires bidirectional_iterator<_Iter>
110 : _M_first_and_curr{__first, __it}, _M_last(__last)
112 if (_M_curr() != _M_last)
113 _M_read();
114 else
115 _M_buf = {};
// Non-bidirectional case: only the current position is stored.
118 constexpr
119 _Utf_iterator(_Iter __it, _Sent __last)
120 requires (!bidirectional_iterator<_Iter>)
121 : _M_first_and_curr{__it}, _M_last(__last)
123 if (_M_curr() != _M_last)
124 _M_read();
125 else
126 _M_buf = {};
// Converting constructor from a _Utf_iterator over convertible iterator and
// sentinel types; copies the complete decoding state member by member.
129 template<class _Iter2, class _Sent2>
130 requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
131 constexpr
132 _Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
133 _ErrorHandler>& __other)
134 : _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
135 _M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
136 _M_last(__other._M_last)
// Start of the underlying range (only stored for bidirectional iterators).
139 [[nodiscard]]
140 constexpr _Iter
141 begin() const requires bidirectional_iterator<_Iter>
142 { return _M_first(); }
// End of the underlying range.
144 [[nodiscard]]
145 constexpr _Sent
146 end() const { return _M_last; }
// Current position in the underlying range.
148 [[nodiscard]]
149 constexpr _Iter
150 base() const requires forward_iterator<_Iter>
151 { return _M_curr(); }
// The current output code unit.
153 [[nodiscard]]
154 constexpr value_type
155 operator*() const { return _M_buf[_M_buf_index]; }
// Advance by one output code unit.
// If the last buffered code unit is current and input remains, move on to
// the next code point: a forward iterator is first advanced past the code
// units of the current code point (_M_to_increment); then either the next
// code point is decoded or, if that reached the end, the buffer index is
// reset to 0.  Otherwise, if more code units are buffered, just step to the
// next one.
// NOTE(review): when _M_curr() == _M_last already holds and the last
// buffered code unit is current, neither branch changes any state.  For
// non-forward iterators operator==(_Sent) additionally requires
// _M_buf_index == _M_buf_last -- confirm that the end is reachable for
// single-pass input.
157 constexpr _Utf_iterator&
158 operator++()
160 if (_M_buf_index + 1 == _M_buf_last && _M_curr() != _M_last)
162 if constexpr (forward_iterator<_Iter>)
163 std::advance(_M_curr(), _M_to_increment);
164 if (_M_curr() == _M_last)
165 _M_buf_index = 0;
166 else
167 _M_read();
169 else if (_M_buf_index + 1 < _M_buf_last)
170 ++_M_buf_index;
171 return *this;
174 constexpr _Utf_iterator
175 operator++(int)
177 auto __tmp = *this;
178 ++*this;
179 return __tmp;
// Step back by one output code unit: within the buffer if possible,
// otherwise by decoding the preceding code point.
// NOTE(review): _M_read_reverse() is only declared below (marked TODO), so
// the first branch cannot link/evaluate as the code stands.
182 constexpr _Utf_iterator&
183 operator--() requires bidirectional_iterator<_Iter>
185 if (!_M_buf_index && _M_curr() != _M_first())
186 _M_read_reverse();
187 else if (_M_buf_index)
188 --_M_buf_index;
189 return *this;
// NOTE(review): unlike operator--() this overload carries no
// `requires bidirectional_iterator<_Iter>` clause -- confirm intent.
192 constexpr _Utf_iterator
193 operator--(int)
195 auto __tmp = *this;
196 --*this;
197 return __tmp;
// Iterator equality.  Forward iterators: same underlying position and same
// index into the buffer.  Otherwise: positions must match, and either the
// buffer index and length both match, or both iterators sit at the end of
// their buffers.
200 [[nodiscard]]
201 friend constexpr bool
202 operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
203 requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
205 if constexpr (forward_iterator<_Iter>)
206 return __lhs._M_curr() == __rhs._M_curr()
207 && __lhs._M_buf_index == __rhs._M_buf_index;
208 else if (__lhs._M_curr() != __rhs._M_curr())
209 return false;
210 else if (__lhs._M_buf_index == __rhs._M_buf_index
211 && __lhs._M_buf_last == __rhs._M_buf_last)
212 return true;
213 else
214 return __lhs._M_buf_index == __lhs._M_buf_last
215 && __rhs._M_buf_index == __rhs._M_buf_last;
// Comparison with the underlying sentinel.  For non-forward iterators the
// underlying position reaching the sentinel is not enough: the buffer index
// must also equal the buffer length.
218 [[nodiscard]]
219 friend constexpr bool
220 operator==(_Utf_iterator __lhs, _Sent __rhs)
222 if constexpr (forward_iterator<_Iter>)
223 return __lhs._M_curr() == __rhs;
224 else
225 return __lhs._M_curr() == __rhs
226 && __lhs._M_buf_index == __lhs._M_buf_last;
229 private:
// Decode one code point at the current position, choosing the decoder from
// the size of the source code unit type.
230 constexpr void
231 _M_read()
233 if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
234 _M_read_utf8();
235 else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
236 _M_read_utf16();
237 else
239 static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
240 _M_read_utf32();
// Declared but not defined in this file.
244 constexpr void
245 _M_read_reverse(); // TODO
// Scope guard used by the decoders.  The primary template (iterators that
// are not forward iterators) does nothing.
// NOTE(review): this constructor is not declared constexpr although it is
// used from constexpr member functions -- confirm that is intended.
247 template<typename>
248 struct _Guard
250 _Guard(void*, _Iter&) { }
// For forward iterators the guard's destructor puts the current position
// back to the value saved in _M_orig, so that after a decode _M_curr()
// again refers to the first code unit of the decoded code point.
253 template<typename _It> requires forward_iterator<_It>
254 struct _Guard<_It>
256 constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
257 _Utf_iterator* _M_this;
258 _It _M_orig;
// Decode one code point from UTF-8 input.
// __to_incr counts the code units that belong to the code point; it is
// handed to _M_update and later used by operator++ to step over them.
// A continuation byte that fails validation is not consumed (no __incr()),
// so decoding resumes at that byte.
261 constexpr void
262 _M_read_utf8()
264 _Guard<_Iter> __g{this, _M_curr()};
265 char32_t __c{};
// Default valid range of a continuation byte.
266 const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
// Read the lead byte and step past it.
267 uint8_t __u = *_M_curr()++;
268 uint8_t __to_incr = 1;
// Consume one more code unit; returns the new position.
269 auto __incr = [&, this] {
270 ++__to_incr;
271 return ++_M_curr();
274 if (__u <= 0x7F) [[likely]] // 0x00 to 0x7F
275 __c = __u;
// 0x80..0xC1 cannot start a well-formed sequence (stray continuation bytes,
// or lead bytes that could only introduce an overlong two-byte form).
276 else if (__u < 0xC2) [[unlikely]]
277 __c = _S_error();
// A multi-byte lead byte with no input left after it.
278 else if (_M_curr() == _M_last) [[unlikely]]
279 __c = _S_error();
// Two-byte sequence: 5 payload bits from the lead byte, 6 from the trail.
280 else if (__u <= 0xDF) // 0xC2 to 0xDF
282 __c = __u & 0x1F;
283 __u = *_M_curr();
285 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
286 __c = _S_error();
287 else
289 __c = (__c << 6) | (__u & 0x3F);
290 __incr();
// Three-byte sequence.  The second byte's range is narrowed after 0xE0
// (lower bound 0xA0, rejecting overlong forms) and after 0xED (upper bound
// 0x9F, rejecting encoded surrogates).
293 else if (__u <= 0xEF) // 0xE0 to 0xEF
295 const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
296 const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;
298 __c = __u & 0x0F;
299 __u = *_M_curr();
301 if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
302 __c = _S_error();
// Second byte accepted and consumed, but the input ends before the third.
303 else if (__incr() == _M_last) [[unlikely]]
304 __c = _S_error();
305 else
307 __c = (__c << 6) | (__u & 0x3F);
308 __u = *_M_curr();
310 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
311 __c = _S_error();
312 else
314 __c = (__c << 6) | (__u & 0x3F);
315 __incr();
// Four-byte sequence.  The second byte's range is narrowed after 0xF0
// (lower bound 0x90, rejecting overlong forms) and after 0xF4 (upper bound
// 0x8F, rejecting values above 0x10FFFF).
319 else if (__u <= 0xF4) // 0xF0 to 0xF4
321 const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
322 const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;
324 __c = __u & 0x07;
325 __u = *_M_curr();
327 if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
328 __c = _S_error();
329 else if (__incr() == _M_last) [[unlikely]]
330 __c = _S_error();
331 else
333 __c = (__c << 6) | (__u & 0x3F);
334 __u = *_M_curr();
336 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
337 __c = _S_error();
338 else if (__incr() == _M_last) [[unlikely]]
339 __c = _S_error();
340 else
342 __c = (__c << 6) | (__u & 0x3F);
343 __u = *_M_curr();
345 if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
346 __c = _S_error();
347 else
349 __c = (__c << 6) | (__u & 0x3F);
350 __incr();
// Lead bytes 0xF5..0xFF.
355 else [[unlikely]]
356 __c = _S_error();
358 _M_update(__c, __to_incr);
// Decode one code point from UTF-16 input.
361 constexpr void
362 _M_read_utf16()
364 _Guard<_Iter> __g{this, _M_curr()};
365 char32_t __c{};
366 uint16_t __u = *_M_curr()++;
367 uint8_t __to_incr = 1;
// Not a surrogate: the code unit is the code point.
369 if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
370 __c = __u;
// High surrogate (0xD800..0xDBFF) with more input available: the next code
// unit must be a low surrogate (0xDC00..0xDFFF); it is only consumed if so.
371 else if (__u < 0xDC00 && _M_curr() != _M_last)
373 uint16_t __u2 = *_M_curr();
374 if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]
375 __c = _S_error();
376 else
378 ++_M_curr();
379 __to_incr = 2;
// __x: low 16 bits of the result (6 bits from the high surrogate, 10 from
// the low one).  __w: the next bits of the high surrogate; adding 1
// accounts for the 0x10000 offset of supplementary code points.
380 uint32_t __x = (__u & 0x3F) << 10 | __u2 & 0x3FF;
381 uint32_t __w = (__u >> 6) & 0x1F;
382 __c = (__w + 1) << 16 | __x;
// A low surrogate on its own, or a high surrogate at the end of the input.
385 else
386 __c = _S_error();
388 _M_update(__c, __to_incr);
// Decode one code point from UTF-32 input; anything that is not a scalar
// value is replaced.
391 constexpr void
392 _M_read_utf32()
394 _Guard<_Iter> __g{this, _M_curr()};
395 char32_t __c = *_M_curr()++;
396 if (!__is_scalar_value(__c)) [[unlikely]]
397 __c = _S_error();
398 _M_update(__c, 1);
// __to_incr is the number of input code units the code point occupied.
// Resets the buffer index to 0 and sets _M_buf_last to the number of output
// code units written.
401 // Encode the code point __c as one or more code units in _M_buf.
402 constexpr void
403 _M_update(char32_t __c, uint8_t __to_incr)
405 _M_to_increment = __to_incr;
406 _M_buf_index = 0;
// UTF-32 output: the code point itself.
407 if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
409 _M_buf[0] = __c;
410 _M_buf_last = 1;
// UTF-16 output: one code unit, or a lead/trail surrogate pair.
412 else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
414 if (__is_single_code_unit<_ToFmt>(__c))
416 _M_buf[0] = __c;
417 _M_buf[1] = 0;
418 _M_buf_last = 1;
420 else
422 // From http://www.unicode.org/faq/utf_bom.html#utf16-4
423 const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
424 char16_t __lead = __lead_offset + (__c >> 10);
425 char16_t __trail = 0xDC00 + (__c & 0x3FF);
426 _M_buf[0] = __lead;
427 _M_buf[1] = __trail;
428 _M_buf_last = 2;
// UTF-8 output: the sequence length is chosen from the number of
// significant bits (<= 7, <= 11, <= 16, more).  Unused buffer slots are
// zeroed.
431 else
433 static_assert(sizeof(_ToFmt) == 1);
434 int __bits = std::bit_width((uint32_t)__c);
435 if (__bits <= 7) [[likely]]
437 _M_buf[0] = __c;
438 _M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
439 _M_buf_last = 1;
441 else if (__bits <= 11)
443 _M_buf[0] = 0xC0 | (__c >> 6);
444 _M_buf[1] = 0x80 | (__c & 0x3F);
445 _M_buf[2] = _M_buf[3] = 0;
446 _M_buf_last = 2;
448 else if (__bits <= 16)
450 _M_buf[0] = 0xE0 | (__c >> 12);
451 _M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
452 _M_buf[2] = 0x80 | (__c & 0x3F);
453 _M_buf[3] = 0;
454 _M_buf_last = 3;
456 else
458 _M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
459 _M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
460 _M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
461 _M_buf[3] = 0x80 | (__c & 0x3F);
462 _M_buf_last = 4;
// Obtain the substitute for ill-formed input from a default-constructed
// _ErrorHandler; the result is asserted to be a scalar value.
467 constexpr char32_t
468 _S_error()
470 char32_t __c = _ErrorHandler()();
471 __glibcxx_assert(__is_scalar_value(__c));
472 return __c;
// Accessors for the positions stored in _M_first_and_curr.
475 constexpr _Iter
476 _M_first() const requires bidirectional_iterator<_Iter>
477 { return _M_first_and_curr._M_first; }
479 constexpr _Iter&
480 _M_curr() { return _M_first_and_curr._M_curr; }
482 constexpr _Iter
483 _M_curr() const { return _M_first_and_curr._M_curr; }
// Output code units of the current code point: 4 for 8-bit, 2 for 16-bit
// and 1 for 32-bit output.
485 array<value_type, 4 / sizeof(_ToFmt)> _M_buf;
// Storage for the underlying position(s).  The primary template keeps only
// the current position.
487 template<typename _It>
488 struct _First_and_curr
490 _First_and_curr() = default;
492 constexpr
493 _First_and_curr(_It __curr) : _M_curr(__curr) { }
495 template<convertible_to<_It> _It2>
496 constexpr
497 _First_and_curr(const _First_and_curr<_It2>& __other)
498 : _M_curr(__other._M_curr) { }
500 _It _M_curr;
// For bidirectional iterators the start of the range is kept as well.
503 template<typename _It> requires bidirectional_iterator<_It>
504 struct _First_and_curr<_It>
506 _First_and_curr() = default;
508 constexpr
509 _First_and_curr(_It __first, _It __curr)
510 : _M_first(__first), _M_curr(__curr) { }
512 template<convertible_to<_It> _It2>
513 constexpr
514 _First_and_curr(const _First_and_curr<_It2>& __other)
515 : _M_first(__other._M_first), _M_curr(__other._M_curr) { }
517 _It _M_first;
518 _It _M_curr;
521 _First_and_curr<_Iter> _M_first_and_curr;
// _M_buf_index: index of the current code unit in _M_buf.
// _M_buf_last: number of valid code units in _M_buf.
// _M_to_increment: input code units occupied by the current code point.
523 uint8_t _M_buf_index = 0;
524 uint8_t _M_buf_last = 0;
525 uint8_t _M_to_increment = 0;
// End of the underlying range.
527 [[no_unique_address]] _Sent _M_last;
// Every specialization is a friend, which lets the converting constructor
// read the private members of its argument.
529 template<typename _FromFmt2, typename _ToFmt2,
530 input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
531 typename _ErrHandler>
532 requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
533 friend class _Utf_iterator;
536 template<typename _ToFormat, ranges::input_range _Range>
537 class _Utf_view
538 : public ranges::view_interface<_Utf_view<_ToFormat, _Range>>
540 using _Iterator = _Utf_iterator<ranges::range_value_t<_Range>,
541 _ToFormat, ranges::iterator_t<_Range>,
542 ranges::sentinel_t<_Range>>;
544 template<typename _Iter, typename _Sent>
545 constexpr auto
546 _M_begin(_Iter __first, _Sent __last)
548 if constexpr (bidirectional_iterator<_Iter>)
549 return _Iterator(__first, __first, __last);
550 else
551 return _Iterator(__first, __last);
554 template<typename _Iter, typename _Sent>
555 constexpr auto
556 _M_end(_Iter __first, _Sent __last)
558 if constexpr (!is_same_v<_Iter, _Sent>)
559 return __last;
560 else if constexpr (bidirectional_iterator<_Iter>)
561 return _Iterator(__first, __last, __last);
562 else
563 return _Iterator(__last, __last);
566 _Range _M_base;
568 public:
569 constexpr explicit
570 _Utf_view(_Range&& __r) : _M_base(std::forward<_Range>(__r)) { }
572 constexpr auto begin()
573 { return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }
575 constexpr auto end()
576 { return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }
578 constexpr bool empty() const { return ranges::empty(_M_base); }
// Convenience aliases of _Utf_view, one per target encoding.
// UTF-8 output uses char8_t code units when the compiler provides that type
// (__cpp_char8_t is defined) and plain char otherwise.
581 #ifdef __cpp_char8_t
582 template<typename _View>
583 using _Utf8_view = _Utf_view<char8_t, _View>;
584 #else
585 template<typename _View>
586 using _Utf8_view = _Utf_view<char, _View>;
587 #endif
// UTF-16 output as char16_t code units.
588 template<typename _View>
589 using _Utf16_view = _Utf_view<char16_t, _View>;
// UTF-32 output as char32_t code units (one per code point).
590 template<typename _View>
591 using _Utf32_view = _Utf_view<char32_t, _View>;
593 inline namespace __v15_1_0
// Pull in the generated Unicode property tables.  The macro is defined to
// the requested data version before the include (150100 presumably encodes
// Unicode 15.1.0, matching the enclosing inline namespace name).  If the
// macro is still defined after the include the build is stopped; presumably
// unicode-data.h undefines it only when it supplies data for that version
// -- confirm in unicode-data.h.
595 #define _GLIBCXX_GET_UNICODE_DATA 150100
596 #include "unicode-data.h"
597 #ifdef _GLIBCXX_GET_UNICODE_DATA
598 # error "Invalid unicode data"
599 #endif
601 // The field width of a code point.
602 constexpr int
603 __field_width(char32_t __c) noexcept
605 if (__c < __width_edges[0]) [[likely]]
606 return 1;
608 auto* __p = std::upper_bound(__width_edges, std::end(__width_edges), __c);
609 return (__p - __width_edges) % 2 + 1;
612 // @pre c <= 0x10FFFF
613 constexpr _Gcb_property
614 __grapheme_cluster_break_property(char32_t __c) noexcept
616 constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
617 auto* __end = std::end(__gcb_edges);
618 auto* __p = std::lower_bound(__gcb_edges, __end,
619 (__c << __gcb_shift_bits) | __mask);
620 return _Gcb_property(__p[-1] & __mask);
623 constexpr bool
624 __is_incb_linker(char32_t __c) noexcept
626 const auto __end = std::end(__incb_linkers);
627 // Array is small enough that linear search is faster than binary search.
628 return std::find(__incb_linkers, __end, __c) != __end;
631 // @pre c <= 0x10FFFF
632 constexpr _InCB
633 __incb_property(char32_t __c) noexcept
635 if ((__c << 2) < __incb_edges[0]) [[likely]]
636 return _InCB(0);
638 constexpr uint32_t __mask = 0x3;
639 auto* __end = std::end(__incb_edges);
640 auto* __p = std::lower_bound(__incb_edges, __end, (__c << 2) | __mask);
641 return _InCB(__p[-1] & __mask);
644 constexpr bool
645 __is_extended_pictographic(char32_t __c)
647 if (__c < __xpicto_edges[0]) [[likely]]
648 return 0;
650 auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
651 return (__p - __xpicto_edges) % 2;
654 struct _Grapheme_cluster_iterator_base
656 char32_t _M_c; // First code point in the cluster.
657 _Gcb_property _M_prop; // GCB property of _M_c.
658 enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
659 _XPicto _M_xpicto_seq_state = _XPicto::_Init;
660 unsigned char _M_RI_count = 0;
661 bool _M_incb_linker_seen = false;
663 constexpr void
664 _M_reset(char32_t __c, _Gcb_property __p)
666 _M_c = __c;
667 _M_prop = __p;
668 _M_xpicto_seq_state = _XPicto::_Init;
669 _M_RI_count = 0;
670 _M_incb_linker_seen = false;
673 constexpr void
674 _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
676 if (_M_xpicto_seq_state == _XPicto::_Failed)
677 return;
679 auto __next_state = _XPicto::_Failed;
680 if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
682 if (__p == _Gcb_property::_Gcb_ZWJ)
684 if (_M_xpicto_seq_state == _XPicto::_Matched)
685 __next_state = _XPicto::_Zwj;
686 // We check _M_c here so that we do the lookup at most once,
687 // and only for clusters containing at least one ZWJ.
688 else if (__is_extended_pictographic(_M_c))
689 __next_state = _XPicto::_Zwj;
691 else if (__p == _Gcb_property::_Gcb_Extend)
692 __next_state = _M_xpicto_seq_state; // no change
694 else // Zwj
696 // This assumes that all \p{Extended_Pictographic} emoji have
697 // Grapheme_Cluster_Break=Other.
698 if (__p == _Gcb_property::_Gcb_Other
699 && __is_extended_pictographic(__c))
700 __next_state = _XPicto::_Matched;
702 _M_xpicto_seq_state = __next_state;
705 constexpr void
706 _M_update_ri_count(_Gcb_property __p)
708 if (__p == _Gcb_property::_Gcb_Regional_Indicator)
709 ++_M_RI_count;
710 else
711 _M_RI_count = 0;
714 constexpr void
715 _M_update_incb_state(char32_t __c, _Gcb_property)
717 if (__is_incb_linker(__c))
718 _M_incb_linker_seen = true;
// The view stores a single iterator (_M_begin) positioned at the first
// cluster; that iterator also knows the end of the underlying range.
// Dereferencing an iterator gives the first code point of its cluster.
722 // Split a range into extended grapheme clusters.
723 template<ranges::forward_range _View> requires ranges::view<_View>
724 class _Grapheme_cluster_view
725 : public ranges::view_interface<_Grapheme_cluster_view<_View>>
727 public:
// Wraps __v in a temporary _Utf32_view and keeps only its begin iterator.
// NOTE(review): the temporary view is destroyed at the end of the
// constructor; this relies on iterators into _View staying valid after the
// view object itself is gone -- confirm for the view types used.
729 constexpr
730 _Grapheme_cluster_view(_View __v)
731 : _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
// begin() returns a copy of the stored iterator; end() returns the end of
// the underlying range as reported by that iterator.
734 constexpr auto begin() const { return _M_begin; }
735 constexpr auto end() const { return _M_begin.end(); }
737 private:
738 struct _Iterator : private _Grapheme_cluster_iterator_base
740 private:
741 // Iterator over the underlying code points.
742 using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;
744 public:
745 // TODO: Change value_type to be subrange<_U32_iterator> instead?
746 // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
747 // That would be the whole cluster, not just the first code point.
748 // Would need to store two iterators and find end of current cluster
749 // on increment, so operator* returns value_type(_M_base, _M_next).
750 using value_type = char32_t;
751 using iterator_concept = forward_iterator_tag;
752 using difference_type = ptrdiff_t;
// Start at code point iterator __i.  If the range is not empty, record the
// first code point and its break property; otherwise _M_c and _M_prop are
// not assigned here.
754 constexpr
755 _Iterator(_U32_iterator __i)
756 : _M_base(__i)
758 if (__i != __i.end())
760 _M_c = *__i;
761 _M_prop = __grapheme_cluster_break_property(_M_c);
765 // The first code point of the current extended grapheme cluster.
766 constexpr value_type
767 operator*() const
768 { return _M_c; }
// Pointer to the stored first code point of the cluster.
770 constexpr auto
771 operator->() const
772 { return &_M_c; }
// Walks the code points after the current position, feeding each one to the
// state updates of the base class, and stops at the first position where
// _M_is_break reports a boundary.  That code point becomes the first code
// point of the new cluster (_M_reset).  If no boundary is found the
// iterator ends up at the end of the range.  Does nothing at the end.
774 // Move to the next extended grapheme cluster.
775 constexpr _Iterator&
776 operator++()
778 const auto __end = _M_base.end();
779 if (_M_base != __end)
781 auto __p_prev = _M_prop;
782 auto __it = _M_base;
783 while (++__it != __end)
785 char32_t __c = *__it;
786 auto __p = __grapheme_cluster_break_property(*__it);
787 _M_update_xpicto_seq_state(__c, __p);
788 _M_update_ri_count(__p);
789 _M_update_incb_state(__c, __p);
790 if (_M_is_break(__p_prev, __p, __it))
792 // Found a grapheme cluster break
793 _M_reset(__c, __p);
794 break;
796 __p_prev = __p;
798 _M_base = __it;
800 return *this;
803 constexpr _Iterator
804 operator++(int)
806 auto __tmp = *this;
807 ++*this;
808 return __tmp;
// Iterators are equal when they refer to the same underlying position; the
// per-cluster state is not compared.
811 constexpr bool
812 operator==(const _Iterator& __i) const
813 { return _M_base == __i._M_base; }
815 // This supports iter != iter.end()
816 constexpr bool
817 operator==(const ranges::sentinel_t<_View>& __i) const
818 { return _M_base == __i; }
820 // Iterator to the start of the current cluster.
821 constexpr auto base() const { return _M_base.base(); }
823 // The end of the underlying view (not the end of the current cluster!)
824 constexpr auto end() const { return _M_base.end(); }
826 // Field width of the first code point in the cluster.
827 constexpr int
828 width() const noexcept
829 { return __field_width(_M_c); }
831 private:
// Code point iterator at the first code point of the current cluster.
832 _U32_iterator _M_base;
// __curr refers to the code point that has property __p2.  The rules are
// tested in order and the first one that applies decides.
834 // Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
835 // http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
836 // This implements the rules from TR29 revision 43 in Unicode 15.1.0.
837 // Return true if there is a break between code point with property p1
838 // and code point with property p2.
839 constexpr bool
840 _M_is_break(_Gcb_property __p1, _Gcb_property __p2,
841 _U32_iterator __curr) const
843 using enum _Gcb_property;
// Rule GB4 (for Control and LF)
845 if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
846 return true; // Break after Control or LF.
// Rules GB3 and GB4 (for CR)
848 if (__p1 == _Gcb_CR)
849 return __p2 != _Gcb_LF; // Do not break between a CR and LF.
851 // Rule GB5
852 if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
853 return true; // Break before Control, CR or LF.
855 // Rule GB6
856 if (__p1 == _Gcb_L)
857 switch (__p2)
859 case _Gcb_L:
860 case _Gcb_V:
861 case _Gcb_LV:
862 case _Gcb_LVT:
863 return false; // Do not break Hangul syllable sequences.
864 default:
865 return true;
868 // Rule GB7
869 if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
870 switch (__p2)
872 case _Gcb_V:
873 case _Gcb_T:
874 return false; // Do not break Hangul syllable sequences.
875 default:
876 return true;
879 // Rule GB8
880 if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
881 return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.
883 // Rule GB9
884 if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
885 return false; // Do not break before extending characters or ZWJ.
887 // The following GB9x rules only apply to extended grapheme clusters,
888 // which is what the C++ standard uses (not legacy grapheme clusters).
890 // Rule GB9a
891 if (__p2 == _Gcb_SpacingMark)
892 return false; // Do not break before SpacingMarks,
893 // Rule GB9b
894 if (__p1 == _Gcb_Prepend)
895 return false; // or after Prepend characters.
// The scan below is only attempted when a linker has been seen in this
// cluster and both the cluster's first code point and the current code
// point are consonants.
897 // Rule GB9c (Unicode 15.1.0)
898 // Do not break within certain combinations with
899 // Indic_Conjunct_Break (InCB)=Linker.
900 if (_M_incb_linker_seen
901 && __incb_property(_M_c) == _InCB::_Consonant
902 && __incb_property(*__curr) == _InCB::_Consonant)
904 // Match [_M_base, __curr] against regular expression
905 // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
// __have_linker is set by a linker and cleared by a consonant; any code
// point that is neither a linker, a consonant nor InCB=Extend stops the
// scan early, so the match fails.
906 bool __have_linker = false;
907 auto __it = _M_base;
908 while (++__it != __curr)
910 if (__is_incb_linker(*__it))
911 __have_linker = true;
912 else
914 auto __incb = __incb_property(*__it);
915 if (__incb == _InCB::_Consonant)
916 __have_linker = false;
917 else if (__incb != _InCB::_Extend)
918 break;
921 if (__it == __curr && __have_linker)
922 return false;
925 // Rule GB11
926 // Do not break within emoji modifier sequences
927 // or emoji zwj sequences.
928 if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
929 return false;
// _M_RI_count has already been updated for the code point with property
// __p2 when this runs (see operator++), and the first code point of the
// cluster is not counted; an even count therefore means a break.
931 // Rules GB12 and GB13
932 // Do not break within emoji flag sequences. That is, do not break
933 // between regional indicator (RI) symbols if there is an odd number
934 // of RI characters before the break point.
935 if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
936 return (_M_RI_count & 1) == 0;
938 // Rule GB999
939 return true; // Otherwise, break everywhere.
// Iterator to the first cluster of the view.
943 _Iterator _M_begin;
946 } // namespace __v15_1_0
948 // Return the field width of a string.
949 template<typename _CharT>
950 constexpr size_t
951 __field_width(basic_string_view<_CharT> __s)
953 if (__s.empty()) [[unlikely]]
954 return 0;
955 _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
956 auto __it = __gc.begin();
957 const auto __end = __gc.end();
958 size_t __n = __it.width();
959 while (++__it != __end)
960 __n += __it.width();
961 return __n;
964 // Truncate a string to at most `__max` field width units, and return the
965 // resulting field width.
966 template<typename _CharT>
967 constexpr size_t
968 __truncate(basic_string_view<_CharT>& __s, size_t __max)
970 if (__s.empty()) [[unlikely]]
971 return 0;
973 _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
974 auto __it = __gc.begin();
975 const auto __end = __gc.end();
976 size_t __n = __it.width();
977 if (__n > __max)
979 __s = {};
980 return 0;
982 while (++__it != __end)
984 size_t __n2 = __n + __it.width();
985 if (__n2 > __max)
987 __s = basic_string_view<_CharT>(__s.begin(), __it.base());
988 return __n;
990 __n = __n2;
992 return __n;
// Compile-time test of whether literals of character type _CharT use a
// Unicode encoding.
// char16_t, char32_t and (when available) char8_t always do.  For any other
// _CharT the answer is derived from the encoding name the compiler reports
// through predefined macros; if no such macro is available the result is
// false.
995 template<typename _CharT>
996 consteval bool
997 __literal_encoding_is_unicode()
999 if constexpr (is_same_v<_CharT, char16_t>)
1000 return true;
1001 else if constexpr (is_same_v<_CharT, char32_t>)
1002 return true;
1003 #ifdef __cpp_char8_t
1004 else if constexpr (is_same_v<_CharT, char8_t>)
1005 return true;
1006 #endif
// Encoding name to examine; stays empty when no macro provides one.
1008 const char* __enc = "";
1010 #ifdef __GNUC_EXECUTION_CHARSET_NAME
// Skips "ISO" (any letter case) followed by "-10646/", 10 characters in
// total; any other name is returned unchanged.
1011 auto __remove_iso10646_prefix = [](const char* __s) {
1012 // GNU iconv allows "ISO-10646/" prefix (case-insensitive).
1013 if (__s[0] == 'I' || __s[0] == 'i')
1014 if (__s[1] == 'S' || __s[1] == 's')
1015 if (__s[2] == 'O' || __s[2] == 'o')
1016 if (string_view(__s + 3).starts_with("-10646/"))
1017 return __s + 10;
1018 return __s;
// char uses the execution charset name, any other type the wide execution
// charset name (when that macro and wchar_t support are available).
1021 if constexpr (is_same_v<_CharT, char>)
1022 __enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
1023 # if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
1024 else
1025 __enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
1026 # endif
// Accept "UTF" in any letter case, an optional '-', then "8" either alone
// or followed by "//".  For types other than char, "16" and "32" are
// accepted as well, with an optional "//" suffix.
1028 if ((__enc[0] == 'U' || __enc[0] == 'u')
1029 && (__enc[1] == 'T' || __enc[1] == 't')
1030 && (__enc[2] == 'F' || __enc[2] == 'f'))
1032 __enc += 3;
1033 if (__enc[0] == '-')
1034 ++__enc;
1035 if (__enc[0] == '8')
1036 return __enc[1] == '\0' || string_view(__enc + 1) == "//";
1037 else if constexpr (!is_same_v<_CharT, char>)
1039 string_view __s(__enc);
1040 if (__s.ends_with("//"))
1041 __s.remove_suffix(2);
1042 return __s == "16" || __s == "32";
// Clang: the name is compared exactly, with no case folding.
1045 #elif defined __clang_literal_encoding__
1046 if constexpr (is_same_v<_CharT, char>)
1047 __enc = __clang_literal_encoding__;
1048 # if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
1049 else
1050 __enc = __clang_wide_literal_encoding__;
1051 # endif
1052 // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
1053 string_view __s(__enc);
1054 if (__s == "UTF-8")
1055 return true;
1056 else if constexpr (!is_same_v<_CharT, char>)
1057 return __s == "UTF-16" || __s == "UTF-32";
1058 #endif
// No recognised Unicode encoding name.
1060 return false;
1063 consteval bool
1064 __literal_encoding_is_utf8()
1065 { return __literal_encoding_is_unicode<char>(); }
1067 consteval bool
1068 __literal_encoding_is_extended_ascii()
1070 return '0' == 0x30 && 'A' == 0x41 && 'Z' == 0x5a
1071 && 'a' == 0x61 && 'z' == 0x7a;
1074 // https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
1075 constexpr bool
1076 __charset_alias_match(string_view __a, string_view __b)
1078 // Map alphanumeric chars to their base 64 value, everything else to 127.
1079 auto __map = [](char __c, bool& __num) -> unsigned char {
1080 if (__c == '0') [[unlikely]]
1081 return __num ? 0 : 127;
1082 const auto __v = __detail::__from_chars_alnum_to_val(__c);
1083 __num = __v < 10;
1084 return __v;
1087 auto __ptr_a = __a.begin(), __end_a = __a.end();
1088 auto __ptr_b = __b.begin(), __end_b = __b.end();
1089 bool __num_a = false, __num_b = false;
1091 while (true)
1093 // Find the value of the next alphanumeric character in each string.
1094 unsigned char __val_a{}, __val_b{};
1095 while (__ptr_a != __end_a
1096 && (__val_a = __map(*__ptr_a, __num_a)) == 127)
1097 ++__ptr_a;
1098 while (__ptr_b != __end_b
1099 && (__val_b = __map(*__ptr_b, __num_b)) == 127)
1100 ++__ptr_b;
1101 // Stop when we reach the end of a string, or get a mismatch.
1102 if (__ptr_a == __end_a)
1103 return __ptr_b == __end_b;
1104 else if (__ptr_b == __end_b)
1105 return false;
1106 else if (__val_a != __val_b)
1107 return false; // Found non-matching characters.
1108 ++__ptr_a;
1109 ++__ptr_b;
1111 return true;
1114 } // namespace __unicode
// Specializations of enable_borrowed_range for the two view adaptors of
// namespace __unicode: each adaptor is enabled as a borrowed range exactly
// when the range it wraps is.
1116 namespace ranges
// _Utf_view follows its underlying _Range.
1118 template<typename _To, typename _Range>
1119 inline constexpr bool
1120 enable_borrowed_range<std::__unicode::_Utf_view<_To, _Range>>
1121 = enable_borrowed_range<_Range>;
// _Grapheme_cluster_view follows its underlying _Range.
1123 template<typename _Range>
1124 inline constexpr bool
1125 enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_Range>>
1126 = enable_borrowed_range<_Range>;
1127 } // namespace ranges
1129 _GLIBCXX_END_NAMESPACE_VERSION
1130 } // namespace std
1131 #endif // C++20
1132 #endif // _GLIBCXX_UNICODE_H