1 // Unicode utilities -*- C++ -*-
3 // Copyright The GNU Toolchain Authors.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
25 /** @file include/bits/unicode.h
26 * This is an internal header file, included by other library headers.
27 * Do not attempt to use it directly. @headername{format}
30 #ifndef _GLIBCXX_UNICODE_H
31 #define _GLIBCXX_UNICODE_H 1
33 #if __cplusplus >= 202002L
35 #include <bit> // bit_width
36 #include <charconv> // __detail::__from_chars_alnum_to_val_table
37 #include <string_view>
39 #include <bits/stl_algo.h>
40 #include <bits/stl_iterator.h>
41 #include <bits/ranges_base.h> // iterator_t, sentinel_t, input_range, etc.
42 #include <bits/ranges_util.h> // view_interface
44 namespace std
_GLIBCXX_VISIBILITY(default)
46 _GLIBCXX_BEGIN_NAMESPACE_VERSION
// A Unicode scalar value: any code point up to U+10FFFF that is not a
// high or low surrogate (U+D800..U+DFFF).
constexpr bool
__is_scalar_value(char32_t __c)
{
  // Anything at or beyond the start of the surrogate block is the rare
  // case: it is a scalar value only if it lies past the surrogates and
  // does not exceed the last code point.
  if (__c >= 0xD800) [[unlikely]]
    return __c > 0xDFFF && __c <= 0x10FFFF;
  return true;
}
58 // A code point that can be encoded in a single code unit of type _CharT.
59 template<typename _CharT
>
61 __is_single_code_unit(char32_t __c
)
63 if constexpr (__gnu_cxx::__int_traits
<_CharT
>::__max
<= 0xFF)
64 return __c
< 0x7F; // ASCII character
66 return __c
< __gnu_cxx::__int_traits
<_CharT
>::__max
67 && __is_scalar_value(__c
);
70 // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
75 operator()() const noexcept
79 struct _Null_sentinel_t
81 template<input_iterator _It
>
82 requires default_initializable
<iter_value_t
<_It
>>
83 && equality_comparable_with
<iter_reference_t
<_It
>, iter_value_t
<_It
>>
85 operator==(_It __it
, _Null_sentinel_t
)
86 { return *__it
== iter_value_t
<_It
>{}; }
89 template<typename _FromFmt
, typename _ToFmt
,
90 input_iterator _Iter
, sentinel_for
<_Iter
> _Sent
= _Iter
,
91 typename _ErrorHandler
= _Repl
>
92 requires convertible_to
<iter_value_t
<_Iter
>, _FromFmt
>
95 static_assert(forward_iterator
<_Iter
> || noexcept(_ErrorHandler()()));
98 using value_type
= _ToFmt
;
99 using difference_type
= iter_difference_t
<_Iter
>;
100 using reference
= value_type
;
101 using iterator_concept
102 = std::__detail::__clamp_iter_cat
<__iter_category_t
<_Iter
>,
103 bidirectional_iterator_tag
>;
105 constexpr _Utf_iterator() = default;
108 _Utf_iterator(_Iter __first
, _Iter __it
, _Sent __last
)
109 requires bidirectional_iterator
<_Iter
>
110 : _M_first_and_curr
{__first
, __it
}, _M_last(__last
)
112 if (_M_curr() != _M_last
)
119 _Utf_iterator(_Iter __it
, _Sent __last
)
120 requires (!bidirectional_iterator
<_Iter
>)
121 : _M_first_and_curr
{__it
}, _M_last(__last
)
123 if (_M_curr() != _M_last
)
129 template<class _Iter2
, class _Sent2
>
130 requires convertible_to
<_Iter2
, _Iter
> && convertible_to
<_Sent2
, _Sent
>
132 _Utf_iterator(const _Utf_iterator
<_FromFmt
, _ToFmt
, _Iter2
, _Sent2
,
133 _ErrorHandler
>& __other
)
134 : _M_buf(__other
._M_buf
), _M_first_and_curr(__other
._M_first_and_curr
),
135 _M_buf_index(__other
._M_buf_index
), _M_buf_last(__other
._M_buf_last
),
136 _M_last(__other
._M_last
)
141 begin() const requires bidirectional_iterator
<_Iter
>
142 { return _M_first(); }
146 end() const { return _M_last
; }
150 base() const requires forward_iterator
<_Iter
>
151 { return _M_curr(); }
155 operator*() const { return _M_buf
[_M_buf_index
]; }
157 constexpr _Utf_iterator
&
160 if (_M_buf_index
+ 1 == _M_buf_last
&& _M_curr() != _M_last
)
162 if constexpr (forward_iterator
<_Iter
>)
163 std::advance(_M_curr(), _M_to_increment
);
164 if (_M_curr() == _M_last
)
169 else if (_M_buf_index
+ 1 < _M_buf_last
)
174 constexpr _Utf_iterator
182 constexpr _Utf_iterator
&
183 operator--() requires bidirectional_iterator
<_Iter
>
185 if (!_M_buf_index
&& _M_curr() != _M_first())
187 else if (_M_buf_index
)
192 constexpr _Utf_iterator
201 friend constexpr bool
202 operator==(_Utf_iterator __lhs
, _Utf_iterator __rhs
)
203 requires forward_iterator
<_Iter
> || requires (_Iter __i
) { __i
!= __i
; }
205 if constexpr (forward_iterator
<_Iter
>)
206 return __lhs
._M_curr() == __rhs
._M_curr()
207 && __lhs
._M_buf_index
== __rhs
._M_buf_index
;
208 else if (__lhs
._M_curr() != __rhs
._M_curr())
210 else if (__lhs
._M_buf_index
== __rhs
._M_buf_index
211 && __lhs
._M_buf_last
== __rhs
._M_buf_last
)
214 return __lhs
._M_buf_index
== __lhs
._M_buf_last
215 && __rhs
._M_buf_index
== __rhs
._M_buf_last
;
219 friend constexpr bool
220 operator==(_Utf_iterator __lhs
, _Sent __rhs
)
222 if constexpr (forward_iterator
<_Iter
>)
223 return __lhs
._M_curr() == __rhs
;
225 return __lhs
._M_curr() == __rhs
226 && __lhs
._M_buf_index
== __lhs
._M_buf_last
;
233 if constexpr (sizeof(_FromFmt
) == sizeof(uint8_t))
235 else if constexpr (sizeof(_FromFmt
) == sizeof(uint16_t))
239 static_assert(sizeof(_FromFmt
) == sizeof(uint32_t));
245 _M_read_reverse(); // TODO
250 _Guard(void*, _Iter
&) { }
253 template<typename _It
> requires forward_iterator
<_It
>
256 constexpr ~_Guard() { _M_this
->_M_curr() = std::move(_M_orig
); }
257 _Utf_iterator
* _M_this
;
264 _Guard
<_Iter
> __g
{this, _M_curr()};
266 const uint8_t __lo_bound
= 0x80, __hi_bound
= 0xBF;
267 uint8_t __u
= *_M_curr()++;
268 uint8_t __to_incr
= 1;
269 auto __incr
= [&, this] {
274 if (__u
<= 0x7F) [[likely
]] // 0x00 to 0x7F
276 else if (__u
< 0xC2) [[unlikely
]]
278 else if (_M_curr() == _M_last
) [[unlikely
]]
280 else if (__u
<= 0xDF) // 0xC2 to 0xDF
285 if (__u
< __lo_bound
|| __u
> __hi_bound
) [[unlikely
]]
289 __c
= (__c
<< 6) | (__u
& 0x3F);
293 else if (__u
<= 0xEF) // 0xE0 to 0xEF
295 const uint8_t __lo_bound_2
= __u
== 0xE0 ? 0xA0 : __lo_bound
;
296 const uint8_t __hi_bound_2
= __u
== 0xED ? 0x9F : __hi_bound
;
301 if (__u
< __lo_bound_2
|| __u
> __hi_bound_2
) [[unlikely
]]
303 else if (__incr() == _M_last
) [[unlikely
]]
307 __c
= (__c
<< 6) | (__u
& 0x3F);
310 if (__u
< __lo_bound
|| __u
> __hi_bound
) [[unlikely
]]
314 __c
= (__c
<< 6) | (__u
& 0x3F);
319 else if (__u
<= 0xF4) // 0xF0 to 0xF4
321 const uint8_t __lo_bound_2
= __u
== 0xF0 ? 0x90 : __lo_bound
;
322 const uint8_t __hi_bound_2
= __u
== 0xF4 ? 0x8F : __hi_bound
;
327 if (__u
< __lo_bound_2
|| __u
> __hi_bound_2
) [[unlikely
]]
329 else if (__incr() == _M_last
) [[unlikely
]]
333 __c
= (__c
<< 6) | (__u
& 0x3F);
336 if (__u
< __lo_bound
|| __u
> __hi_bound
) [[unlikely
]]
338 else if (__incr() == _M_last
) [[unlikely
]]
342 __c
= (__c
<< 6) | (__u
& 0x3F);
345 if (__u
< __lo_bound
|| __u
> __hi_bound
) [[unlikely
]]
349 __c
= (__c
<< 6) | (__u
& 0x3F);
358 _M_update(__c
, __to_incr
);
364 _Guard
<_Iter
> __g
{this, _M_curr()};
366 uint16_t __u
= *_M_curr()++;
367 uint8_t __to_incr
= 1;
369 if (__u
< 0xD800 || __u
> 0xDFFF) [[likely
]]
371 else if (__u
< 0xDC00 && _M_curr() != _M_last
)
373 uint16_t __u2
= *_M_curr();
374 if (__u2
< 0xDC00 || __u2
> 0xDFFF) [[unlikely
]]
380 uint32_t __x
= (__u
& 0x3F) << 10 | __u2
& 0x3FF;
381 uint32_t __w
= (__u
>> 6) & 0x1F;
382 __c
= (__w
+ 1) << 16 | __x
;
388 _M_update(__c
, __to_incr
);
394 _Guard
<_Iter
> __g
{this, _M_curr()};
395 char32_t __c
= *_M_curr()++;
396 if (!__is_scalar_value(__c
)) [[unlikely
]]
401 // Encode the code point __c as one or more code units in _M_buf.
403 _M_update(char32_t __c
, uint8_t __to_incr
)
405 _M_to_increment
= __to_incr
;
407 if constexpr (sizeof(_ToFmt
) == sizeof(uint32_t))
412 else if constexpr (sizeof(_ToFmt
) == sizeof(uint16_t))
414 if (__is_single_code_unit
<_ToFmt
>(__c
))
422 // From http://www.unicode.org/faq/utf_bom.html#utf16-4
423 const char32_t __lead_offset
= 0xD800 - (0x10000 >> 10);
424 char16_t __lead
= __lead_offset
+ (__c
>> 10);
425 char16_t __trail
= 0xDC00 + (__c
& 0x3FF);
433 static_assert(sizeof(_ToFmt
) == 1);
434 int __bits
= std::bit_width((uint32_t)__c
);
435 if (__bits
<= 7) [[likely
]]
438 _M_buf
[1] = _M_buf
[2] = _M_buf
[3] = 0;
441 else if (__bits
<= 11)
443 _M_buf
[0] = 0xC0 | (__c
>> 6);
444 _M_buf
[1] = 0x80 | (__c
& 0x3F);
445 _M_buf
[2] = _M_buf
[3] = 0;
448 else if (__bits
<= 16)
450 _M_buf
[0] = 0xE0 | (__c
>> 12);
451 _M_buf
[1] = 0x80 | ((__c
>> 6) & 0x3F);
452 _M_buf
[2] = 0x80 | (__c
& 0x3F);
458 _M_buf
[0] = 0xF0 | ((__c
>> 18) & 0x07);
459 _M_buf
[1] = 0x80 | ((__c
>> 12) & 0x3F);
460 _M_buf
[2] = 0x80 | ((__c
>> 6) & 0x3F);
461 _M_buf
[3] = 0x80 | (__c
& 0x3F);
470 char32_t __c
= _ErrorHandler()();
471 __glibcxx_assert(__is_scalar_value(__c
));
476 _M_first() const requires bidirectional_iterator
<_Iter
>
477 { return _M_first_and_curr
._M_first
; }
480 _M_curr() { return _M_first_and_curr
._M_curr
; }
483 _M_curr() const { return _M_first_and_curr
._M_curr
; }
485 array
<value_type
, 4 / sizeof(_ToFmt
)> _M_buf
;
487 template<typename _It
>
488 struct _First_and_curr
490 _First_and_curr() = default;
493 _First_and_curr(_It __curr
) : _M_curr(__curr
) { }
495 template<convertible_to
<_It
> _It2
>
497 _First_and_curr(const _First_and_curr
<_It2
>& __other
)
498 : _M_curr(__other
._M_curr
) { }
503 template<typename _It
> requires bidirectional_iterator
<_It
>
504 struct _First_and_curr
<_It
>
506 _First_and_curr() = default;
509 _First_and_curr(_It __first
, _It __curr
)
510 : _M_first(__first
), _M_curr(__curr
) { }
512 template<convertible_to
<_It
> _It2
>
514 _First_and_curr(const _First_and_curr
<_It2
>& __other
)
515 : _M_first(__other
._M_first
), _M_curr(__other
._M_curr
) { }
521 _First_and_curr
<_Iter
> _M_first_and_curr
;
523 uint8_t _M_buf_index
= 0;
524 uint8_t _M_buf_last
= 0;
525 uint8_t _M_to_increment
= 0;
527 [[no_unique_address
]] _Sent _M_last
;
529 template<typename _FromFmt2
, typename _ToFmt2
,
530 input_iterator _Iter2
, sentinel_for
<_Iter2
> _Sent2
,
531 typename _ErrHandler
>
532 requires convertible_to
<iter_value_t
<_Iter2
>, _FromFmt2
>
533 friend class _Utf_iterator
;
536 template<typename _ToFormat
, ranges::input_range _Range
>
538 : public ranges::view_interface
<_Utf_view
<_ToFormat
, _Range
>>
540 using _Iterator
= _Utf_iterator
<ranges::range_value_t
<_Range
>,
541 _ToFormat
, ranges::iterator_t
<_Range
>,
542 ranges::sentinel_t
<_Range
>>;
544 template<typename _Iter
, typename _Sent
>
546 _M_begin(_Iter __first
, _Sent __last
)
548 if constexpr (bidirectional_iterator
<_Iter
>)
549 return _Iterator(__first
, __first
, __last
);
551 return _Iterator(__first
, __last
);
554 template<typename _Iter
, typename _Sent
>
556 _M_end(_Iter __first
, _Sent __last
)
558 if constexpr (!is_same_v
<_Iter
, _Sent
>)
560 else if constexpr (bidirectional_iterator
<_Iter
>)
561 return _Iterator(__first
, __last
, __last
);
563 return _Iterator(__last
, __last
);
570 _Utf_view(_Range
&& __r
) : _M_base(std::forward
<_Range
>(__r
)) { }
572 constexpr auto begin()
573 { return _M_begin(ranges::begin(_M_base
), ranges::end(_M_base
)); }
576 { return _M_end(ranges::begin(_M_base
), ranges::end(_M_base
)); }
578 constexpr bool empty() const { return ranges::empty(_M_base
); }
582 template<typename _View
>
583 using _Utf8_view
= _Utf_view
<char8_t
, _View
>;
585 template<typename _View
>
586 using _Utf8_view
= _Utf_view
<char, _View
>;
588 template<typename _View
>
589 using _Utf16_view
= _Utf_view
<char16_t
, _View
>;
590 template<typename _View
>
591 using _Utf32_view
= _Utf_view
<char32_t
, _View
>;
593 inline namespace __v15_1_0
595 #define _GLIBCXX_GET_UNICODE_DATA 150100
596 #include "unicode-data.h"
597 #ifdef _GLIBCXX_GET_UNICODE_DATA
598 # error "Invalid unicode data"
601 // The field width of a code point.
603 __field_width(char32_t __c
) noexcept
605 if (__c
< __width_edges
[0]) [[likely
]]
608 auto* __p
= std::upper_bound(__width_edges
, std::end(__width_edges
), __c
);
609 return (__p
- __width_edges
) % 2 + 1;
612 // @pre c <= 0x10FFFF
613 constexpr _Gcb_property
614 __grapheme_cluster_break_property(char32_t __c
) noexcept
616 constexpr uint32_t __mask
= (1 << __gcb_shift_bits
) - 1;
617 auto* __end
= std::end(__gcb_edges
);
618 auto* __p
= std::lower_bound(__gcb_edges
, __end
,
619 (__c
<< __gcb_shift_bits
) | __mask
);
620 return _Gcb_property(__p
[-1] & __mask
);
624 __is_incb_linker(char32_t __c
) noexcept
626 const auto __end
= std::end(__incb_linkers
);
627 // Array is small enough that linear search is faster than binary search.
628 return std::find(__incb_linkers
, __end
, __c
) != __end
;
631 // @pre c <= 0x10FFFF
633 __incb_property(char32_t __c
) noexcept
635 if ((__c
<< 2) < __incb_edges
[0]) [[likely
]]
638 constexpr uint32_t __mask
= 0x3;
639 auto* __end
= std::end(__incb_edges
);
640 auto* __p
= std::lower_bound(__incb_edges
, __end
, (__c
<< 2) | __mask
);
641 return _InCB(__p
[-1] & __mask
);
645 __is_extended_pictographic(char32_t __c
)
647 if (__c
< __xpicto_edges
[0]) [[likely
]]
650 auto* __p
= std::upper_bound(__xpicto_edges
, std::end(__xpicto_edges
), __c
);
651 return (__p
- __xpicto_edges
) % 2;
654 struct _Grapheme_cluster_iterator_base
656 char32_t _M_c
; // First code point in the cluster.
657 _Gcb_property _M_prop
; // GCB property of _M_c.
658 enum class _XPicto
: unsigned char { _Init
, _Zwj
, _Matched
, _Failed
};
659 _XPicto _M_xpicto_seq_state
= _XPicto::_Init
;
660 unsigned char _M_RI_count
= 0;
661 bool _M_incb_linker_seen
= false;
664 _M_reset(char32_t __c
, _Gcb_property __p
)
668 _M_xpicto_seq_state
= _XPicto::_Init
;
670 _M_incb_linker_seen
= false;
674 _M_update_xpicto_seq_state(char32_t __c
, _Gcb_property __p
)
676 if (_M_xpicto_seq_state
== _XPicto::_Failed
)
679 auto __next_state
= _XPicto::_Failed
;
680 if (_M_xpicto_seq_state
!= _XPicto::_Zwj
) // i.e. Init or Matched
682 if (__p
== _Gcb_property::_Gcb_ZWJ
)
684 if (_M_xpicto_seq_state
== _XPicto::_Matched
)
685 __next_state
= _XPicto::_Zwj
;
686 // We check _M_c here so that we do the lookup at most once,
687 // and only for clusters containing at least one ZWJ.
688 else if (__is_extended_pictographic(_M_c
))
689 __next_state
= _XPicto::_Zwj
;
691 else if (__p
== _Gcb_property::_Gcb_Extend
)
692 __next_state
= _M_xpicto_seq_state
; // no change
696 // This assumes that all \p{Extended_Pictographic} emoji have
697 // Grapheme_Cluster_Break=Other.
698 if (__p
== _Gcb_property::_Gcb_Other
699 && __is_extended_pictographic(__c
))
700 __next_state
= _XPicto::_Matched
;
702 _M_xpicto_seq_state
= __next_state
;
706 _M_update_ri_count(_Gcb_property __p
)
708 if (__p
== _Gcb_property::_Gcb_Regional_Indicator
)
715 _M_update_incb_state(char32_t __c
, _Gcb_property
)
717 if (__is_incb_linker(__c
))
718 _M_incb_linker_seen
= true;
722 // Split a range into extended grapheme clusters.
723 template<ranges::forward_range _View
> requires
ranges::view
<_View
>
724 class _Grapheme_cluster_view
725 : public ranges::view_interface
<_Grapheme_cluster_view
<_View
>>
730 _Grapheme_cluster_view(_View __v
)
731 : _M_begin(_Utf32_view
<_View
>(std::move(__v
)).begin())
734 constexpr auto begin() const { return _M_begin
; }
735 constexpr auto end() const { return _M_begin
.end(); }
738 struct _Iterator
: private _Grapheme_cluster_iterator_base
741 // Iterator over the underlying code points.
742 using _U32_iterator
= ranges::iterator_t
<_Utf32_view
<_View
>>;
745 // TODO: Change value_type to be subrange<_U32_iterator> instead?
746 // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
747 // That would be the whole cluster, not just the first code point.
748 // Would need to store two iterators and find end of current cluster
749 // on increment, so operator* returns value_type(_M_base, _M_next).
750 using value_type
= char32_t
;
751 using iterator_concept
= forward_iterator_tag
;
752 using difference_type
= ptrdiff_t;
755 _Iterator(_U32_iterator __i
)
758 if (__i
!= __i
.end())
761 _M_prop
= __grapheme_cluster_break_property(_M_c
);
765 // The first code point of the current extended grapheme cluster.
774 // Move to the next extended grapheme cluster.
778 const auto __end
= _M_base
.end();
779 if (_M_base
!= __end
)
781 auto __p_prev
= _M_prop
;
783 while (++__it
!= __end
)
785 char32_t __c
= *__it
;
786 auto __p
= __grapheme_cluster_break_property(*__it
);
787 _M_update_xpicto_seq_state(__c
, __p
);
788 _M_update_ri_count(__p
);
789 _M_update_incb_state(__c
, __p
);
790 if (_M_is_break(__p_prev
, __p
, __it
))
792 // Found a grapheme cluster break
812 operator==(const _Iterator
& __i
) const
813 { return _M_base
== __i
._M_base
; }
815 // This supports iter != iter.end()
817 operator==(const ranges::sentinel_t
<_View
>& __i
) const
818 { return _M_base
== __i
; }
820 // Iterator to the start of the current cluster.
821 constexpr auto base() const { return _M_base
.base(); }
823 // The end of the underlying view (not the end of the current cluster!)
824 constexpr auto end() const { return _M_base
.end(); }
826 // Field width of the first code point in the cluster.
828 width() const noexcept
829 { return __field_width(_M_c
); }
832 _U32_iterator _M_base
;
834 // Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
835 // http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
836 // This implements the rules from TR29 revision 43 in Unicode 15.1.0.
837 // Return true if there is a break between code point with property p1
838 // and code point with property p2.
840 _M_is_break(_Gcb_property __p1
, _Gcb_property __p2
,
841 _U32_iterator __curr
) const
843 using enum _Gcb_property
;
845 if (__p1
== _Gcb_Control
|| __p1
== _Gcb_LF
)
846 return true; // Break after Control or LF.
849 return __p2
!= _Gcb_LF
; // Do not break between a CR and LF.
852 if (__p2
== _Gcb_Control
|| __p2
== _Gcb_CR
|| __p2
== _Gcb_LF
)
853 return true; // Break before Control, CR or LF.
863 return false; // Do not break Hangul syllable sequences.
869 if (__p1
== _Gcb_LV
|| __p1
== _Gcb_V
)
874 return false; // Do not break Hangul syllable sequences.
880 if (__p1
== _Gcb_LVT
|| __p1
== _Gcb_T
)
881 return __p2
!= _Gcb_T
; // Do not break Hangul syllable sequences.
884 if (__p2
== _Gcb_Extend
|| __p2
== _Gcb_ZWJ
)
885 return false; // Do not break before extending characters or ZWJ.
887 // The following GB9x rules only apply to extended grapheme clusters,
888 // which is what the C++ standard uses (not legacy grapheme clusters).
891 if (__p2
== _Gcb_SpacingMark
)
892 return false; // Do not break before SpacingMarks,
894 if (__p1
== _Gcb_Prepend
)
895 return false; // or after Prepend characters.
897 // Rule GB9c (Unicode 15.1.0)
898 // Do not break within certain combinations with
899 // Indic_Conjunct_Break (InCB)=Linker.
900 if (_M_incb_linker_seen
901 && __incb_property(_M_c
) == _InCB::_Consonant
902 && __incb_property(*__curr
) == _InCB::_Consonant
)
904 // Match [_M_base, __curr] against regular expression
905 // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
906 bool __have_linker
= false;
908 while (++__it
!= __curr
)
910 if (__is_incb_linker(*__it
))
911 __have_linker
= true;
914 auto __incb
= __incb_property(*__it
);
915 if (__incb
== _InCB::_Consonant
)
916 __have_linker
= false;
917 else if (__incb
!= _InCB::_Extend
)
921 if (__it
== __curr
&& __have_linker
)
926 // Do not break within emoji modifier sequences
927 // or emoji zwj sequences.
928 if (__p1
== _Gcb_ZWJ
&& _M_xpicto_seq_state
== _XPicto::_Matched
)
931 // Rules GB12 and GB13
932 // Do not break within emoji flag sequences. That is, do not break
933 // between regional indicator (RI) symbols if there is an odd number
934 // of RI characters before the break point.
935 if (__p1
== _Gcb_property::_Gcb_Regional_Indicator
&& __p1
== __p2
)
936 return (_M_RI_count
& 1) == 0;
939 return true; // Otherwise, break everywhere.
946 } // namespace __v15_1_0
948 // Return the field width of a string.
949 template<typename _CharT
>
951 __field_width(basic_string_view
<_CharT
> __s
)
953 if (__s
.empty()) [[unlikely
]]
955 _Grapheme_cluster_view
<basic_string_view
<_CharT
>> __gc(__s
);
956 auto __it
= __gc
.begin();
957 const auto __end
= __gc
.end();
958 size_t __n
= __it
.width();
959 while (++__it
!= __end
)
964 // Truncate a string to at most `__max` field width units, and return the
965 // resulting field width.
966 template<typename _CharT
>
968 __truncate(basic_string_view
<_CharT
>& __s
, size_t __max
)
970 if (__s
.empty()) [[unlikely
]]
973 _Grapheme_cluster_view
<basic_string_view
<_CharT
>> __gc(__s
);
974 auto __it
= __gc
.begin();
975 const auto __end
= __gc
.end();
976 size_t __n
= __it
.width();
982 while (++__it
!= __end
)
984 size_t __n2
= __n
+ __it
.width();
987 __s
= basic_string_view
<_CharT
>(__s
.begin(), __it
.base());
995 template<typename _CharT
>
997 __literal_encoding_is_unicode()
999 if constexpr (is_same_v
<_CharT
, char16_t
>)
1001 else if constexpr (is_same_v
<_CharT
, char32_t
>)
1003 #ifdef __cpp_char8_t
1004 else if constexpr (is_same_v
<_CharT
, char8_t
>)
1008 const char* __enc
= "";
1010 #ifdef __GNUC_EXECUTION_CHARSET_NAME
1011 auto __remove_iso10646_prefix
= [](const char* __s
) {
1012 // GNU iconv allows "ISO-10646/" prefix (case-insensitive).
1013 if (__s
[0] == 'I' || __s
[0] == 'i')
1014 if (__s
[1] == 'S' || __s
[1] == 's')
1015 if (__s
[2] == 'O' || __s
[2] == 'o')
1016 if (string_view(__s
+ 3).starts_with("-10646/"))
1021 if constexpr (is_same_v
<_CharT
, char>)
1022 __enc
= __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME
);
1023 # if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
1025 __enc
= __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME
);
1028 if ((__enc
[0] == 'U' || __enc
[0] == 'u')
1029 && (__enc
[1] == 'T' || __enc
[1] == 't')
1030 && (__enc
[2] == 'F' || __enc
[2] == 'f'))
1033 if (__enc
[0] == '-')
1035 if (__enc
[0] == '8')
1036 return __enc
[1] == '\0' || string_view(__enc
+ 1) == "//";
1037 else if constexpr (!is_same_v
<_CharT
, char>)
1039 string_view
__s(__enc
);
1040 if (__s
.ends_with("//"))
1041 __s
.remove_suffix(2);
1042 return __s
== "16" || __s
== "32";
1045 #elif defined __clang_literal_encoding__
1046 if constexpr (is_same_v
<_CharT
, char>)
1047 __enc
= __clang_literal_encoding__
;
1048 # if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
1050 __enc
= __clang_wide_literal_encoding__
;
1052 // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
1053 string_view
__s(__enc
);
1056 else if constexpr (!is_same_v
<_CharT
, char>)
1057 return __s
== "UTF-16" || __s
== "UTF-32";
1064 __literal_encoding_is_utf8()
1065 { return __literal_encoding_is_unicode
<char>(); }
1068 __literal_encoding_is_extended_ascii()
1070 return '0' == 0x30 && 'A' == 0x41 && 'Z' == 0x5a
1071 && 'a' == 0x61 && 'z' == 0x7a;
1074 // https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
1076 __charset_alias_match(string_view __a
, string_view __b
)
1078 // Map alphanumeric chars to their base 36 value, everything else to 127.
1079 auto __map
= [](char __c
, bool& __num
) -> unsigned char {
1080 if (__c
== '0') [[unlikely
]]
1081 return __num
? 0 : 127;
1082 const auto __v
= __detail::__from_chars_alnum_to_val(__c
);
1087 auto __ptr_a
= __a
.begin(), __end_a
= __a
.end();
1088 auto __ptr_b
= __b
.begin(), __end_b
= __b
.end();
1089 bool __num_a
= false, __num_b
= false;
1093 // Find the value of the next alphanumeric character in each string.
1094 unsigned char __val_a
{}, __val_b
{};
1095 while (__ptr_a
!= __end_a
1096 && (__val_a
= __map(*__ptr_a
, __num_a
)) == 127)
1098 while (__ptr_b
!= __end_b
1099 && (__val_b
= __map(*__ptr_b
, __num_b
)) == 127)
1101 // Stop when we reach the end of a string, or get a mismatch.
1102 if (__ptr_a
== __end_a
)
1103 return __ptr_b
== __end_b
;
1104 else if (__ptr_b
== __end_b
)
1106 else if (__val_a
!= __val_b
)
1107 return false; // Found non-matching characters.
1114 } // namespace __unicode
1118 template<typename _To
, typename _Range
>
1119 inline constexpr bool
1120 enable_borrowed_range
<std::__unicode::_Utf_view
<_To
, _Range
>>
1121 = enable_borrowed_range
<_Range
>;
1123 template<typename _Range
>
1124 inline constexpr bool
1125 enable_borrowed_range
<std::__unicode::_Grapheme_cluster_view
<_Range
>>
1126 = enable_borrowed_range
<_Range
>;
1127 } // namespace ranges
1129 _GLIBCXX_END_NAMESPACE_VERSION
1132 #endif // _GLIBCXX_UNICODE_H