1 // Unicode utilities -*- C++ -*-
3 // Copyright The GNU Toolchain Authors.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
/** @file include/bits/unicode.h
 *  This is an internal header file, included by other library headers.
 *  Do not attempt to use it directly. @headername{format}
 */
30 #ifndef _GLIBCXX_UNICODE_H
31 #define _GLIBCXX_UNICODE_H 1
33 #if __cplusplus >= 202002L
35 #include <bit> // bit_width
36 #include <charconv> // __detail::__from_chars_alnum_to_val_table
38 #include <bits/stl_algo.h>
39 #include <bits/stl_iterator.h>
40 #include <bits/ranges_base.h>
42 namespace std
_GLIBCXX_VISIBILITY(default)
44 _GLIBCXX_BEGIN_NAMESPACE_VERSION
47 // A Unicode code point that is not a high or low surrogate.
49 __is_scalar_value(char32_t __c
)
51 if (__c
< 0xD800) [[likely
]]
53 return 0xDFFF < __c
&& __c
<= 0x10FFFF;
56 // A code point that can be encoded in a single code unit of type _CharT.
57 template<typename _CharT
>
59 __is_single_code_unit(char32_t __c
)
61 if constexpr (__gnu_cxx::__int_traits
<_CharT
>::__max
<= 0xFF)
62 return __c
< 0x7F; // ASCII character
64 return __c
< __gnu_cxx::__int_traits
<_CharT
>::__max
65 && __is_scalar_value(__c
);
68 // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
73 operator()() const noexcept
77 struct _Null_sentinel_t
79 template<input_iterator _It
>
80 requires default_initializable
<iter_value_t
<_It
>>
81 && equality_comparable_with
<iter_reference_t
<_It
>, iter_value_t
<_It
>>
83 operator==(_It __it
, _Null_sentinel_t
)
84 { return *__it
== iter_value_t
<_It
>{}; }
87 template<typename _FromFmt
, typename _ToFmt
,
88 input_iterator _Iter
, sentinel_for
<_Iter
> _Sent
= _Iter
,
89 typename _ErrorHandler
= _Repl
>
90 requires convertible_to
<iter_value_t
<_Iter
>, _FromFmt
>
93 static_assert(forward_iterator
<_Iter
> || noexcept(_ErrorHandler()()));
96 using value_type
= _ToFmt
;
97 using difference_type
= iter_difference_t
<_Iter
>;
98 using reference
= value_type
;
99 using iterator_concept
100 = std::__detail::__clamp_iter_cat
<__iter_category_t
<_Iter
>,
101 bidirectional_iterator_tag
>;
103 constexpr _Utf_iterator() = default;
106 _Utf_iterator(_Iter __first
, _Iter __it
, _Sent __last
)
107 requires bidirectional_iterator
<_Iter
>
108 : _M_first_and_curr
{__first
, __it
}, _M_last(__last
)
110 if (_M_curr() != _M_last
)
117 _Utf_iterator(_Iter __it
, _Sent __last
)
118 requires (!bidirectional_iterator
<_Iter
>)
119 : _M_first_and_curr
{__it
}, _M_last(__last
)
121 if (_M_curr() != _M_last
)
127 template<class _Iter2
, class _Sent2
>
128 requires convertible_to
<_Iter2
, _Iter
> && convertible_to
<_Sent2
, _Sent
>
130 _Utf_iterator(const _Utf_iterator
<_FromFmt
, _ToFmt
, _Iter2
, _Sent2
,
131 _ErrorHandler
>& __other
)
132 : _M_buf(__other
._M_buf
), _M_first_and_curr(__other
._M_first_and_curr
),
133 _M_buf_index(__other
._M_buf_index
), _M_buf_last(__other
._M_buf_last
),
134 _M_last(__other
._M_last
)
139 begin() const requires bidirectional_iterator
<_Iter
>
140 { return _M_first(); }
144 end() const { return _M_last
; }
148 base() const requires forward_iterator
<_Iter
>
149 { return _M_curr(); }
153 operator*() const { return _M_buf
[_M_buf_index
]; }
155 constexpr _Utf_iterator
&
158 if (_M_buf_index
+ 1 == _M_buf_last
&& _M_curr() != _M_last
)
160 if constexpr (forward_iterator
<_Iter
>)
161 std::advance(_M_curr(), _M_to_increment
);
162 if (_M_curr() == _M_last
)
167 else if (_M_buf_index
+ 1 < _M_buf_last
)
172 constexpr _Utf_iterator
180 constexpr _Utf_iterator
&
181 operator--() requires bidirectional_iterator
<_Iter
>
183 if (!_M_buf_index
&& _M_curr() != _M_first())
185 else if (_M_buf_index
)
190 constexpr _Utf_iterator
199 friend constexpr bool
200 operator==(_Utf_iterator __lhs
, _Utf_iterator __rhs
)
201 requires forward_iterator
<_Iter
> || requires (_Iter __i
) { __i
!= __i
; }
203 if constexpr (forward_iterator
<_Iter
>)
204 return __lhs
._M_curr() == __rhs
._M_curr()
205 && __lhs
._M_buf_index
== __rhs
._M_buf_index
;
206 else if (__lhs
._M_curr() != __rhs
._M_curr())
208 else if (__lhs
._M_buf_index
== __rhs
._M_buf_index
209 && __lhs
._M_buf_last
== __rhs
._M_buf_last
)
212 return __lhs
._M_buf_index
== __lhs
._M_buf_last
213 && __rhs
._M_buf_index
== __rhs
._M_buf_last
;
217 friend constexpr bool
218 operator==(_Utf_iterator __lhs
, _Sent __rhs
)
220 if constexpr (forward_iterator
<_Iter
>)
221 return __lhs
._M_curr() == __rhs
;
223 return __lhs
._M_curr() == __rhs
224 && __lhs
._M_buf_index
== __lhs
._M_buf_last
;
231 if constexpr (sizeof(_FromFmt
) == sizeof(uint8_t))
233 else if constexpr (sizeof(_FromFmt
) == sizeof(uint16_t))
237 static_assert(sizeof(_FromFmt
) == sizeof(uint32_t));
243 _M_read_reverse(); // TODO
248 _Guard(void*, _Iter
&) { }
251 template<typename _It
> requires forward_iterator
<_It
>
254 constexpr ~_Guard() { _M_this
->_M_curr() = std::move(_M_orig
); }
255 _Utf_iterator
* _M_this
;
262 _Guard
<_Iter
> __g
{this, _M_curr()};
264 const uint8_t __lo_bound
= 0x80, __hi_bound
= 0xBF;
265 uint8_t __u
= *_M_curr()++;
266 uint8_t __to_incr
= 1;
267 auto __incr
= [&, this] {
272 if (__u
<= 0x7F) [[likely
]] // 0x00 to 0x7F
274 else if (__u
< 0xC2) [[unlikely
]]
276 else if (_M_curr() == _M_last
) [[unlikely
]]
278 else if (__u
<= 0xDF) // 0xC2 to 0xDF
283 if (__u
< __lo_bound
|| __u
> __hi_bound
) [[unlikely
]]
287 __c
= (__c
<< 6) | (__u
& 0x3F);
291 else if (__u
<= 0xEF) // 0xE0 to 0xEF
293 const uint8_t __lo_bound_2
= __u
== 0xE0 ? 0xA0 : __lo_bound
;
294 const uint8_t __hi_bound_2
= __u
== 0xED ? 0x9F : __hi_bound
;
299 if (__u
< __lo_bound_2
|| __u
> __hi_bound_2
) [[unlikely
]]
301 else if (__incr() == _M_last
) [[unlikely
]]
305 __c
= (__c
<< 6) | (__u
& 0x3F);
308 if (__u
< __lo_bound
|| __u
> __hi_bound
) [[unlikely
]]
312 __c
= (__c
<< 6) | (__u
& 0x3F);
317 else if (__u
<= 0xF4) // 0xF0 to 0xF4
319 const uint8_t __lo_bound_2
= __u
== 0xF0 ? 0x90 : __lo_bound
;
320 const uint8_t __hi_bound_2
= __u
== 0xF4 ? 0x8F : __hi_bound
;
325 if (__u
< __lo_bound_2
|| __u
> __hi_bound_2
) [[unlikely
]]
327 else if (__incr() == _M_last
) [[unlikely
]]
331 __c
= (__c
<< 6) | (__u
& 0x3F);
334 if (__u
< __lo_bound
|| __u
> __hi_bound
) [[unlikely
]]
336 else if (__incr() == _M_last
) [[unlikely
]]
340 __c
= (__c
<< 6) | (__u
& 0x3F);
343 if (__u
< __lo_bound
|| __u
> __hi_bound
) [[unlikely
]]
347 __c
= (__c
<< 6) | (__u
& 0x3F);
356 _M_update(__c
, __to_incr
);
362 _Guard
<_Iter
> __g
{this, _M_curr()};
364 uint16_t __u
= *_M_curr()++;
365 uint8_t __to_incr
= 1;
367 if (__u
< 0xD800 || __u
> 0xDFFF) [[likely
]]
369 else if (__u
< 0xDC00 && _M_curr() != _M_last
)
371 uint16_t __u2
= *_M_curr();
372 if (__u2
< 0xDC00 || __u2
> 0xDFFF) [[unlikely
]]
378 uint32_t __x
= (__u
& 0x3F) << 10 | __u2
& 0x3FF;
379 uint32_t __w
= (__u
>> 6) & 0x1F;
380 __c
= (__w
+ 1) << 16 | __x
;
386 _M_update(__c
, __to_incr
);
392 _Guard
<_Iter
> __g
{this, _M_curr()};
393 char32_t __c
= *_M_curr()++;
394 if (!__is_scalar_value(__c
)) [[unlikely
]]
399 // Encode the code point __c as one or more code units in _M_buf.
401 _M_update(char32_t __c
, uint8_t __to_incr
)
403 _M_to_increment
= __to_incr
;
405 if constexpr (sizeof(_ToFmt
) == sizeof(uint32_t))
410 else if constexpr (sizeof(_ToFmt
) == sizeof(uint16_t))
412 if (__is_single_code_unit
<_ToFmt
>(__c
))
420 // From http://www.unicode.org/faq/utf_bom.html#utf16-4
421 const char32_t __lead_offset
= 0xD800 - (0x10000 >> 10);
422 char16_t __lead
= __lead_offset
+ (__c
>> 10);
423 char16_t __trail
= 0xDC00 + (__c
& 0x3FF);
431 static_assert(sizeof(_ToFmt
) == 1);
432 int __bits
= std::bit_width((uint32_t)__c
);
433 if (__bits
<= 7) [[likely
]]
436 _M_buf
[1] = _M_buf
[2] = _M_buf
[3] = 0;
439 else if (__bits
<= 11)
441 _M_buf
[0] = 0xC0 | (__c
>> 6);
442 _M_buf
[1] = 0x80 | (__c
& 0x3F);
443 _M_buf
[2] = _M_buf
[3] = 0;
446 else if (__bits
<= 16)
448 _M_buf
[0] = 0xE0 | (__c
>> 12);
449 _M_buf
[1] = 0x80 | ((__c
>> 6) & 0x3F);
450 _M_buf
[2] = 0x80 | (__c
& 0x3F);
456 _M_buf
[0] = 0xF0 | ((__c
>> 18) & 0x07);
457 _M_buf
[1] = 0x80 | ((__c
>> 12) & 0x3F);
458 _M_buf
[2] = 0x80 | ((__c
>> 6) & 0x3F);
459 _M_buf
[3] = 0x80 | (__c
& 0x3F);
468 char32_t __c
= _ErrorHandler()();
469 __glibcxx_assert(__is_scalar_value(__c
));
474 _M_first() const requires bidirectional_iterator
<_Iter
>
475 { return _M_first_and_curr
._M_first
; }
478 _M_curr() { return _M_first_and_curr
._M_curr
; }
481 _M_curr() const { return _M_first_and_curr
._M_curr
; }
483 array
<value_type
, 4 / sizeof(_ToFmt
)> _M_buf
;
485 template<typename _It
>
486 struct _First_and_curr
488 _First_and_curr() = default;
491 _First_and_curr(_It __curr
) : _M_curr(__curr
) { }
493 template<convertible_to
<_It
> _It2
>
495 _First_and_curr(const _First_and_curr
<_It2
>& __other
)
496 : _M_curr(__other
._M_curr
) { }
501 template<typename _It
> requires bidirectional_iterator
<_It
>
502 struct _First_and_curr
<_It
>
504 _First_and_curr() = default;
507 _First_and_curr(_It __first
, _It __curr
)
508 : _M_first(__first
), _M_curr(__curr
) { }
510 template<convertible_to
<_It
> _It2
>
512 _First_and_curr(const _First_and_curr
<_It2
>& __other
)
513 : _M_first(__other
._M_first
), _M_curr(__other
._M_curr
) { }
519 _First_and_curr
<_Iter
> _M_first_and_curr
;
521 uint8_t _M_buf_index
= 0;
522 uint8_t _M_buf_last
= 0;
523 uint8_t _M_to_increment
= 0;
525 [[no_unique_address
]] _Sent _M_last
;
527 template<typename _FromFmt2
, typename _ToFmt2
,
528 input_iterator _Iter2
, sentinel_for
<_Iter2
> _Sent2
,
529 typename _ErrHandler
>
530 requires convertible_to
<iter_value_t
<_Iter2
>, _FromFmt2
>
531 friend class _Utf_iterator
;
534 template<typename _ToFormat
, ranges::input_range _Range
>
536 : public ranges::view_interface
<_Utf_view
<_ToFormat
, _Range
>>
538 using _Iterator
= _Utf_iterator
<ranges::range_value_t
<_Range
>,
539 _ToFormat
, ranges::iterator_t
<_Range
>,
540 ranges::sentinel_t
<_Range
>>;
542 template<typename _Iter
, typename _Sent
>
544 _M_begin(_Iter __first
, _Sent __last
)
546 if constexpr (bidirectional_iterator
<_Iter
>)
547 return _Iterator(__first
, __first
, __last
);
549 return _Iterator(__first
, __last
);
552 template<typename _Iter
, typename _Sent
>
554 _M_end(_Iter __first
, _Sent __last
)
556 if constexpr (!is_same_v
<_Iter
, _Sent
>)
558 else if constexpr (bidirectional_iterator
<_Iter
>)
559 return _Iterator(__first
, __last
, __last
);
561 return _Iterator(__last
, __last
);
568 _Utf_view(_Range
&& __r
) : _M_base(std::forward
<_Range
>(__r
)) { }
570 constexpr auto begin()
571 { return _M_begin(ranges::begin(_M_base
), ranges::end(_M_base
)); }
574 { return _M_end(ranges::begin(_M_base
), ranges::end(_M_base
)); }
576 constexpr bool empty() const { return ranges::empty(_M_base
); }
580 template<typename _View
>
581 using _Utf8_view
= _Utf_view
<char8_t
, _View
>;
583 template<typename _View
>
584 using _Utf8_view
= _Utf_view
<char, _View
>;
586 template<typename _View
>
587 using _Utf16_view
= _Utf_view
<char16_t
, _View
>;
588 template<typename _View
>
589 using _Utf32_view
= _Utf_view
<char32_t
, _View
>;
591 inline namespace __v15_1_0
593 #define _GLIBCXX_GET_UNICODE_DATA 150100
594 #include "unicode-data.h"
595 #ifdef _GLIBCXX_GET_UNICODE_DATA
596 # error "Invalid unicode data"
599 // The field width of a code point.
601 __field_width(char32_t __c
) noexcept
603 if (__c
< __width_edges
[0]) [[likely
]]
606 auto* __p
= std::upper_bound(__width_edges
, std::end(__width_edges
), __c
);
607 return (__p
- __width_edges
) % 2 + 1;
610 // @pre c <= 0x10FFFF
611 constexpr _Gcb_property
612 __grapheme_cluster_break_property(char32_t __c
) noexcept
614 constexpr uint32_t __mask
= (1 << __gcb_shift_bits
) - 1;
615 auto* __end
= std::end(__gcb_edges
);
616 auto* __p
= std::lower_bound(__gcb_edges
, __end
,
617 (__c
<< __gcb_shift_bits
) | __mask
);
618 return _Gcb_property(__p
[-1] & __mask
);
622 __is_incb_linker(char32_t __c
) noexcept
624 const auto __end
= std::end(__incb_linkers
);
625 // Array is small enough that linear search is faster than binary search.
626 return std::find(__incb_linkers
, __end
, __c
) != __end
;
629 // @pre c <= 0x10FFFF
631 __incb_property(char32_t __c
) noexcept
633 if ((__c
<< 2) < __incb_edges
[0]) [[likely
]]
636 constexpr uint32_t __mask
= 0x3;
637 auto* __end
= std::end(__incb_edges
);
638 auto* __p
= std::lower_bound(__incb_edges
, __end
, (__c
<< 2) | __mask
);
639 return _InCB(__p
[-1] & __mask
);
643 __is_extended_pictographic(char32_t __c
)
645 if (__c
< __xpicto_edges
[0]) [[likely
]]
648 auto* __p
= std::upper_bound(__xpicto_edges
, std::end(__xpicto_edges
), __c
);
649 return (__p
- __xpicto_edges
) % 2;
652 struct _Grapheme_cluster_iterator_base
654 char32_t _M_c
; // First code point in the cluster.
655 _Gcb_property _M_prop
; // GCB property of _M_c.
656 enum class _XPicto
: unsigned char { _Init
, _Zwj
, _Matched
, _Failed
};
657 _XPicto _M_xpicto_seq_state
= _XPicto::_Init
;
658 unsigned char _M_RI_count
= 0;
659 bool _M_incb_linker_seen
= false;
662 _M_reset(char32_t __c
, _Gcb_property __p
)
666 _M_xpicto_seq_state
= _XPicto::_Init
;
668 _M_incb_linker_seen
= false;
672 _M_update_xpicto_seq_state(char32_t __c
, _Gcb_property __p
)
674 if (_M_xpicto_seq_state
== _XPicto::_Failed
)
677 auto __next_state
= _XPicto::_Failed
;
678 if (_M_xpicto_seq_state
!= _XPicto::_Zwj
) // i.e. Init or Matched
680 if (__p
== _Gcb_property::_Gcb_ZWJ
)
682 if (_M_xpicto_seq_state
== _XPicto::_Matched
)
683 __next_state
= _XPicto::_Zwj
;
684 // We check _M_c here so that we do the lookup at most once,
685 // and only for clusters containing at least one ZWJ.
686 else if (__is_extended_pictographic(_M_c
))
687 __next_state
= _XPicto::_Zwj
;
689 else if (__p
== _Gcb_property::_Gcb_Extend
)
690 __next_state
= _M_xpicto_seq_state
; // no change
694 // This assumes that all \p{Extended_Pictographic} emoji have
695 // Grapheme_Cluster_Break=Other.
696 if (__p
== _Gcb_property::_Gcb_Other
697 && __is_extended_pictographic(__c
))
698 __next_state
= _XPicto::_Matched
;
700 _M_xpicto_seq_state
= __next_state
;
704 _M_update_ri_count(_Gcb_property __p
)
706 if (__p
== _Gcb_property::_Gcb_Regional_Indicator
)
713 _M_update_incb_state(char32_t __c
, _Gcb_property
)
715 if (__is_incb_linker(__c
))
716 _M_incb_linker_seen
= true;
720 // Split a range into extended grapheme clusters.
721 template<ranges::forward_range _View
> requires
ranges::view
<_View
>
722 class _Grapheme_cluster_view
723 : public ranges::view_interface
<_Grapheme_cluster_view
<_View
>>
728 _Grapheme_cluster_view(_View __v
)
729 : _M_begin(_Utf32_view
<_View
>(std::move(__v
)).begin())
732 constexpr auto begin() const { return _M_begin
; }
733 constexpr auto end() const { return _M_begin
.end(); }
736 struct _Iterator
: private _Grapheme_cluster_iterator_base
739 // Iterator over the underlying code points.
740 using _U32_iterator
= ranges::iterator_t
<_Utf32_view
<_View
>>;
743 // TODO: Change value_type to be subrange<_U32_iterator> instead?
744 // Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
745 // That would be the whole cluster, not just the first code point.
746 // Would need to store two iterators and find end of current cluster
747 // on increment, so operator* returns value_type(_M_base, _M_next).
748 using value_type
= char32_t
;
749 using iterator_concept
= forward_iterator_tag
;
750 using difference_type
= ptrdiff_t;
753 _Iterator(_U32_iterator __i
)
756 if (__i
!= __i
.end())
759 _M_prop
= __grapheme_cluster_break_property(_M_c
);
763 // The first code point of the current extended grapheme cluster.
772 // Move to the next extended grapheme cluster.
776 const auto __end
= _M_base
.end();
777 if (_M_base
!= __end
)
779 auto __p_prev
= _M_prop
;
781 while (++__it
!= __end
)
783 char32_t __c
= *__it
;
784 auto __p
= __grapheme_cluster_break_property(*__it
);
785 _M_update_xpicto_seq_state(__c
, __p
);
786 _M_update_ri_count(__p
);
787 _M_update_incb_state(__c
, __p
);
788 if (_M_is_break(__p_prev
, __p
, __it
))
790 // Found a grapheme cluster break
810 operator==(const _Iterator
& __i
) const
811 { return _M_base
== __i
._M_base
; }
813 // This supports iter != iter.end()
815 operator==(const ranges::sentinel_t
<_View
>& __i
) const
816 { return _M_base
== __i
; }
818 // Iterator to the start of the current cluster.
819 constexpr auto base() const { return _M_base
.base(); }
821 // The end of the underlying view (not the end of the current cluster!)
822 constexpr auto end() const { return _M_base
.end(); }
824 // Field width of the first code point in the cluster.
826 width() const noexcept
827 { return __field_width(_M_c
); }
830 _U32_iterator _M_base
;
832 // Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
833 // http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
834 // This implements the rules from TR29 revision 43 in Unicode 15.1.0.
835 // Return true if there is a break between code point with property p1
836 // and code point with property p2.
838 _M_is_break(_Gcb_property __p1
, _Gcb_property __p2
,
839 _U32_iterator __curr
) const
841 using enum _Gcb_property
;
843 if (__p1
== _Gcb_Control
|| __p1
== _Gcb_LF
)
844 return true; // Break after Control or LF.
847 return __p2
!= _Gcb_LF
; // Do not break between a CR and LF.
850 if (__p2
== _Gcb_Control
|| __p2
== _Gcb_CR
|| __p2
== _Gcb_LF
)
851 return true; // Break before Control, CR or LF.
861 return false; // Do not break Hangul syllable sequences.
867 if (__p1
== _Gcb_LV
|| __p1
== _Gcb_V
)
872 return false; // Do not break Hangul syllable sequences.
878 if (__p1
== _Gcb_LVT
|| __p1
== _Gcb_T
)
879 return __p2
!= _Gcb_T
; // Do not break Hangul syllable sequences.
882 if (__p2
== _Gcb_Extend
|| __p2
== _Gcb_ZWJ
)
883 return false; // Do not break before extending characters or ZWJ.
885 // The following GB9x rules only apply to extended grapheme clusters,
886 // which is what the C++ standard uses (not legacy grapheme clusters).
889 if (__p2
== _Gcb_SpacingMark
)
890 return false; // Do not break before SpacingMarks,
892 if (__p1
== _Gcb_Prepend
)
893 return false; // or after Prepend characters.
895 // Rule GB9c (Unicode 15.1.0)
896 // Do not break within certain combinations with
897 // Indic_Conjunct_Break (InCB)=Linker.
898 if (_M_incb_linker_seen
899 && __incb_property(_M_c
) == _InCB::_Consonant
900 && __incb_property(*__curr
) == _InCB::_Consonant
)
902 // Match [_M_base, __curr] against regular expression
903 // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
904 bool __have_linker
= false;
906 while (++__it
!= __curr
)
908 if (__is_incb_linker(*__it
))
909 __have_linker
= true;
912 auto __incb
= __incb_property(*__it
);
913 if (__incb
== _InCB::_Consonant
)
914 __have_linker
= false;
915 else if (__incb
!= _InCB::_Extend
)
919 if (__it
== __curr
&& __have_linker
)
924 // Do not break within emoji modifier sequences
925 // or emoji zwj sequences.
926 if (__p1
== _Gcb_ZWJ
&& _M_xpicto_seq_state
== _XPicto::_Matched
)
929 // Rules GB12 and GB13
930 // Do not break within emoji flag sequences. That is, do not break
931 // between regional indicator (RI) symbols if there is an odd number
932 // of RI characters before the break point.
933 if (__p1
== _Gcb_property::_Gcb_Regional_Indicator
&& __p1
== __p2
)
934 return (_M_RI_count
& 1) == 0;
937 return true; // Otherwise, break everywhere.
944 } // namespace __v15_1_0
946 // Return the field width of a string.
947 template<typename _CharT
>
949 __field_width(basic_string_view
<_CharT
> __s
)
951 if (__s
.empty()) [[unlikely
]]
953 _Grapheme_cluster_view
<basic_string_view
<_CharT
>> __gc(__s
);
954 auto __it
= __gc
.begin();
955 const auto __end
= __gc
.end();
956 size_t __n
= __it
.width();
957 while (++__it
!= __end
)
962 // Truncate a string to at most `__max` field width units, and return the
963 // resulting field width.
964 template<typename _CharT
>
966 __truncate(basic_string_view
<_CharT
>& __s
, size_t __max
)
968 if (__s
.empty()) [[unlikely
]]
971 _Grapheme_cluster_view
<basic_string_view
<_CharT
>> __gc(__s
);
972 auto __it
= __gc
.begin();
973 const auto __end
= __gc
.end();
974 size_t __n
= __it
.width();
980 while (++__it
!= __end
)
982 size_t __n2
= __n
+ __it
.width();
985 __s
= basic_string_view
<_CharT
>(__s
.begin(), __it
.base());
993 template<typename _CharT
>
995 __literal_encoding_is_unicode()
997 if constexpr (is_same_v
<_CharT
, char16_t
>)
999 else if constexpr (is_same_v
<_CharT
, char32_t
>)
1001 #ifdef __cpp_char8_t
1002 else if constexpr (is_same_v
<_CharT
, char8_t
>)
1006 const char* __enc
= "";
1008 #ifdef __GNUC_EXECUTION_CHARSET_NAME
1009 auto __remove_iso10646_prefix
= [](const char* __s
) {
1010 // GNU iconv allows "ISO-10646/" prefix (case-insensitive).
1011 if (__s
[0] == 'I' || __s
[0] == 'i')
1012 if (__s
[1] == 'S' || __s
[1] == 's')
1013 if (__s
[2] == 'O' || __s
[2] == 'o')
1014 if (string_view(__s
+ 3).starts_with("-10646/"))
1019 if constexpr (is_same_v
<_CharT
, char>)
1020 __enc
= __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME
);
1021 # if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
1023 __enc
= __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME
);
1026 if ((__enc
[0] == 'U' || __enc
[0] == 'u')
1027 && (__enc
[1] == 'T' || __enc
[1] == 't')
1028 && (__enc
[2] == 'F' || __enc
[2] == 'f'))
1031 if (__enc
[0] == '-')
1033 if (__enc
[0] == '8')
1034 return __enc
[1] == '\0' || string_view(__enc
+ 1) == "//";
1035 else if constexpr (!is_same_v
<_CharT
, char>)
1037 string_view
__s(__enc
);
1038 if (__s
.ends_with("//"))
1039 __s
.remove_suffix(2);
1040 return __s
== "16" || __s
== "32";
1043 #elif defined __clang_literal_encoding__
1044 if constexpr (is_same_v
<_CharT
, char>)
1045 __enc
= __clang_literal_encoding__
;
1046 # if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
1048 __enc
= __clang_wide_literal_encoding__
;
1050 // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
1051 string_view
__s(__enc
);
1054 else if constexpr (!is_same_v
<_CharT
, char>)
1055 return __s
== "UTF-16" || __s
== "UTF-32";
1062 __literal_encoding_is_utf8()
1063 { return __literal_encoding_is_unicode
<char>(); }
1066 __literal_encoding_is_extended_ascii()
1068 return '0' == 0x30 && 'A' == 0x41 && 'Z' == 0x5a
1069 && 'a' == 0x61 && 'z' == 0x7a;
1072 // https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
1074 __charset_alias_match(string_view __a
, string_view __b
)
1076 // Map alphanumeric chars to their base 64 value, everything else to 127.
1077 auto __map
= [](char __c
, bool& __num
) -> unsigned char {
1078 if (__c
== '0') [[unlikely
]]
1079 return __num
? 0 : 127;
1080 const auto __v
= __detail::__from_chars_alnum_to_val(__c
);
1085 auto __ptr_a
= __a
.begin(), __end_a
= __a
.end();
1086 auto __ptr_b
= __b
.begin(), __end_b
= __b
.end();
1087 bool __num_a
= false, __num_b
= false;
1091 // Find the value of the next alphanumeric character in each string.
1092 unsigned char __val_a
{}, __val_b
{};
1093 while (__ptr_a
!= __end_a
1094 && (__val_a
= __map(*__ptr_a
, __num_a
)) == 127)
1096 while (__ptr_b
!= __end_b
1097 && (__val_b
= __map(*__ptr_b
, __num_b
)) == 127)
1099 // Stop when we reach the end of a string, or get a mismatch.
1100 if (__ptr_a
== __end_a
)
1101 return __ptr_b
== __end_b
;
1102 else if (__ptr_b
== __end_b
)
1104 else if (__val_a
!= __val_b
)
1105 return false; // Found non-matching characters.
1112 } // namespace __unicode
1116 template<typename _To
, typename _Range
>
1117 inline constexpr bool
1118 enable_borrowed_range
<std::__unicode::_Utf_view
<_To
, _Range
>>
1119 = enable_borrowed_range
<_Range
>;
1121 template<typename _Range
>
1122 inline constexpr bool
1123 enable_borrowed_range
<std::__unicode::_Grapheme_cluster_view
<_Range
>>
1124 = enable_borrowed_range
<_Range
>;
1125 } // namespace ranges
1127 _GLIBCXX_END_NAMESPACE_VERSION
1130 #endif // _GLIBCXX_UNICODE_H