// Locale support (codecvt) -*- C++ -*-

// Copyright (C) 2015-2024 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.
#include <codecvt>
#include <cstring>              // std::memcpy, std::memcmp
#include <bits/stl_algobase.h>  // std::min
29 namespace std
_GLIBCXX_VISIBILITY(default)
31 _GLIBCXX_BEGIN_NAMESPACE_VERSION
// The standard doesn't define bitwise operators for codecvt_mode, which is
// annoying, so the helpers that need them work on the underlying integer.

// Return the value of M converted to the underlying integer type of
// codecvt_mode, so that bitwise arithmetic can be performed on it.
static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)
{ return static_cast<underlying_type<codecvt_mode>::type>(m); }
38 static codecvt_mode
& operator&=(codecvt_mode
& m
, codecvt_mode n
)
39 { return m
= codecvt_mode(to_integer(m
) & to_integer(n
)); }
41 static codecvt_mode
& operator|=(codecvt_mode
& m
, codecvt_mode n
)
42 { return m
= codecvt_mode(to_integer(m
) | to_integer(n
)); }
44 static codecvt_mode
operator~(codecvt_mode m
)
45 { return codecvt_mode(~to_integer(m
)); }
// Largest code point that fits in a single UTF-16 code unit.
const char32_t max_single_utf16_unit = 0xFFFF;

// Largest valid Unicode code point.
const char32_t max_code_point = 0x10FFFF;

// Sentinel values returned by the read_*_code_point functions.
// The functions below rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction).
const char32_t incomplete_mb_character = char32_t(-2);
const char32_t invalid_mb_sequence = char32_t(-1);
// Utility type for reading and writing code units of type Elem from
// a range defined by a pair of pointers.
//
// Assigning a code unit to a range writes it at `next` and advances the
// range by one unit, so `r = x; r = y;` writes two consecutive units.
// No bounds checking is done here; callers check size() first.
template<typename Elem, bool Aligned = true>
  struct range
  {
    Elem* next;  // Current read/write position.
    Elem* end;   // One past the last code unit.

    // Write a code unit and advance.
    range& operator=(Elem e)
    {
      *next++ = e;
      return *this;
    }

    // Read the next code unit.
    Elem operator*() const { return *next; }

    // Read the Nth code unit.
    Elem operator[](size_t n) const { return next[n]; }

    // Move to the next code unit.
    range& operator++()
    {
      ++next;
      return *this;
    }

    // Move to the Nth code unit.
    range& operator+=(size_t n)
    {
      next += n;
      return *this;
    }

    // The number of code units remaining.
    size_t size() const { return end - next; }

    // The number of bytes remaining.
    size_t nbytes() const { return (const char*)end - (const char*)next; }
  };
101 // This specialization is used when accessing char16_t values through
102 // pointers to char, which might not be correctly aligned for char16_t.
103 template<typename Elem
>
104 struct range
<Elem
, false>
106 using value_type
= typename remove_const
<Elem
>::type
;
108 using char_pointer
= typename
109 conditional
<is_const
<Elem
>::value
, const char*, char*>::type
;
114 // Write a code unit.
115 range
& operator=(Elem e
)
117 memcpy(next
, &e
, sizeof(Elem
));
122 // Read the next code unit.
123 Elem
operator*() const
126 memcpy(&e
, next
, sizeof(Elem
));
130 // Read the Nth code unit.
131 Elem
operator[](size_t n
) const
134 memcpy(&e
, next
+ n
* sizeof(Elem
), sizeof(Elem
));
138 // Move to the next code unit.
141 next
+= sizeof(Elem
);
145 // Move to the Nth code unit.
146 range
& operator+=(size_t n
)
148 next
+= n
* sizeof(Elem
);
152 // The number of code units remaining.
153 size_t size() const { return nbytes() / sizeof(Elem
); }
155 // The number of bytes remaining.
156 size_t nbytes() const { return end
- next
; }
// Multibyte sequences can have "header" consisting of Byte Order Mark
const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
const unsigned char utf16_bom[2] = { 0xFE, 0xFF };    // big-endian
const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };  // little-endian
164 // Write a BOM (space permitting).
165 template<typename C
, bool A
, size_t N
>
167 write_bom(range
<C
, A
>& to
, const unsigned char (&bom
)[N
])
169 static_assert( (N
/ sizeof(C
)) != 0, "" );
170 static_assert( (N
% sizeof(C
)) == 0, "" );
174 memcpy(to
.next
, bom
, N
);
175 to
+= (N
/ sizeof(C
));
179 // Try to read a BOM.
180 template<typename C
, bool A
, size_t N
>
182 read_bom(range
<C
, A
>& from
, const unsigned char (&bom
)[N
])
184 static_assert( (N
/ sizeof(C
)) != 0, "" );
185 static_assert( (N
% sizeof(C
)) == 0, "" );
187 if (from
.nbytes() >= N
&& !memcmp(from
.next
, bom
, N
))
189 from
+= (N
/ sizeof(C
));
195 // If generate_header is set in mode write out UTF-8 BOM.
198 write_utf8_bom(range
<C
>& to
, codecvt_mode mode
)
200 if (mode
& generate_header
)
201 return write_bom(to
, utf8_bom
);
205 // If generate_header is set in mode write out the UTF-16 BOM indicated
206 // by whether little_endian is set in mode.
207 template<bool Aligned
>
209 write_utf16_bom(range
<char16_t
, Aligned
>& to
, codecvt_mode mode
)
211 if (mode
& generate_header
)
213 if (mode
& little_endian
)
214 return write_bom(to
, utf16le_bom
);
216 return write_bom(to
, utf16_bom
);
221 // If consume_header is set in mode update from.next to after any BOM.
224 read_utf8_bom(range
<const C
>& from
, codecvt_mode mode
)
226 if (mode
& consume_header
)
227 read_bom(from
, utf8_bom
);
230 // If consume_header is not set in mode, no effects.
231 // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
232 // - if the UTF-16BE BOM was found unset little_endian in mode, or
233 // - if the UTF-16LE BOM was found set little_endian in mode.
234 template<bool Aligned
>
236 read_utf16_bom(range
<const char16_t
, Aligned
>& from
, codecvt_mode
& mode
)
238 if (mode
& consume_header
)
240 if (read_bom(from
, utf16_bom
))
241 mode
&= ~little_endian
;
242 else if (read_bom(from
, utf16le_bom
))
243 mode
|= little_endian
;
247 // Read a codepoint from a UTF-8 multibyte sequence.
248 // Updates from.next if the codepoint is not greater than maxcode.
249 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
252 read_utf8_code_point(range
<const C
>& from
, unsigned long maxcode
)
254 const size_t avail
= from
.size();
256 return incomplete_mb_character
;
257 char32_t c1
= (unsigned char) from
[0];
258 // https://en.wikipedia.org/wiki/UTF-8#Sample_code
259 if (c1
< 0x80) [[likely
]]
264 else if (c1
< 0xC2) [[unlikely
]] // continuation or overlong 2-byte sequence
265 return invalid_mb_sequence
;
266 else if (c1
< 0xE0) // 2-byte sequence
268 if (avail
< 2) [[unlikely
]]
269 return incomplete_mb_character
;
270 char32_t c2
= (unsigned char) from
[1];
271 if ((c2
& 0xC0) != 0x80) [[unlikely
]]
272 return invalid_mb_sequence
;
273 char32_t c
= (c1
<< 6) + c2
- 0x3080;
278 else if (c1
< 0xF0) // 3-byte sequence
280 if (avail
< 2) [[unlikely
]]
281 return incomplete_mb_character
;
282 char32_t c2
= (unsigned char) from
[1];
283 if ((c2
& 0xC0) != 0x80) [[unlikely
]]
284 return invalid_mb_sequence
;
285 if (c1
== 0xE0 && c2
< 0xA0) [[unlikely
]] // overlong
286 return invalid_mb_sequence
;
287 if (c1
== 0xED && c2
>= 0xA0) [[unlikely
]] // surrogate
288 return invalid_mb_sequence
;
289 if (avail
< 3) [[unlikely
]]
290 return incomplete_mb_character
;
291 char32_t c3
= (unsigned char) from
[2];
292 if ((c3
& 0xC0) != 0x80) [[unlikely
]]
293 return invalid_mb_sequence
;
294 char32_t c
= (c1
<< 12) + (c2
<< 6) + c3
- 0xE2080;
299 else if (c1
< 0xF5 && maxcode
> 0xFFFF) // 4-byte sequence
301 if (avail
< 2) [[unlikely
]]
302 return incomplete_mb_character
;
303 char32_t c2
= (unsigned char) from
[1];
304 if ((c2
& 0xC0) != 0x80) [[unlikely
]]
305 return invalid_mb_sequence
;
306 if (c1
== 0xF0 && c2
< 0x90) [[unlikely
]] // overlong
307 return invalid_mb_sequence
;
308 if (c1
== 0xF4 && c2
>= 0x90) [[unlikely
]] // > U+10FFFF
309 return invalid_mb_sequence
;
310 if (avail
< 3) [[unlikely
]]
311 return incomplete_mb_character
;
312 char32_t c3
= (unsigned char) from
[2];
313 if ((c3
& 0xC0) != 0x80) [[unlikely
]]
314 return invalid_mb_sequence
;
315 if (avail
< 4) [[unlikely
]]
316 return incomplete_mb_character
;
317 char32_t c4
= (unsigned char) from
[3];
318 if ((c4
& 0xC0) != 0x80) [[unlikely
]]
319 return invalid_mb_sequence
;
320 char32_t c
= (c1
<< 18) + (c2
<< 12) + (c3
<< 6) + c4
- 0x3C82080;
325 else [[unlikely
]] // > U+10FFFF
326 return invalid_mb_sequence
;
331 write_utf8_code_point(range
<C
>& to
, char32_t code_point
)
333 if (code_point
< 0x80)
335 if (to
.size() < 1) [[unlikely
]]
339 else if (code_point
<= 0x7FF)
341 if (to
.size() < 2) [[unlikely
]]
343 to
= (code_point
>> 6) + 0xC0;
344 to
= (code_point
& 0x3F) + 0x80;
346 else if (code_point
<= 0xFFFF)
348 if (to
.size() < 3) [[unlikely
]]
350 to
= (code_point
>> 12) + 0xE0;
351 to
= ((code_point
>> 6) & 0x3F) + 0x80;
352 to
= (code_point
& 0x3F) + 0x80;
354 else if (code_point
<= 0x10FFFF)
356 if (to
.size() < 4) [[unlikely
]]
358 to
= (code_point
>> 18) + 0xF0;
359 to
= ((code_point
>> 12) & 0x3F) + 0x80;
360 to
= ((code_point
>> 6) & 0x3F) + 0x80;
361 to
= (code_point
& 0x3F) + 0x80;
// Convert the code unit C between the host byte order and the byte order
// selected by MODE (little-endian if little_endian is set, big-endian
// otherwise).  The bytes are swapped only when the two orders differ, so
// the same function serves for both reading and writing.
inline char16_t
adjust_byte_order(char16_t c, codecvt_mode mode)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return (mode & little_endian) ? __builtin_bswap16(c) : c;
#else
  return (mode & little_endian) ? c : __builtin_bswap16(c);
#endif
}
// Return true if c is a high-surrogate (aka leading) code point,
// i.e. in the range [U+D800, U+DBFF].
inline bool
is_high_surrogate(char32_t c)
{
  return c >= 0xD800 && c <= 0xDBFF;
}
// Return true if c is a low-surrogate (aka trailing) code point,
// i.e. in the range [U+DC00, U+DFFF].
inline bool
is_low_surrogate(char32_t c)
{
  return c >= 0xDC00 && c <= 0xDFFF;
}
// Combine a high and a low surrogate into the supplementary-plane code
// point they encode.  The caller must have checked that HIGH and LOW
// really are a high and a low surrogate respectively.
// 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  return (high << 10) + low - 0x35FDC00;
}
398 // Read a codepoint from a UTF-16 multibyte sequence.
399 // The sequence's endianness is indicated by (mode & little_endian).
400 // Updates from.next if the codepoint is not greater than maxcode.
401 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
402 template<bool Aligned
>
404 read_utf16_code_point(range
<const char16_t
, Aligned
>& from
,
405 unsigned long maxcode
, codecvt_mode mode
)
407 const size_t avail
= from
.size();
408 if (avail
== 0) [[unlikely
]]
409 return incomplete_mb_character
;
411 char32_t c
= adjust_byte_order(from
[0], mode
);
412 if (is_high_surrogate(c
))
414 if (avail
< 2) [[unlikely
]]
415 return incomplete_mb_character
;
416 const char16_t c2
= adjust_byte_order(from
[1], mode
);
417 if (is_low_surrogate(c2
)) [[likely
]]
419 c
= surrogate_pair_to_code_point(c
, c2
);
423 return invalid_mb_sequence
;
425 else if (is_low_surrogate(c
)) [[unlikely
]]
426 return invalid_mb_sequence
;
432 template<typename C
, bool A
>
434 write_utf16_code_point(range
<C
, A
>& to
, char32_t codepoint
, codecvt_mode mode
)
436 static_assert(sizeof(C
) >= 2, "a code unit must be at least 16-bit");
438 if (codepoint
<= max_single_utf16_unit
)
442 to
= adjust_byte_order(codepoint
, mode
);
446 else if (to
.size() > 1)
448 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
449 const char32_t LEAD_OFFSET
= 0xD800 - (0x10000 >> 10);
450 char16_t lead
= LEAD_OFFSET
+ (codepoint
>> 10);
451 char16_t trail
= 0xDC00 + (codepoint
& 0x3FF);
452 to
= adjust_byte_order(lead
, mode
);
453 to
= adjust_byte_order(trail
, mode
);
462 ucs4_in(range
<const C
>& from
, range
<char32_t
>& to
,
463 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
465 read_utf8_bom(from
, mode
);
466 while (from
.size() && to
.size())
468 const char32_t codepoint
= read_utf8_code_point(from
, maxcode
);
469 if (codepoint
== incomplete_mb_character
) [[unlikely
]]
470 return codecvt_base::partial
;
471 if (codepoint
> maxcode
) [[unlikely
]]
472 return codecvt_base::error
;
475 return from
.size() ? codecvt_base::partial
: codecvt_base::ok
;
481 ucs4_out(range
<const char32_t
>& from
, range
<C
>& to
,
482 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
484 if (!write_utf8_bom(to
, mode
)) [[unlikely
]]
485 return codecvt_base::partial
;
488 const char32_t c
= from
[0];
489 if (0xD800 <= c
&& c
<= 0xDFFF) [[unlikely
]]
490 return codecvt_base::error
;
491 if (c
> maxcode
) [[unlikely
]]
492 return codecvt_base::error
;
493 if (!write_utf8_code_point(to
, c
)) [[unlikely
]]
494 return codecvt_base::partial
;
497 return codecvt_base::ok
;
502 ucs4_in(range
<const char16_t
, false>& from
, range
<char32_t
>& to
,
503 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
505 read_utf16_bom(from
, mode
);
506 while (from
.size() && to
.size())
508 const char32_t codepoint
= read_utf16_code_point(from
, maxcode
, mode
);
509 if (codepoint
== incomplete_mb_character
) [[unlikely
]]
510 return codecvt_base::partial
;
511 if (codepoint
> maxcode
) [[unlikely
]]
512 return codecvt_base::error
;
515 return from
.nbytes() ? codecvt_base::partial
: codecvt_base::ok
;
520 ucs4_out(range
<const char32_t
>& from
, range
<char16_t
, false>& to
,
521 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
523 if (!write_utf16_bom(to
, mode
)) [[unlikely
]]
524 return codecvt_base::partial
;
527 const char32_t c
= from
[0];
528 if (0xD800 <= c
&& c
<= 0xDFFF) [[unlikely
]]
529 return codecvt_base::error
;
530 if (c
> maxcode
) [[unlikely
]]
531 return codecvt_base::error
;
532 if (!write_utf16_code_point(to
, c
, mode
)) [[unlikely
]]
533 return codecvt_base::partial
;
536 return codecvt_base::ok
;
// Flag indicating whether to process UTF-16 or UCS2
// (allowed: surrogate pairs are decoded; disallowed: any surrogate is
// an error).
enum class surrogates { allowed, disallowed };
542 // utf8 -> utf16 (or utf8 -> ucs2 if maxcode <= 0xFFFF)
543 template <typename C8
, typename C16
>
545 utf16_in(range
<const C8
> &from
, range
<C16
> &to
,
546 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
548 read_utf8_bom(from
, mode
);
549 while (from
.size() && to
.size())
552 const char32_t codepoint
= read_utf8_code_point(from
, maxcode
);
553 if (codepoint
== incomplete_mb_character
) [[unlikely
]]
554 return codecvt_base::partial
;
555 if (codepoint
> maxcode
)
556 return codecvt_base::error
;
557 if (!write_utf16_code_point(to
, codepoint
, mode
)) [[unlikely
]]
559 from
= orig
; // rewind to previous position
560 return codecvt_base::partial
;
563 return from
.size() ? codecvt_base::partial
: codecvt_base::ok
;
566 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
567 template<typename C16
, typename C8
>
569 utf16_out(range
<const C16
>& from
, range
<C8
>& to
,
570 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {},
571 surrogates s
= surrogates::allowed
)
573 if (!write_utf8_bom(to
, mode
)) [[unlikely
]]
574 return codecvt_base::partial
;
577 char32_t c
= from
[0];
579 if (is_high_surrogate(c
))
581 if (s
== surrogates::disallowed
) [[unlikely
]]
582 return codecvt_base::error
; // No surrogates in UCS-2
584 if (from
.size() < 2) [[unlikely
]]
585 return codecvt_base::partial
; // stop converting at this point
587 const char32_t c2
= from
[1];
588 if (is_low_surrogate(c2
)) [[likely
]]
590 c
= surrogate_pair_to_code_point(c
, c2
);
594 return codecvt_base::error
;
596 else if (is_low_surrogate(c
)) [[unlikely
]]
597 return codecvt_base::error
;
598 if (c
> maxcode
) [[unlikely
]]
599 return codecvt_base::error
;
600 if (!write_utf8_code_point(to
, c
)) [[unlikely
]]
601 return codecvt_base::partial
;
604 return codecvt_base::ok
;
607 // return pos such that [begin,pos) is valid UTF-16 string no longer than max
610 utf16_span(const C
* begin
, const C
* end
, size_t max
,
611 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
613 range
<const C
> from
{ begin
, end
};
614 read_utf8_bom(from
, mode
);
616 while (count
+1 < max
)
618 char32_t c
= read_utf8_code_point(from
, maxcode
);
621 else if (c
> max_single_utf16_unit
)
625 if (count
+1 == max
) // take one more character if it fits in a single unit
626 read_utf8_code_point(from
, std::min(max_single_utf16_unit
, maxcode
));
633 ucs2_in(range
<const C
>& from
, range
<char16_t
>& to
,
634 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
636 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
637 maxcode
= std::min(max_single_utf16_unit
, maxcode
);
638 return utf16_in(from
, to
, maxcode
, mode
);
644 ucs2_out(range
<const char16_t
>& from
, range
<C
>& to
,
645 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
647 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
648 maxcode
= std::min(max_single_utf16_unit
, maxcode
);
649 return utf16_out(from
, to
, maxcode
, mode
, surrogates::disallowed
);
654 ucs2_out(range
<const char16_t
>& from
, range
<char16_t
, false>& to
,
655 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
657 if (!write_utf16_bom(to
, mode
))
658 return codecvt_base::partial
;
659 while (from
.size() && to
.size())
661 char16_t c
= from
[0];
662 if (0xD800 <= c
&& c
<= 0xDFFF)
663 return codecvt_base::error
;
665 return codecvt_base::error
;
666 to
= adjust_byte_order(c
, mode
);
669 return from
.size() == 0 ? codecvt_base::ok
: codecvt_base::partial
;
674 ucs2_in(range
<const char16_t
, false>& from
, range
<char16_t
>& to
,
675 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
677 read_utf16_bom(from
, mode
);
678 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
679 maxcode
= std::min(max_single_utf16_unit
, maxcode
);
680 while (from
.size() && to
.size())
682 const char32_t c
= read_utf16_code_point(from
, maxcode
, mode
);
683 if (c
== incomplete_mb_character
)
684 return codecvt_base::error
; // UCS-2 only supports single units.
686 return codecvt_base::error
;
689 return from
.nbytes() == 0 ? codecvt_base::ok
: codecvt_base::partial
;
693 ucs2_span(range
<const char16_t
, false>& from
, size_t max
,
694 char32_t maxcode
, codecvt_mode mode
)
696 read_utf16_bom(from
, mode
);
697 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
698 maxcode
= std::min(max_single_utf16_unit
, maxcode
);
700 while (max
-- && c
<= maxcode
)
701 c
= read_utf16_code_point(from
, maxcode
, mode
);
702 return reinterpret_cast<const char16_t
*>(from
.next
);
707 ucs2_span(const C
* begin
, const C
* end
, size_t max
,
708 char32_t maxcode
, codecvt_mode mode
)
710 range
<const C
> from
{ begin
, end
};
711 read_utf8_bom(from
, mode
);
712 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
713 maxcode
= std::min(max_single_utf16_unit
, maxcode
);
715 while (max
-- && c
<= maxcode
)
716 c
= read_utf8_code_point(from
, maxcode
);
720 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
723 ucs4_span(const C
* begin
, const C
* end
, size_t max
,
724 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
726 range
<const C
> from
{ begin
, end
};
727 read_utf8_bom(from
, mode
);
729 while (max
-- && c
<= maxcode
)
730 c
= read_utf8_code_point(from
, maxcode
);
734 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
736 ucs4_span(range
<const char16_t
, false>& from
, size_t max
,
737 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
739 read_utf16_bom(from
, mode
);
741 while (max
-- && c
<= maxcode
)
742 c
= read_utf16_code_point(from
, maxcode
, mode
);
743 return reinterpret_cast<const char16_t
*>(from
.next
);
747 // Define members of codecvt<char16_t, char, mbstate_t> specialization.
748 // Converts from UTF-8 to UTF-16.
750 locale::id codecvt
<char16_t
, char, mbstate_t>::id
;
752 codecvt
<char16_t
, char, mbstate_t>::~codecvt() { }
755 codecvt
<char16_t
, char, mbstate_t>::
757 const intern_type
* __from
,
758 const intern_type
* __from_end
, const intern_type
*& __from_next
,
759 extern_type
* __to
, extern_type
* __to_end
,
760 extern_type
*& __to_next
) const
762 range
<const char16_t
> from
{ __from
, __from_end
};
763 range
<char> to
{ __to
, __to_end
};
764 auto res
= utf16_out(from
, to
);
765 __from_next
= from
.next
;
771 codecvt
<char16_t
, char, mbstate_t>::
772 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
773 extern_type
*& __to_next
) const
776 return noconv
; // we don't use mbstate_t for the unicode facets
780 codecvt
<char16_t
, char, mbstate_t>::
781 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
782 const extern_type
*& __from_next
,
783 intern_type
* __to
, intern_type
* __to_end
,
784 intern_type
*& __to_next
) const
786 range
<const char> from
{ __from
, __from_end
};
787 range
<char16_t
> to
{ __to
, __to_end
};
788 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
789 codecvt_mode mode
= {};
791 codecvt_mode mode
= little_endian
;
793 auto res
= utf16_in(from
, to
, max_code_point
, mode
);
794 __from_next
= from
.next
;
800 codecvt
<char16_t
, char, mbstate_t>::do_encoding() const throw()
801 { return 0; } // UTF-8 is not a fixed-width encoding
804 codecvt
<char16_t
, char, mbstate_t>::do_always_noconv() const throw()
808 codecvt
<char16_t
, char, mbstate_t>::
809 do_length(state_type
&, const extern_type
* __from
,
810 const extern_type
* __end
, size_t __max
) const
812 __end
= utf16_span(__from
, __end
, __max
);
813 return __end
- __from
;
817 codecvt
<char16_t
, char, mbstate_t>::do_max_length() const throw()
819 // A single character (one or two UTF-16 code units) requires
820 // up to four UTF-8 code units.
824 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
825 // Converts from UTF-8 to UTF-32 (aka UCS-4).
827 locale::id codecvt
<char32_t
, char, mbstate_t>::id
;
829 codecvt
<char32_t
, char, mbstate_t>::~codecvt() { }
832 codecvt
<char32_t
, char, mbstate_t>::
833 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
834 const intern_type
*& __from_next
,
835 extern_type
* __to
, extern_type
* __to_end
,
836 extern_type
*& __to_next
) const
838 range
<const char32_t
> from
{ __from
, __from_end
};
839 range
<char> to
{ __to
, __to_end
};
840 auto res
= ucs4_out(from
, to
);
841 __from_next
= from
.next
;
847 codecvt
<char32_t
, char, mbstate_t>::
848 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
849 extern_type
*& __to_next
) const
856 codecvt
<char32_t
, char, mbstate_t>::
857 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
858 const extern_type
*& __from_next
,
859 intern_type
* __to
, intern_type
* __to_end
,
860 intern_type
*& __to_next
) const
862 range
<const char> from
{ __from
, __from_end
};
863 range
<char32_t
> to
{ __to
, __to_end
};
864 auto res
= ucs4_in(from
, to
);
865 __from_next
= from
.next
;
871 codecvt
<char32_t
, char, mbstate_t>::do_encoding() const throw()
872 { return 0; } // UTF-8 is not a fixed-width encoding
875 codecvt
<char32_t
, char, mbstate_t>::do_always_noconv() const throw()
879 codecvt
<char32_t
, char, mbstate_t>::
880 do_length(state_type
&, const extern_type
* __from
,
881 const extern_type
* __end
, size_t __max
) const
883 __end
= ucs4_span(__from
, __end
, __max
);
884 return __end
- __from
;
888 codecvt
<char32_t
, char, mbstate_t>::do_max_length() const throw()
890 // A single character (one UTF-32 code unit) requires
891 // up to 4 UTF-8 code units.
895 #if defined(_GLIBCXX_USE_CHAR8_T)
896 // Define members of codecvt<char16_t, char8_t, mbstate_t> specialization.
897 // Converts from UTF-8 to UTF-16.
899 locale::id codecvt
<char16_t
, char8_t
, mbstate_t>::id
;
901 codecvt
<char16_t
, char8_t
, mbstate_t>::~codecvt() { }
904 codecvt
<char16_t
, char8_t
, mbstate_t>::
906 const intern_type
* __from
,
907 const intern_type
* __from_end
, const intern_type
*& __from_next
,
908 extern_type
* __to
, extern_type
* __to_end
,
909 extern_type
*& __to_next
) const
911 range
<const char16_t
> from
{ __from
, __from_end
};
912 range
<char8_t
> to
{ __to
, __to_end
};
913 auto res
= utf16_out(from
, to
);
914 __from_next
= from
.next
;
920 codecvt
<char16_t
, char8_t
, mbstate_t>::
921 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
922 extern_type
*& __to_next
) const
925 return noconv
; // we don't use mbstate_t for the unicode facets
929 codecvt
<char16_t
, char8_t
, mbstate_t>::
930 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
931 const extern_type
*& __from_next
,
932 intern_type
* __to
, intern_type
* __to_end
,
933 intern_type
*& __to_next
) const
935 range
<const char8_t
> from
{ __from
, __from_end
};
936 range
<char16_t
> to
{ __to
, __to_end
};
937 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
938 codecvt_mode mode
= {};
940 codecvt_mode mode
= little_endian
;
942 auto res
= utf16_in(from
, to
, max_code_point
, mode
);
943 __from_next
= from
.next
;
949 codecvt
<char16_t
, char8_t
, mbstate_t>::do_encoding() const throw()
950 { return 0; } // UTF-8 is not a fixed-width encoding
953 codecvt
<char16_t
, char8_t
, mbstate_t>::do_always_noconv() const throw()
957 codecvt
<char16_t
, char8_t
, mbstate_t>::
958 do_length(state_type
&, const extern_type
* __from
,
959 const extern_type
* __end
, size_t __max
) const
961 __end
= utf16_span(__from
, __end
, __max
);
962 return __end
- __from
;
966 codecvt
<char16_t
, char8_t
, mbstate_t>::do_max_length() const throw()
968 // A single character (one or two UTF-16 code units) requires
969 // up to four UTF-8 code units.
973 // Define members of codecvt<char32_t, char8_t, mbstate_t> specialization.
974 // Converts from UTF-8 to UTF-32 (aka UCS-4).
976 locale::id codecvt
<char32_t
, char8_t
, mbstate_t>::id
;
978 codecvt
<char32_t
, char8_t
, mbstate_t>::~codecvt() { }
981 codecvt
<char32_t
, char8_t
, mbstate_t>::
982 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
983 const intern_type
*& __from_next
,
984 extern_type
* __to
, extern_type
* __to_end
,
985 extern_type
*& __to_next
) const
987 range
<const char32_t
> from
{ __from
, __from_end
};
988 range
<char8_t
> to
{ __to
, __to_end
};
989 auto res
= ucs4_out(from
, to
);
990 __from_next
= from
.next
;
996 codecvt
<char32_t
, char8_t
, mbstate_t>::
997 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
998 extern_type
*& __to_next
) const
1004 codecvt_base::result
1005 codecvt
<char32_t
, char8_t
, mbstate_t>::
1006 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1007 const extern_type
*& __from_next
,
1008 intern_type
* __to
, intern_type
* __to_end
,
1009 intern_type
*& __to_next
) const
1011 range
<const char8_t
> from
{ __from
, __from_end
};
1012 range
<char32_t
> to
{ __to
, __to_end
};
1013 auto res
= ucs4_in(from
, to
);
1014 __from_next
= from
.next
;
1015 __to_next
= to
.next
;
1020 codecvt
<char32_t
, char8_t
, mbstate_t>::do_encoding() const throw()
1021 { return 0; } // UTF-8 is not a fixed-width encoding
1024 codecvt
<char32_t
, char8_t
, mbstate_t>::do_always_noconv() const throw()
1028 codecvt
<char32_t
, char8_t
, mbstate_t>::
1029 do_length(state_type
&, const extern_type
* __from
,
1030 const extern_type
* __end
, size_t __max
) const
1032 __end
= ucs4_span(__from
, __end
, __max
);
1033 return __end
- __from
;
1037 codecvt
<char32_t
, char8_t
, mbstate_t>::do_max_length() const throw()
1039 // A single character (one UTF-32 code unit) requires
1040 // up to 4 UTF-8 code units.
1043 #endif // _GLIBCXX_USE_CHAR8_T
1045 // Define members of codecvt_utf8<char16_t> base class implementation.
1046 // Converts from UTF-8 to UCS-2.
1048 __codecvt_utf8_base
<char16_t
>::~__codecvt_utf8_base() { }
1050 codecvt_base::result
1051 __codecvt_utf8_base
<char16_t
>::
1052 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1053 const intern_type
*& __from_next
,
1054 extern_type
* __to
, extern_type
* __to_end
,
1055 extern_type
*& __to_next
) const
1057 range
<const char16_t
> from
{ __from
, __from_end
};
1058 range
<char> to
{ __to
, __to_end
};
1059 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1060 __from_next
= from
.next
;
1061 __to_next
= to
.next
;
1065 codecvt_base::result
1066 __codecvt_utf8_base
<char16_t
>::
1067 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1068 extern_type
*& __to_next
) const
1074 codecvt_base::result
1075 __codecvt_utf8_base
<char16_t
>::
1076 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1077 const extern_type
*& __from_next
,
1078 intern_type
* __to
, intern_type
* __to_end
,
1079 intern_type
*& __to_next
) const
1081 range
<const char> from
{ __from
, __from_end
};
1082 range
<char16_t
> to
{ __to
, __to_end
};
1083 codecvt_mode mode
= codecvt_mode(_M_mode
& (consume_header
|generate_header
));
1084 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1085 mode
= codecvt_mode(mode
| little_endian
);
1087 auto res
= ucs2_in(from
, to
, _M_maxcode
, mode
);
1088 __from_next
= from
.next
;
1089 __to_next
= to
.next
;
1094 __codecvt_utf8_base
<char16_t
>::do_encoding() const throw()
1095 { return 0; } // UTF-8 is not a fixed-width encoding
1098 __codecvt_utf8_base
<char16_t
>::do_always_noconv() const throw()
1102 __codecvt_utf8_base
<char16_t
>::
1103 do_length(state_type
&, const extern_type
* __from
,
1104 const extern_type
* __end
, size_t __max
) const
1106 __end
= ucs2_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1107 return __end
- __from
;
1111 __codecvt_utf8_base
<char16_t
>::do_max_length() const throw()
1113 // A single UCS-2 character requires up to three UTF-8 code units.
1114 // (UCS-2 cannot represent characters that use four UTF-8 code units).
1116 if (_M_mode
& consume_header
)
1117 max
+= sizeof(utf8_bom
);
1121 // Define members of codecvt_utf8<char32_t> base class implementation.
1122 // Converts from UTF-8 to UTF-32 (aka UCS-4).
1124 __codecvt_utf8_base
<char32_t
>::~__codecvt_utf8_base() { }
1126 codecvt_base::result
1127 __codecvt_utf8_base
<char32_t
>::
1128 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1129 const intern_type
*& __from_next
,
1130 extern_type
* __to
, extern_type
* __to_end
,
1131 extern_type
*& __to_next
) const
1133 range
<const char32_t
> from
{ __from
, __from_end
};
1134 range
<char> to
{ __to
, __to_end
};
1135 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1136 __from_next
= from
.next
;
1137 __to_next
= to
.next
;
1141 codecvt_base::result
1142 __codecvt_utf8_base
<char32_t
>::
1143 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1144 extern_type
*& __to_next
) const
1150 codecvt_base::result
1151 __codecvt_utf8_base
<char32_t
>::
1152 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1153 const extern_type
*& __from_next
,
1154 intern_type
* __to
, intern_type
* __to_end
,
1155 intern_type
*& __to_next
) const
1157 range
<const char> from
{ __from
, __from_end
};
1158 range
<char32_t
> to
{ __to
, __to_end
};
1159 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
1160 __from_next
= from
.next
;
1161 __to_next
= to
.next
;
1166 __codecvt_utf8_base
<char32_t
>::do_encoding() const throw()
1167 { return 0; } // UTF-8 is not a fixed-width encoding
1170 __codecvt_utf8_base
<char32_t
>::do_always_noconv() const throw()
1174 __codecvt_utf8_base
<char32_t
>::
1175 do_length(state_type
&, const extern_type
* __from
,
1176 const extern_type
* __end
, size_t __max
) const
1178 __end
= ucs4_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1179 return __end
- __from
;
1183 __codecvt_utf8_base
<char32_t
>::do_max_length() const throw()
1185 // A single UCS-4 character requires up to four UTF-8 code units.
1187 if (_M_mode
& consume_header
)
1188 max
+= sizeof(utf8_bom
);
1192 #ifdef _GLIBCXX_USE_WCHAR_T
// The wchar_t facets below reinterpret wchar_t buffers as char16_t or
// char32_t buffers, so the sizes must match exactly.
#if __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
#elif __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
#endif
1200 // Define members of codecvt_utf8<wchar_t> base class implementation.
1201 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
// Out-of-line destructor of __codecvt_utf8_base<wchar_t>; the body is empty.
1203 __codecvt_utf8_base
<wchar_t>::~__codecvt_utf8_base() { }
// do_out for __codecvt_utf8_base<wchar_t>.
// The wchar_t input pointers are reinterpreted as char16_t (2-byte wchar_t,
// converted with ucs2_out) or char32_t (4-byte wchar_t, converted with
// ucs4_out); the output is a plain char range.
// NOTE(review): the `return codecvt_base::error;` line looks like the
// fallback for any other wchar_t size -- confirm against the preprocessor
// structure.
1205 codecvt_base::result
1206 __codecvt_utf8_base
<wchar_t>::
1207 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1208 const intern_type
*& __from_next
,
1209 extern_type
* __to
, extern_type
* __to_end
,
1210 extern_type
*& __to_next
) const
1212 range
<char> to
{ __to
, __to_end
};
1213 #if __SIZEOF_WCHAR_T__ == 2
1214 range
<const char16_t
> from
{
1215 reinterpret_cast<const char16_t
*>(__from
),
1216 reinterpret_cast<const char16_t
*>(__from_end
)
1218 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1219 #elif __SIZEOF_WCHAR_T__ == 4
1220 range
<const char32_t
> from
{
1221 reinterpret_cast<const char32_t
*>(__from
),
1222 reinterpret_cast<const char32_t
*>(__from_end
)
1224 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1226 return codecvt_base::error
;
// from.next is cast back to wchar_t before being reported to the caller.
1228 __from_next
= reinterpret_cast<const wchar_t*>(from
.next
);
1229 __to_next
= to
.next
;
// do_unshift for __codecvt_utf8_base<wchar_t>.
// Takes the start of the output buffer (__to), an unnamed end pointer and
// the __to_next out-parameter; the state argument is unnamed as well.
1233 codecvt_base::result
1234 __codecvt_utf8_base
<wchar_t>::
1235 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1236 extern_type
*& __to_next
) const
// do_in for __codecvt_utf8_base<wchar_t>.
// The wchar_t output pointers are reinterpreted as char16_t (2-byte
// wchar_t, converted with ucs2_in) or char32_t (4-byte wchar_t, converted
// with ucs4_in); the input is a const char range.
// NOTE(review): the `return codecvt_base::error;` line looks like the
// fallback for any other wchar_t size -- confirm against the preprocessor
// structure.
1242 codecvt_base::result
1243 __codecvt_utf8_base
<wchar_t>::
1244 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1245 const extern_type
*& __from_next
,
1246 intern_type
* __to
, intern_type
* __to_end
,
1247 intern_type
*& __to_next
) const
1249 range
<const char> from
{ __from
, __from_end
};
1250 #if __SIZEOF_WCHAR_T__ == 2
1252 reinterpret_cast<char16_t
*>(__to
),
1253 reinterpret_cast<char16_t
*>(__to_end
)
// In the 2-byte case ucs2_in receives a locally built mode, not _M_mode:
// an empty mode when __BYTE_ORDER__ is big-endian, little_endian in the
// other definition.
1255 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1256 codecvt_mode mode
= {};
1258 codecvt_mode mode
= little_endian
;
1260 auto res
= ucs2_in(from
, to
, _M_maxcode
, mode
);
1261 #elif __SIZEOF_WCHAR_T__ == 4
1263 reinterpret_cast<char32_t
*>(__to
),
1264 reinterpret_cast<char32_t
*>(__to_end
)
1266 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
1268 return codecvt_base::error
;
1270 __from_next
= from
.next
;
// to.next is cast back to wchar_t before being reported to the caller.
1271 __to_next
= reinterpret_cast<wchar_t*>(to
.next
);
// do_encoding for __codecvt_utf8_base<wchar_t>: always returns 0.
1276 __codecvt_utf8_base
<wchar_t>::do_encoding() const throw()
1277 { return 0; } // UTF-8 is not a fixed-width encoding
// do_always_noconv for __codecvt_utf8_base<wchar_t>; a const member
// declared with an empty throw() specification.
1280 __codecvt_utf8_base
<wchar_t>::do_always_noconv() const throw()
// do_length for __codecvt_utf8_base<wchar_t>.
// Reuses the __end parameter to hold the pointer returned by ucs2_span
// (2-byte wchar_t) or ucs4_span (4-byte wchar_t) and returns the distance
// from __from to that pointer.
1284 __codecvt_utf8_base
<wchar_t>::
1285 do_length(state_type
&, const extern_type
* __from
,
1286 const extern_type
* __end
, size_t __max
) const
1288 #if __SIZEOF_WCHAR_T__ == 2
1289 __end
= ucs2_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1290 #elif __SIZEOF_WCHAR_T__ == 4
1291 __end
= ucs4_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1295 return __end
- __from
;
// do_max_length for __codecvt_utf8_base<wchar_t>.
// Starts from 3 for 2-byte wchar_t and from 4 in the other definition, then
// adds sizeof(utf8_bom) when consume_header is set in _M_mode.
1299 __codecvt_utf8_base
<wchar_t>::do_max_length() const throw()
1301 #if __SIZEOF_WCHAR_T__ == 2
1302 int max
= 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1304 int max
= 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1306 if (_M_mode
& consume_header
)
1307 max
+= sizeof(utf8_bom
);
1312 // Define members of codecvt_utf16<char16_t> base class implementation.
1313 // Converts from UTF-16 to UCS-2.
// Out-of-line destructor of __codecvt_utf16_base<char16_t>; the body is empty.
1315 __codecvt_utf16_base
<char16_t
>::~__codecvt_utf16_base() { }
// do_out for __codecvt_utf16_base<char16_t>.
// The char16_t input is converted with ucs2_out into the external buffer,
// which is accessed through a range<char16_t, false> built from the char
// pointers __to/__to_end.
1317 codecvt_base::result
1318 __codecvt_utf16_base
<char16_t
>::
1319 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1320 const intern_type
*& __from_next
,
1321 extern_type
* __to
, extern_type
* __to_end
,
1322 extern_type
*& __to_next
) const
1324 range
<const char16_t
> from
{ __from
, __from_end
};
1325 range
<char16_t
, false> to
{ __to
, __to_end
};
1326 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1327 __from_next
= from
.next
;
// to.next is cast back to char* for the caller.
1328 __to_next
= reinterpret_cast<char*>(to
.next
);
// do_unshift for __codecvt_utf16_base<char16_t>.
// Takes the start of the output buffer (__to), an unnamed end pointer and
// the __to_next out-parameter; the state argument is unnamed as well.
1332 codecvt_base::result
1333 __codecvt_utf16_base
<char16_t
>::
1334 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1335 extern_type
*& __to_next
) const
// do_in for __codecvt_utf16_base<char16_t>.
// The external char buffer is read through a range<const char16_t, false>
// and converted with ucs2_in into the char16_t output range.
1341 codecvt_base::result
1342 __codecvt_utf16_base
<char16_t
>::
1343 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1344 const extern_type
*& __from_next
,
1345 intern_type
* __to
, intern_type
* __to_end
,
1346 intern_type
*& __to_next
) const
1348 range
<const char16_t
, false> from
{ __from
, __from_end
};
1349 range
<char16_t
> to
{ __to
, __to_end
};
1350 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
// from.next is cast back to const char* for the caller.
1351 __from_next
= reinterpret_cast<const char*>(from
.next
);
1352 __to_next
= to
.next
;
// do_encoding for __codecvt_utf16_base<char16_t>: always returns 0.
1357 __codecvt_utf16_base
<char16_t
>::do_encoding() const throw()
1358 { return 0; } // UTF-16 is not a fixed-width encoding
// do_always_noconv for __codecvt_utf16_base<char16_t>; a const member
// declared with an empty throw() specification.
1361 __codecvt_utf16_base
<char16_t
>::do_always_noconv() const throw()
// do_length for __codecvt_utf16_base<char16_t>.
// Views the char input as range<const char16_t, false>, lets ucs2_span
// pick the stopping position, and returns that position's distance from
// __from measured in chars.
1365 __codecvt_utf16_base
<char16_t
>::
1366 do_length(state_type
&, const extern_type
* __from
,
1367 const extern_type
* __end
, size_t __max
) const
1369 range
<const char16_t
, false> from
{ __from
, __end
};
1370 const char16_t
* next
= ucs2_span(from
, __max
, _M_maxcode
, _M_mode
);
1371 return reinterpret_cast<const char*>(next
) - __from
;
// do_max_length for __codecvt_utf16_base<char16_t>.
// When consume_header is set in _M_mode, sizeof(utf16_bom) is added to max.
1375 __codecvt_utf16_base
<char16_t
>::do_max_length() const throw()
1377 // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1378 // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1380 if (_M_mode
& consume_header
)
1381 max
+= sizeof(utf16_bom
);
1385 // Define members of codecvt_utf16<char32_t> base class implementation.
1386 // Converts from UTF-16 to UTF-32 (aka UCS-4).
// Out-of-line destructor of __codecvt_utf16_base<char32_t>; the body is empty.
1388 __codecvt_utf16_base
<char32_t
>::~__codecvt_utf16_base() { }
// do_out for __codecvt_utf16_base<char32_t>.
// The char32_t input is converted with ucs4_out into the external buffer,
// which is accessed through a range<char16_t, false> built from the char
// pointers __to/__to_end.
1390 codecvt_base::result
1391 __codecvt_utf16_base
<char32_t
>::
1392 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1393 const intern_type
*& __from_next
,
1394 extern_type
* __to
, extern_type
* __to_end
,
1395 extern_type
*& __to_next
) const
1397 range
<const char32_t
> from
{ __from
, __from_end
};
1398 range
<char16_t
, false> to
{ __to
, __to_end
};
1399 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1400 __from_next
= from
.next
;
// to.next is cast back to char* for the caller.
1401 __to_next
= reinterpret_cast<char*>(to
.next
);
// do_unshift for __codecvt_utf16_base<char32_t>.
// Takes the start of the output buffer (__to), an unnamed end pointer and
// the __to_next out-parameter; the state argument is unnamed as well.
1405 codecvt_base::result
1406 __codecvt_utf16_base
<char32_t
>::
1407 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1408 extern_type
*& __to_next
) const
// do_in for __codecvt_utf16_base<char32_t>.
// The external char buffer is read through a range<const char16_t, false>
// and converted with ucs4_in into the char32_t output range.
1414 codecvt_base::result
1415 __codecvt_utf16_base
<char32_t
>::
1416 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1417 const extern_type
*& __from_next
,
1418 intern_type
* __to
, intern_type
* __to_end
,
1419 intern_type
*& __to_next
) const
1421 range
<const char16_t
, false> from
{ __from
, __from_end
};
1422 range
<char32_t
> to
{ __to
, __to_end
};
1423 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
// from.next is cast back to const char* for the caller.
1424 __from_next
= reinterpret_cast<const char*>(from
.next
);
1425 __to_next
= to
.next
;
// do_encoding for __codecvt_utf16_base<char32_t>: always returns 0.
1430 __codecvt_utf16_base
<char32_t
>::do_encoding() const throw()
1431 { return 0; } // UTF-16 is not a fixed-width encoding
// do_always_noconv for __codecvt_utf16_base<char32_t>; a const member
// declared with an empty throw() specification.
1434 __codecvt_utf16_base
<char32_t
>::do_always_noconv() const throw()
// do_length for __codecvt_utf16_base<char32_t>.
// Views the char input as range<const char16_t, false>, lets ucs4_span
// pick the stopping position, and returns that position's distance from
// __from measured in chars.
1438 __codecvt_utf16_base
<char32_t
>::
1439 do_length(state_type
&, const extern_type
* __from
,
1440 const extern_type
* __end
, size_t __max
) const
1442 range
<const char16_t
, false> from
{ __from
, __end
};
1443 const char16_t
* next
= ucs4_span(from
, __max
, _M_maxcode
, _M_mode
);
1444 return reinterpret_cast<const char*>(next
) - __from
;
// do_max_length for __codecvt_utf16_base<char32_t>.
// When consume_header is set in _M_mode, sizeof(utf16_bom) is added to max.
1448 __codecvt_utf16_base
<char32_t
>::do_max_length() const throw()
1450 // A single UCS-4 character requires one or two UTF-16 code units
1451 // (so up to four chars).
1453 if (_M_mode
& consume_header
)
1454 max
+= sizeof(utf16_bom
);
1458 #ifdef _GLIBCXX_USE_WCHAR_T
1459 // Define members of codecvt_utf16<wchar_t> base class implementation.
1460 // Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
// Out-of-line destructor of __codecvt_utf16_base<wchar_t>; the body is empty.
1462 __codecvt_utf16_base
<wchar_t>::~__codecvt_utf16_base() { }
// do_out for __codecvt_utf16_base<wchar_t>.
// The wchar_t input pointers are reinterpreted as char16_t (2-byte wchar_t,
// converted with ucs2_out) or char32_t (4-byte wchar_t, converted with
// ucs4_out); the external buffer is accessed through range<char16_t, false>.
// NOTE(review): the `return codecvt_base::error;` line looks like the
// fallback for any other wchar_t size -- confirm against the preprocessor
// structure.
1464 codecvt_base::result
1465 __codecvt_utf16_base
<wchar_t>::
1466 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1467 const intern_type
*& __from_next
,
1468 extern_type
* __to
, extern_type
* __to_end
,
1469 extern_type
*& __to_next
) const
1471 range
<char16_t
, false> to
{ __to
, __to_end
};
1472 #if __SIZEOF_WCHAR_T__ == 2
1473 range
<const char16_t
> from
{
1474 reinterpret_cast<const char16_t
*>(__from
),
1475 reinterpret_cast<const char16_t
*>(__from_end
),
1477 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1478 #elif __SIZEOF_WCHAR_T__ == 4
1479 range
<const char32_t
> from
{
1480 reinterpret_cast<const char32_t
*>(__from
),
1481 reinterpret_cast<const char32_t
*>(__from_end
),
1483 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1485 return codecvt_base::error
;
// Both positions are cast back to the caller's pointer types.
1487 __from_next
= reinterpret_cast<const wchar_t*>(from
.next
);
1488 __to_next
= reinterpret_cast<char*>(to
.next
);
// do_unshift for __codecvt_utf16_base<wchar_t>.
// Takes the start of the output buffer (__to), an unnamed end pointer and
// the __to_next out-parameter; the state argument is unnamed as well.
1492 codecvt_base::result
1493 __codecvt_utf16_base
<wchar_t>::
1494 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1495 extern_type
*& __to_next
) const
// do_in for __codecvt_utf16_base<wchar_t>.
// The external char buffer is read through range<const char16_t, false>;
// the wchar_t output pointers are reinterpreted as char16_t (2-byte
// wchar_t, converted with ucs2_in) or char32_t (4-byte wchar_t, converted
// with ucs4_in).
// NOTE(review): the `return codecvt_base::error;` line looks like the
// fallback for any other wchar_t size -- confirm against the preprocessor
// structure.
1501 codecvt_base::result
1502 __codecvt_utf16_base
<wchar_t>::
1503 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1504 const extern_type
*& __from_next
,
1505 intern_type
* __to
, intern_type
* __to_end
,
1506 intern_type
*& __to_next
) const
1508 range
<const char16_t
, false> from
{ __from
, __from_end
};
1509 #if __SIZEOF_WCHAR_T__ == 2
1511 reinterpret_cast<char16_t
*>(__to
),
1512 reinterpret_cast<char16_t
*>(__to_end
),
1514 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
1515 #elif __SIZEOF_WCHAR_T__ == 4
1517 reinterpret_cast<char32_t
*>(__to
),
1518 reinterpret_cast<char32_t
*>(__to_end
),
1520 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
1522 return codecvt_base::error
;
// Both positions are cast back to the caller's pointer types.
1524 __from_next
= reinterpret_cast<const char*>(from
.next
);
1525 __to_next
= reinterpret_cast<wchar_t*>(to
.next
);
// do_encoding for __codecvt_utf16_base<wchar_t>: always returns 0.
1530 __codecvt_utf16_base
<wchar_t>::do_encoding() const throw()
1531 { return 0; } // UTF-16 is not a fixed-width encoding
// do_always_noconv for __codecvt_utf16_base<wchar_t>; a const member
// declared with an empty throw() specification.
1534 __codecvt_utf16_base
<wchar_t>::do_always_noconv() const throw()
// do_length for __codecvt_utf16_base<wchar_t>.
// Views the char input as range<const char16_t, false>, lets ucs2_span
// (2-byte wchar_t) or ucs4_span (4-byte wchar_t) pick the stopping
// position, and returns that position's distance from __from in chars.
1538 __codecvt_utf16_base
<wchar_t>::
1539 do_length(state_type
&, const extern_type
* __from
,
1540 const extern_type
* __end
, size_t __max
) const
1542 range
<const char16_t
, false> from
{ __from
, __end
};
1543 #if __SIZEOF_WCHAR_T__ == 2
1544 const char16_t
* next
= ucs2_span(from
, __max
, _M_maxcode
, _M_mode
);
1545 #elif __SIZEOF_WCHAR_T__ == 4
1546 const char16_t
* next
= ucs4_span(from
, __max
, _M_maxcode
, _M_mode
);
1548 return reinterpret_cast<const char*>(next
) - __from
;
// do_max_length for __codecvt_utf16_base<wchar_t>.
// Starts from 2 for 2-byte wchar_t and from 4 in the other definition, then
// adds sizeof(utf16_bom) when consume_header is set in _M_mode.
1552 __codecvt_utf16_base
<wchar_t>::do_max_length() const throw()
1554 #if __SIZEOF_WCHAR_T__ == 2
1555 int max
= 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1557 int max
= 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1559 if (_M_mode
& consume_header
)
1560 max
+= sizeof(utf16_bom
);
1565 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1566 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor of __codecvt_utf8_utf16_base<char16_t>; the body
// is empty.
1568 __codecvt_utf8_utf16_base
<char16_t
>::~__codecvt_utf8_utf16_base() { }
// do_out for __codecvt_utf8_utf16_base<char16_t>.
// Wraps the char16_t input and the char output in range objects, hands them
// to utf16_out with _M_maxcode and _M_mode, and reports the positions of
// both ranges through __from_next and __to_next.
1570 codecvt_base::result
1571 __codecvt_utf8_utf16_base
<char16_t
>::
1572 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1573 const intern_type
*& __from_next
,
1574 extern_type
* __to
, extern_type
* __to_end
,
1575 extern_type
*& __to_next
) const
1577 range
<const char16_t
> from
{ __from
, __from_end
};
1578 range
<char> to
{ __to
, __to_end
};
1579 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1580 __from_next
= from
.next
;
1581 __to_next
= to
.next
;
// do_unshift for __codecvt_utf8_utf16_base<char16_t>.
// Takes the start of the output buffer (__to), an unnamed end pointer and
// the __to_next out-parameter; the state argument is unnamed as well.
1585 codecvt_base::result
1586 __codecvt_utf8_utf16_base
<char16_t
>::
1587 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1588 extern_type
*& __to_next
) const
// do_in for __codecvt_utf8_utf16_base<char16_t>.
// Converts the char input into the char16_t output with utf16_in and
// reports the positions of both ranges through __from_next and __to_next.
1594 codecvt_base::result
1595 __codecvt_utf8_utf16_base
<char16_t
>::
1596 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1597 const extern_type
*& __from_next
,
1598 intern_type
* __to
, intern_type
* __to_end
,
1599 intern_type
*& __to_next
) const
1601 range
<const char> from
{ __from
, __from_end
};
1602 range
<char16_t
> to
{ __to
, __to_end
};
// Only the consume_header/generate_header bits of _M_mode are kept;
// little_endian is OR-ed in when __BYTE_ORDER__ is not big-endian.
1603 codecvt_mode mode
= codecvt_mode(_M_mode
& (consume_header
|generate_header
));
1604 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1605 mode
= codecvt_mode(mode
| little_endian
);
1607 auto res
= utf16_in(from
, to
, _M_maxcode
, mode
);
1608 __from_next
= from
.next
;
1609 __to_next
= to
.next
;
// do_encoding for __codecvt_utf8_utf16_base<char16_t>: always returns 0.
1614 __codecvt_utf8_utf16_base
<char16_t
>::do_encoding() const throw()
1615 { return 0; } // UTF-8 is not a fixed-width encoding
// do_always_noconv for __codecvt_utf8_utf16_base<char16_t>; a const member
// declared with an empty throw() specification.
1618 __codecvt_utf8_utf16_base
<char16_t
>::do_always_noconv() const throw()
// do_length for __codecvt_utf8_utf16_base<char16_t>.
// Reuses the __end parameter to hold the pointer returned by utf16_span
// (called with __max, _M_maxcode and _M_mode) and returns the distance
// from __from to that pointer.
1622 __codecvt_utf8_utf16_base
<char16_t
>::
1623 do_length(state_type
&, const extern_type
* __from
,
1624 const extern_type
* __end
, size_t __max
) const
1626 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1627 return __end
- __from
;
// do_max_length for __codecvt_utf8_utf16_base<char16_t>.
// When consume_header is set in _M_mode, sizeof(utf8_bom) is added to max.
1631 __codecvt_utf8_utf16_base
<char16_t
>::do_max_length() const throw()
1633 // A single character can be 1 or 2 UTF-16 code units,
1634 // requiring up to 4 UTF-8 code units.
1636 if (_M_mode
& consume_header
)
1637 max
+= sizeof(utf8_bom
);
1641 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1642 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor of __codecvt_utf8_utf16_base<char32_t>; the body
// is empty.
1644 __codecvt_utf8_utf16_base
<char32_t
>::~__codecvt_utf8_utf16_base() { }
// do_out for __codecvt_utf8_utf16_base<char32_t>.
// Wraps the char32_t input and the char output in range objects, hands them
// to utf16_out with _M_maxcode and _M_mode, and reports the positions of
// both ranges through __from_next and __to_next.
1646 codecvt_base::result
1647 __codecvt_utf8_utf16_base
<char32_t
>::
1648 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1649 const intern_type
*& __from_next
,
1650 extern_type
* __to
, extern_type
* __to_end
,
1651 extern_type
*& __to_next
) const
1653 range
<const char32_t
> from
{ __from
, __from_end
};
1654 range
<char> to
{ __to
, __to_end
};
1655 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1656 __from_next
= from
.next
;
1657 __to_next
= to
.next
;
// do_unshift for __codecvt_utf8_utf16_base<char32_t>.
// Takes the start of the output buffer (__to), an unnamed end pointer and
// the __to_next out-parameter; the state argument is unnamed as well.
1661 codecvt_base::result
1662 __codecvt_utf8_utf16_base
<char32_t
>::
1663 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1664 extern_type
*& __to_next
) const
// do_in for __codecvt_utf8_utf16_base<char32_t>.
// Converts the char input into the char32_t output with utf16_in and
// reports the positions of both ranges through __from_next and __to_next.
1670 codecvt_base::result
1671 __codecvt_utf8_utf16_base
<char32_t
>::
1672 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1673 const extern_type
*& __from_next
,
1674 intern_type
* __to
, intern_type
* __to_end
,
1675 intern_type
*& __to_next
) const
1677 range
<const char> from
{ __from
, __from_end
};
1678 range
<char32_t
> to
{ __to
, __to_end
};
// Only the consume_header/generate_header bits of _M_mode are kept;
// little_endian is OR-ed in when __BYTE_ORDER__ is not big-endian.
1679 codecvt_mode mode
= codecvt_mode(_M_mode
& (consume_header
|generate_header
));
1680 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1681 mode
= codecvt_mode(mode
| little_endian
);
1683 auto res
= utf16_in(from
, to
, _M_maxcode
, mode
);
1684 __from_next
= from
.next
;
1685 __to_next
= to
.next
;
// do_encoding for __codecvt_utf8_utf16_base<char32_t>: always returns 0.
1690 __codecvt_utf8_utf16_base
<char32_t
>::do_encoding() const throw()
1691 { return 0; } // UTF-8 is not a fixed-width encoding
// do_always_noconv for __codecvt_utf8_utf16_base<char32_t>; a const member
// declared with an empty throw() specification.
1694 __codecvt_utf8_utf16_base
<char32_t
>::do_always_noconv() const throw()
// do_length for __codecvt_utf8_utf16_base<char32_t>.
// Reuses the __end parameter to hold the pointer returned by utf16_span
// (called with __max, _M_maxcode and _M_mode) and returns the distance
// from __from to that pointer.
1698 __codecvt_utf8_utf16_base
<char32_t
>::
1699 do_length(state_type
&, const extern_type
* __from
,
1700 const extern_type
* __end
, size_t __max
) const
1702 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1703 return __end
- __from
;
// do_max_length for __codecvt_utf8_utf16_base<char32_t>.
// When consume_header is set in _M_mode, sizeof(utf8_bom) is added to max.
1707 __codecvt_utf8_utf16_base
<char32_t
>::do_max_length() const throw()
1709 // A single character can be 1 or 2 UTF-16 code units,
1710 // requiring up to 4 UTF-8 code units.
1712 if (_M_mode
& consume_header
)
1713 max
+= sizeof(utf8_bom
);
1717 #ifdef _GLIBCXX_USE_WCHAR_T
1718 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1719 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor of __codecvt_utf8_utf16_base<wchar_t>; the body
// is empty.
1721 __codecvt_utf8_utf16_base
<wchar_t>::~__codecvt_utf8_utf16_base() { }
// do_out for __codecvt_utf8_utf16_base<wchar_t>.
// Unlike the other wchar_t specializations in this file, the input is used
// directly as range<const wchar_t> (no reinterpret_cast) and passed to
// utf16_out together with the char output range.
1723 codecvt_base::result
1724 __codecvt_utf8_utf16_base
<wchar_t>::
1725 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1726 const intern_type
*& __from_next
,
1727 extern_type
* __to
, extern_type
* __to_end
,
1728 extern_type
*& __to_next
) const
1730 range
<const wchar_t> from
{ __from
, __from_end
};
1731 range
<char> to
{ __to
, __to_end
};
1732 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1733 __from_next
= from
.next
;
1734 __to_next
= to
.next
;
// do_unshift for __codecvt_utf8_utf16_base<wchar_t>.
// Takes the start of the output buffer (__to), an unnamed end pointer and
// the __to_next out-parameter; the state argument is unnamed as well.
1738 codecvt_base::result
1739 __codecvt_utf8_utf16_base
<wchar_t>::
1740 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1741 extern_type
*& __to_next
) const
// do_in for __codecvt_utf8_utf16_base<wchar_t>.
// Converts the char input into a range<wchar_t> output with utf16_in and
// reports the positions of both ranges through __from_next and __to_next.
1747 codecvt_base::result
1748 __codecvt_utf8_utf16_base
<wchar_t>::
1749 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1750 const extern_type
*& __from_next
,
1751 intern_type
* __to
, intern_type
* __to_end
,
1752 intern_type
*& __to_next
) const
1754 range
<const char> from
{ __from
, __from_end
};
1755 range
<wchar_t> to
{ __to
, __to_end
};
// Only the consume_header/generate_header bits of _M_mode are kept;
// little_endian is OR-ed in when __BYTE_ORDER__ is not big-endian.
1756 codecvt_mode mode
= codecvt_mode(_M_mode
& (consume_header
|generate_header
));
1757 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1758 mode
= codecvt_mode(mode
| little_endian
);
1760 auto res
= utf16_in(from
, to
, _M_maxcode
, mode
);
1761 __from_next
= from
.next
;
1762 __to_next
= to
.next
;
// do_encoding for __codecvt_utf8_utf16_base<wchar_t>: always returns 0.
1767 __codecvt_utf8_utf16_base
<wchar_t>::do_encoding() const throw()
1768 { return 0; } // UTF-8 is not a fixed-width encoding
// do_always_noconv for __codecvt_utf8_utf16_base<wchar_t>; a const member
// declared with an empty throw() specification.
1771 __codecvt_utf8_utf16_base
<wchar_t>::do_always_noconv() const throw()
// do_length for __codecvt_utf8_utf16_base<wchar_t>.
// Reuses the __end parameter to hold the pointer returned by utf16_span
// (called with __max, _M_maxcode and _M_mode) and returns the distance
// from __from to that pointer.
1775 __codecvt_utf8_utf16_base
<wchar_t>::
1776 do_length(state_type
&, const extern_type
* __from
,
1777 const extern_type
* __end
, size_t __max
) const
1779 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1780 return __end
- __from
;
// do_max_length for __codecvt_utf8_utf16_base<wchar_t>.
// When consume_header is set in _M_mode, sizeof(utf8_bom) is added to max.
1784 __codecvt_utf8_utf16_base
<wchar_t>::do_max_length() const throw()
1786 // A single character can be 1 or 2 UTF-16 code units,
1787 // requiring up to 4 UTF-8 code units.
1789 if (_M_mode
& consume_header
)
1790 max
+= sizeof(utf8_bom
);
// Explicit instantiations for the char16_t/char32_t facets whose external
// type is char: __codecvt_abstract_base and codecvt_byname.
// NOTE(review): the `inline template class` form is presumably the GNU
// explicit-instantiation extension documented in the GCC manual -- confirm
// before changing it to a plain `template class`.
1795 inline template class __codecvt_abstract_base
<char16_t
, char, mbstate_t>;
1796 inline template class __codecvt_abstract_base
<char32_t
, char, mbstate_t>;
1797 template class codecvt_byname
<char16_t
, char, mbstate_t>;
1798 template class codecvt_byname
<char32_t
, char, mbstate_t>;
// The same set of explicit instantiations with char8_t as the external
// type, compiled only when _GLIBCXX_USE_CHAR8_T is defined.
1800 #if defined(_GLIBCXX_USE_CHAR8_T)
1801 inline template class __codecvt_abstract_base
<char16_t
, char8_t
, mbstate_t>;
1802 inline template class __codecvt_abstract_base
<char32_t
, char8_t
, mbstate_t>;
1803 template class codecvt_byname
<char16_t
, char8_t
, mbstate_t>;
1804 template class codecvt_byname
<char32_t
, char8_t
, mbstate_t>;
1807 _GLIBCXX_END_NAMESPACE_VERSION