1 // Locale support (codecvt) -*- C++ -*-
3 // Copyright (C) 2015-2016 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
26 #include <cstring> // std::memcpy, std::memcmp
27 #include <bits/stl_algobase.h> // std::max
29 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
30 namespace std
_GLIBCXX_VISIBILITY(default)
32 _GLIBCXX_BEGIN_NAMESPACE_VERSION
36 // Largest code point that fits in a single UTF-16 code unit.
37 const char32_t max_single_utf16_unit
= 0xFFFF;
39 const char32_t max_code_point
= 0x10FFFF;
41 // The functions below rely on maxcode < incomplete_mb_character
42 // (which is enforced by the codecvt_utf* classes on construction).
43 const char32_t incomplete_mb_character
= char32_t(-2);
44 const char32_t invalid_mb_sequence
= char32_t(-1);
46 template<typename Elem
>
52 Elem
operator*() const { return *next
; }
54 range
& operator++() { ++next
; return *this; }
56 size_t size() const { return end
- next
; }
59 // Multibyte sequences can have "header" consisting of Byte Order Mark
60 const unsigned char utf8_bom
[3] = { 0xEF, 0xBB, 0xBF };
61 const unsigned char utf16_bom
[4] = { 0xFE, 0xFF };
62 const unsigned char utf16le_bom
[4] = { 0xFF, 0xFE };
66 write_bom(range
<char>& to
, const unsigned char (&bom
)[N
])
70 memcpy(to
.next
, bom
, N
);
75 // If generate_header is set in mode write out UTF-8 BOM.
77 write_utf8_bom(range
<char>& to
, codecvt_mode mode
)
79 if (mode
& generate_header
)
80 return write_bom(to
, utf8_bom
);
84 // If generate_header is set in mode write out the UTF-16 BOM indicated
85 // by whether little_endian is set in mode.
87 write_utf16_bom(range
<char16_t
>& to
, codecvt_mode mode
)
89 if (mode
& generate_header
)
93 auto* bom
= (mode
& little_endian
) ? utf16le_bom
: utf16_bom
;
94 std::memcpy(to
.next
, bom
, 2);
102 read_bom(range
<const char>& from
, const unsigned char (&bom
)[N
])
104 if (from
.size() >= N
&& !memcmp(from
.next
, bom
, N
))
112 // If consume_header is set in mode update from.next to after any BOM.
114 read_utf8_bom(range
<const char>& from
, codecvt_mode mode
)
116 if (mode
& consume_header
)
117 read_bom(from
, utf8_bom
);
120 // If consume_header is set in mode update from.next to after any BOM.
121 // Return little_endian iff the UTF-16LE BOM was present.
123 read_utf16_bom(range
<const char16_t
>& from
, codecvt_mode mode
)
125 if (mode
& consume_header
&& from
.size())
127 if (*from
.next
== 0xFEFF)
129 else if (*from
.next
== 0xFFFE)
132 return little_endian
;
138 // Read a codepoint from a UTF-8 multibyte sequence.
139 // Updates from.next if the codepoint is not greater than maxcode.
140 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
142 read_utf8_code_point(range
<const char>& from
, unsigned long maxcode
)
144 const size_t avail
= from
.size();
146 return incomplete_mb_character
;
147 unsigned char c1
= from
.next
[0];
148 // https://en.wikipedia.org/wiki/UTF-8#Sample_code
154 else if (c1
< 0xC2) // continuation or overlong 2-byte sequence
155 return invalid_mb_sequence
;
156 else if (c1
< 0xE0) // 2-byte sequence
159 return incomplete_mb_character
;
160 unsigned char c2
= from
.next
[1];
161 if ((c2
& 0xC0) != 0x80)
162 return invalid_mb_sequence
;
163 char32_t c
= (c1
<< 6) + c2
- 0x3080;
168 else if (c1
< 0xF0) // 3-byte sequence
171 return incomplete_mb_character
;
172 unsigned char c2
= from
.next
[1];
173 if ((c2
& 0xC0) != 0x80)
174 return invalid_mb_sequence
;
175 if (c1
== 0xE0 && c2
< 0xA0) // overlong
176 return invalid_mb_sequence
;
177 unsigned char c3
= from
.next
[2];
178 if ((c3
& 0xC0) != 0x80)
179 return invalid_mb_sequence
;
180 char32_t c
= (c1
<< 12) + (c2
<< 6) + c3
- 0xE2080;
185 else if (c1
< 0xF5) // 4-byte sequence
188 return incomplete_mb_character
;
189 unsigned char c2
= from
.next
[1];
190 if ((c2
& 0xC0) != 0x80)
191 return invalid_mb_sequence
;
192 if (c1
== 0xF0 && c2
< 0x90) // overlong
193 return invalid_mb_sequence
;
194 if (c1
== 0xF4 && c2
>= 0x90) // > U+10FFFF
195 return invalid_mb_sequence
;
196 unsigned char c3
= from
.next
[2];
197 if ((c3
& 0xC0) != 0x80)
198 return invalid_mb_sequence
;
199 unsigned char c4
= from
.next
[3];
200 if ((c4
& 0xC0) != 0x80)
201 return invalid_mb_sequence
;
202 char32_t c
= (c1
<< 18) + (c2
<< 12) + (c3
<< 6) + c4
- 0x3C82080;
208 return invalid_mb_sequence
;
212 write_utf8_code_point(range
<char>& to
, char32_t code_point
)
214 if (code_point
< 0x80)
218 *to
.next
++ = code_point
;
220 else if (code_point
<= 0x7FF)
224 *to
.next
++ = (code_point
>> 6) + 0xC0;
225 *to
.next
++ = (code_point
& 0x3F) + 0x80;
227 else if (code_point
<= 0xFFFF)
231 *to
.next
++ = (code_point
>> 12) + 0xE0;
232 *to
.next
++ = ((code_point
>> 6) & 0x3F) + 0x80;
233 *to
.next
++ = (code_point
& 0x3F) + 0x80;
235 else if (code_point
<= 0x10FFFF)
239 *to
.next
++ = (code_point
>> 18) + 0xF0;
240 *to
.next
++ = ((code_point
>> 12) & 0x3F) + 0x80;
241 *to
.next
++ = ((code_point
>> 6) & 0x3F) + 0x80;
242 *to
.next
++ = (code_point
& 0x3F) + 0x80;
250 adjust_byte_order(char16_t c
, codecvt_mode mode
)
252 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
253 return (mode
& little_endian
) ? __builtin_bswap16(c
) : c
;
255 return (mode
& little_endian
) ? c
: __builtin_bswap16(c
);
259 // Return true if c is a high-surrogate (aka leading) code point.
261 is_high_surrogate(char32_t c
)
263 return c
>= 0xD800 && c
<= 0xDBFF;
266 // Return true if c is a low-surrogate (aka trailing) code point.
268 is_low_surrogate(char32_t c
)
270 return c
>= 0xDC00 && c
<= 0xDFFF;
274 surrogate_pair_to_code_point(char32_t high
, char32_t low
)
276 return (high
<< 10) + low
- 0x35FDC00;
279 // Read a codepoint from a UTF-16 multibyte sequence.
280 // The sequence's endianness is indicated by (mode & little_endian).
281 // Updates from.next if the codepoint is not greater than maxcode.
282 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
284 read_utf16_code_point(range
<const char16_t
>& from
, unsigned long maxcode
,
287 const size_t avail
= from
.size();
289 return incomplete_mb_character
;
291 char32_t c
= adjust_byte_order(from
.next
[0], mode
);
292 if (is_high_surrogate(c
))
295 return incomplete_mb_character
;
296 const char16_t c2
= adjust_byte_order(from
.next
[1], mode
);
297 if (is_low_surrogate(c2
))
299 c
= surrogate_pair_to_code_point(c
, c2
);
303 return invalid_mb_sequence
;
305 else if (is_low_surrogate(c
))
306 return invalid_mb_sequence
;
314 write_utf16_code_point(range
<C
>& to
, char32_t codepoint
, codecvt_mode mode
)
316 static_assert(sizeof(C
) >= 2, "a code unit must be at least 16-bit");
318 if (codepoint
< max_single_utf16_unit
)
322 *to
.next
= adjust_byte_order(codepoint
, mode
);
327 else if (to
.size() > 1)
329 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
330 const char32_t LEAD_OFFSET
= 0xD800 - (0x10000 >> 10);
331 char16_t lead
= LEAD_OFFSET
+ (codepoint
>> 10);
332 char16_t trail
= 0xDC00 + (codepoint
& 0x3FF);
333 to
.next
[0] = adjust_byte_order(lead
, mode
);
334 to
.next
[1] = adjust_byte_order(trail
, mode
);
343 ucs4_in(range
<const char>& from
, range
<char32_t
>& to
,
344 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
346 read_utf8_bom(from
, mode
);
347 while (from
.size() && to
.size())
349 const char32_t codepoint
= read_utf8_code_point(from
, maxcode
);
350 if (codepoint
== incomplete_mb_character
)
351 return codecvt_base::partial
;
352 if (codepoint
> maxcode
)
353 return codecvt_base::error
;
354 *to
.next
++ = codepoint
;
356 return from
.size() ? codecvt_base::partial
: codecvt_base::ok
;
361 ucs4_out(range
<const char32_t
>& from
, range
<char>& to
,
362 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
364 if (!write_utf8_bom(to
, mode
))
365 return codecvt_base::partial
;
368 const char32_t c
= from
.next
[0];
370 return codecvt_base::error
;
371 if (!write_utf8_code_point(to
, c
))
372 return codecvt_base::partial
;
375 return codecvt_base::ok
;
380 ucs4_in(range
<const char16_t
>& from
, range
<char32_t
>& to
,
381 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
383 if (read_utf16_bom(from
, mode
) == little_endian
)
384 mode
= codecvt_mode(mode
& little_endian
);
385 while (from
.size() && to
.size())
387 const char32_t codepoint
= read_utf16_code_point(from
, maxcode
, mode
);
388 if (codepoint
== incomplete_mb_character
)
389 return codecvt_base::partial
;
390 if (codepoint
> maxcode
)
391 return codecvt_base::error
;
392 *to
.next
++ = codepoint
;
394 return from
.size() ? codecvt_base::partial
: codecvt_base::ok
;
399 ucs4_out(range
<const char32_t
>& from
, range
<char16_t
>& to
,
400 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
402 if (!write_utf16_bom(to
, mode
))
403 return codecvt_base::partial
;
406 const char32_t c
= from
.next
[0];
408 return codecvt_base::error
;
409 if (!write_utf16_code_point(to
, c
, mode
))
410 return codecvt_base::partial
;
413 return codecvt_base::ok
;
419 utf16_in(range
<const char>& from
, range
<C
>& to
,
420 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
422 read_utf8_bom(from
, mode
);
423 while (from
.size() && to
.size())
425 const char* const first
= from
.next
;
426 const char32_t codepoint
= read_utf8_code_point(from
, maxcode
);
427 if (codepoint
== incomplete_mb_character
)
428 return codecvt_base::partial
;
429 if (codepoint
> maxcode
)
430 return codecvt_base::error
;
431 if (!write_utf16_code_point(to
, codepoint
, mode
))
434 return codecvt_base::partial
;
437 return codecvt_base::ok
;
443 utf16_out(range
<const C
>& from
, range
<char>& to
,
444 unsigned long maxcode
= max_code_point
, codecvt_mode mode
= {})
446 if (!write_utf8_bom(to
, mode
))
447 return codecvt_base::partial
;
450 char32_t c
= from
.next
[0];
452 if (is_high_surrogate(c
))
455 return codecvt_base::ok
; // stop converting at this point
457 const char32_t c2
= from
.next
[1];
458 if (is_low_surrogate(c2
))
460 c
= surrogate_pair_to_code_point(c
, c2
);
464 return codecvt_base::error
;
466 else if (is_low_surrogate(c
))
467 return codecvt_base::error
;
469 return codecvt_base::error
;
470 if (!write_utf8_code_point(to
, c
))
471 return codecvt_base::partial
;
474 return codecvt_base::ok
;
477 // return pos such that [begin,pos) is valid UTF-16 string no longer than max
479 utf16_span(const char* begin
, const char* end
, size_t max
,
480 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
482 range
<const char> from
{ begin
, end
};
483 read_utf8_bom(from
, mode
);
485 while (count
+1 < max
)
487 char32_t c
= read_utf8_code_point(from
, maxcode
);
490 else if (c
> max_single_utf16_unit
)
494 if (count
+1 == max
) // take one more character if it fits in a single unit
495 read_utf8_code_point(from
, std::max(max_single_utf16_unit
, maxcode
));
501 ucs2_in(range
<const char>& from
, range
<char16_t
>& to
,
502 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
504 return utf16_in(from
, to
, std::max(max_single_utf16_unit
, maxcode
), mode
);
509 ucs2_out(range
<const char16_t
>& from
, range
<char>& to
,
510 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
512 return utf16_out(from
, to
, std::max(max_single_utf16_unit
, maxcode
), mode
);
517 ucs2_out(range
<const char16_t
>& from
, range
<char16_t
>& to
,
518 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
520 if (!write_utf16_bom(to
, mode
))
521 return codecvt_base::partial
;
522 while (from
.size() && to
.size())
524 char16_t c
= from
.next
[0];
525 if (is_high_surrogate(c
))
526 return codecvt_base::error
;
528 return codecvt_base::error
;
529 *to
.next
++ = adjust_byte_order(c
, mode
);
532 return from
.size() == 0 ? codecvt_base::ok
: codecvt_base::partial
;
537 ucs2_in(range
<const char16_t
>& from
, range
<char16_t
>& to
,
538 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
540 if (read_utf16_bom(from
, mode
) == little_endian
)
541 mode
= codecvt_mode(mode
& little_endian
);
542 maxcode
= std::max(max_single_utf16_unit
, maxcode
);
543 while (from
.size() && to
.size())
545 const char32_t c
= read_utf16_code_point(from
, maxcode
, mode
);
546 if (c
== incomplete_mb_character
)
547 return codecvt_base::partial
;
549 return codecvt_base::error
;
552 return from
.size() == 0 ? codecvt_base::ok
: codecvt_base::partial
;
556 ucs2_span(const char16_t
* begin
, const char16_t
* end
, size_t max
,
557 char32_t maxcode
, codecvt_mode mode
)
559 range
<const char16_t
> from
{ begin
, end
};
560 if (read_utf16_bom(from
, mode
) == little_endian
)
561 mode
= codecvt_mode(mode
& little_endian
);
562 maxcode
= std::max(max_single_utf16_unit
, maxcode
);
564 while (max
-- && c
<= maxcode
)
565 c
= read_utf16_code_point(from
, maxcode
, mode
);
570 ucs2_span(const char* begin
, const char* end
, size_t max
,
571 char32_t maxcode
, codecvt_mode mode
)
573 range
<const char> from
{ begin
, end
};
574 read_utf8_bom(from
, mode
);
575 maxcode
= std::max(max_single_utf16_unit
, maxcode
);
577 while (max
-- && c
<= maxcode
)
578 c
= read_utf8_code_point(from
, maxcode
);
582 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
584 ucs4_span(const char* begin
, const char* end
, size_t max
,
585 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
587 range
<const char> from
{ begin
, end
};
588 read_utf8_bom(from
, mode
);
590 while (max
-- && c
<= maxcode
)
591 c
= read_utf8_code_point(from
, maxcode
);
595 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
597 ucs4_span(const char16_t
* begin
, const char16_t
* end
, size_t max
,
598 char32_t maxcode
= max_code_point
, codecvt_mode mode
= {})
600 range
<const char16_t
> from
{ begin
, end
};
601 if (read_utf16_bom(from
, mode
) == little_endian
)
602 mode
= codecvt_mode(mode
& little_endian
);
604 while (max
-- && c
<= maxcode
)
605 c
= read_utf16_code_point(from
, maxcode
, mode
);
610 // Define members of codecvt<char16_t, char, mbstate_t> specialization.
611 // Converts from UTF-8 to UTF-16.
613 locale::id codecvt
<char16_t
, char, mbstate_t>::id
;
615 codecvt
<char16_t
, char, mbstate_t>::~codecvt() { }
618 codecvt
<char16_t
, char, mbstate_t>::
620 const intern_type
* __from
,
621 const intern_type
* __from_end
, const intern_type
*& __from_next
,
622 extern_type
* __to
, extern_type
* __to_end
,
623 extern_type
*& __to_next
) const
625 range
<const char16_t
> from
{ __from
, __from_end
};
626 range
<char> to
{ __to
, __to_end
};
627 auto res
= utf16_out(from
, to
);
628 __from_next
= from
.next
;
634 codecvt
<char16_t
, char, mbstate_t>::
635 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
636 extern_type
*& __to_next
) const
639 return noconv
; // we don't use mbstate_t for the unicode facets
643 codecvt
<char16_t
, char, mbstate_t>::
644 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
645 const extern_type
*& __from_next
,
646 intern_type
* __to
, intern_type
* __to_end
,
647 intern_type
*& __to_next
) const
649 range
<const char> from
{ __from
, __from_end
};
650 range
<char16_t
> to
{ __to
, __to_end
};
651 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
652 codecvt_mode mode
= {};
654 codecvt_mode mode
= little_endian
;
656 auto res
= utf16_in(from
, to
, max_code_point
, mode
);
657 __from_next
= from
.next
;
663 codecvt
<char16_t
, char, mbstate_t>::do_encoding() const throw()
667 codecvt
<char16_t
, char, mbstate_t>::do_always_noconv() const throw()
671 codecvt
<char16_t
, char, mbstate_t>::
672 do_length(state_type
&, const extern_type
* __from
,
673 const extern_type
* __end
, size_t __max
) const
675 __end
= utf16_span(__from
, __end
, __max
);
676 return __end
- __from
;
680 codecvt
<char16_t
, char, mbstate_t>::do_max_length() const throw()
682 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
683 // whereas 4 byte sequences require two 16-bit code units.
687 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
688 // Converts from UTF-8 to UTF-32 (aka UCS-4).
690 locale::id codecvt
<char32_t
, char, mbstate_t>::id
;
692 codecvt
<char32_t
, char, mbstate_t>::~codecvt() { }
695 codecvt
<char32_t
, char, mbstate_t>::
696 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
697 const intern_type
*& __from_next
,
698 extern_type
* __to
, extern_type
* __to_end
,
699 extern_type
*& __to_next
) const
701 range
<const char32_t
> from
{ __from
, __from_end
};
702 range
<char> to
{ __to
, __to_end
};
703 auto res
= ucs4_out(from
, to
);
704 __from_next
= from
.next
;
710 codecvt
<char32_t
, char, mbstate_t>::
711 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
712 extern_type
*& __to_next
) const
719 codecvt
<char32_t
, char, mbstate_t>::
720 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
721 const extern_type
*& __from_next
,
722 intern_type
* __to
, intern_type
* __to_end
,
723 intern_type
*& __to_next
) const
725 range
<const char> from
{ __from
, __from_end
};
726 range
<char32_t
> to
{ __to
, __to_end
};
727 auto res
= ucs4_in(from
, to
);
728 __from_next
= from
.next
;
734 codecvt
<char32_t
, char, mbstate_t>::do_encoding() const throw()
738 codecvt
<char32_t
, char, mbstate_t>::do_always_noconv() const throw()
742 codecvt
<char32_t
, char, mbstate_t>::
743 do_length(state_type
&, const extern_type
* __from
,
744 const extern_type
* __end
, size_t __max
) const
746 __end
= ucs4_span(__from
, __end
, __max
);
747 return __end
- __from
;
751 codecvt
<char32_t
, char, mbstate_t>::do_max_length() const throw()
754 // Define members of codecvt_utf8<char16_t> base class implementation.
755 // Converts from UTF-8 to UCS-2.
757 __codecvt_utf8_base
<char16_t
>::~__codecvt_utf8_base() { }
760 __codecvt_utf8_base
<char16_t
>::
761 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
762 const intern_type
*& __from_next
,
763 extern_type
* __to
, extern_type
* __to_end
,
764 extern_type
*& __to_next
) const
766 range
<const char16_t
> from
{ __from
, __from_end
};
767 range
<char> to
{ __to
, __to_end
};
768 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
769 __from_next
= from
.next
;
775 __codecvt_utf8_base
<char16_t
>::
776 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
777 extern_type
*& __to_next
) const
784 __codecvt_utf8_base
<char16_t
>::
785 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
786 const extern_type
*& __from_next
,
787 intern_type
* __to
, intern_type
* __to_end
,
788 intern_type
*& __to_next
) const
790 range
<const char> from
{ __from
, __from_end
};
791 range
<char16_t
> to
{ __to
, __to_end
};
792 codecvt_mode mode
= codecvt_mode(_M_mode
& (consume_header
|generate_header
));
793 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
794 mode
= codecvt_mode(mode
| little_endian
);
796 auto res
= ucs2_in(from
, to
, _M_maxcode
, mode
);
797 __from_next
= from
.next
;
803 __codecvt_utf8_base
<char16_t
>::do_encoding() const throw()
807 __codecvt_utf8_base
<char16_t
>::do_always_noconv() const throw()
811 __codecvt_utf8_base
<char16_t
>::
812 do_length(state_type
&, const extern_type
* __from
,
813 const extern_type
* __end
, size_t __max
) const
815 __end
= ucs2_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
816 return __end
- __from
;
820 __codecvt_utf8_base
<char16_t
>::do_max_length() const throw()
823 // Define members of codecvt_utf8<char32_t> base class implementation.
824 // Converts from UTF-8 to UTF-32 (aka UCS-4).
826 __codecvt_utf8_base
<char32_t
>::~__codecvt_utf8_base() { }
829 __codecvt_utf8_base
<char32_t
>::
830 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
831 const intern_type
*& __from_next
,
832 extern_type
* __to
, extern_type
* __to_end
,
833 extern_type
*& __to_next
) const
835 range
<const char32_t
> from
{ __from
, __from_end
};
836 range
<char> to
{ __to
, __to_end
};
837 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
838 __from_next
= from
.next
;
844 __codecvt_utf8_base
<char32_t
>::
845 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
846 extern_type
*& __to_next
) const
853 __codecvt_utf8_base
<char32_t
>::
854 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
855 const extern_type
*& __from_next
,
856 intern_type
* __to
, intern_type
* __to_end
,
857 intern_type
*& __to_next
) const
859 range
<const char> from
{ __from
, __from_end
};
860 range
<char32_t
> to
{ __to
, __to_end
};
861 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
862 __from_next
= from
.next
;
868 __codecvt_utf8_base
<char32_t
>::do_encoding() const throw()
872 __codecvt_utf8_base
<char32_t
>::do_always_noconv() const throw()
876 __codecvt_utf8_base
<char32_t
>::
877 do_length(state_type
&, const extern_type
* __from
,
878 const extern_type
* __end
, size_t __max
) const
880 __end
= ucs4_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
881 return __end
- __from
;
885 __codecvt_utf8_base
<char32_t
>::do_max_length() const throw()
888 #ifdef _GLIBCXX_USE_WCHAR_T
889 // Define members of codecvt_utf8<wchar_t> base class implementation.
890 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
892 __codecvt_utf8_base
<wchar_t>::~__codecvt_utf8_base() { }
895 __codecvt_utf8_base
<wchar_t>::
896 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
897 const intern_type
*& __from_next
,
898 extern_type
* __to
, extern_type
* __to_end
,
899 extern_type
*& __to_next
) const
901 range
<char> to
{ __to
, __to_end
};
902 #if __SIZEOF_WCHAR_T__ == 2
903 range
<const char16_t
> from
{
904 reinterpret_cast<const char16_t
*>(__from
),
905 reinterpret_cast<const char16_t
*>(__from_end
)
907 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
908 #elif __SIZEOF_WCHAR_T__ == 4
909 range
<const char32_t
> from
{
910 reinterpret_cast<const char32_t
*>(__from
),
911 reinterpret_cast<const char32_t
*>(__from_end
)
913 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
915 return codecvt_base::error
;
917 __from_next
= reinterpret_cast<const wchar_t*>(from
.next
);
923 __codecvt_utf8_base
<wchar_t>::
924 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
925 extern_type
*& __to_next
) const
932 __codecvt_utf8_base
<wchar_t>::
933 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
934 const extern_type
*& __from_next
,
935 intern_type
* __to
, intern_type
* __to_end
,
936 intern_type
*& __to_next
) const
938 range
<const char> from
{ __from
, __from_end
};
939 #if __SIZEOF_WCHAR_T__ == 2
941 reinterpret_cast<char16_t
*>(__to
),
942 reinterpret_cast<char16_t
*>(__to_end
)
944 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
945 #elif __SIZEOF_WCHAR_T__ == 4
947 reinterpret_cast<char32_t
*>(__to
),
948 reinterpret_cast<char32_t
*>(__to_end
)
950 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
952 return codecvt_base::error
;
954 __from_next
= from
.next
;
955 __to_next
= reinterpret_cast<wchar_t*>(to
.next
);
960 __codecvt_utf8_base
<wchar_t>::do_encoding() const throw()
964 __codecvt_utf8_base
<wchar_t>::do_always_noconv() const throw()
968 __codecvt_utf8_base
<wchar_t>::
969 do_length(state_type
&, const extern_type
* __from
,
970 const extern_type
* __end
, size_t __max
) const
972 #if __SIZEOF_WCHAR_T__ == 2
973 __end
= ucs2_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
974 #elif __SIZEOF_WCHAR_T__ == 4
975 __end
= ucs4_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
979 return __end
- __from
;
983 __codecvt_utf8_base
<wchar_t>::do_max_length() const throw()
987 // Define members of codecvt_utf16<char16_t> base class implementation.
988 // Converts from UTF-16 to UCS-2.
990 __codecvt_utf16_base
<char16_t
>::~__codecvt_utf16_base() { }
993 __codecvt_utf16_base
<char16_t
>::
994 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
995 const intern_type
*& __from_next
,
996 extern_type
* __to
, extern_type
* __to_end
,
997 extern_type
*& __to_next
) const
999 range
<const char16_t
> from
{ __from
, __from_end
};
1001 reinterpret_cast<char16_t
*>(__to
),
1002 reinterpret_cast<char16_t
*>(__to_end
)
1004 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1005 __from_next
= from
.next
;
1006 __to_next
= reinterpret_cast<char*>(to
.next
);
1010 codecvt_base::result
1011 __codecvt_utf16_base
<char16_t
>::
1012 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1013 extern_type
*& __to_next
) const
1019 codecvt_base::result
1020 __codecvt_utf16_base
<char16_t
>::
1021 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1022 const extern_type
*& __from_next
,
1023 intern_type
* __to
, intern_type
* __to_end
,
1024 intern_type
*& __to_next
) const
1026 range
<const char16_t
> from
{
1027 reinterpret_cast<const char16_t
*>(__from
),
1028 reinterpret_cast<const char16_t
*>(__from_end
)
1030 range
<char16_t
> to
{ __to
, __to_end
};
1031 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
1032 __from_next
= reinterpret_cast<const char*>(from
.next
);
1033 __to_next
= to
.next
;
1038 __codecvt_utf16_base
<char16_t
>::do_encoding() const throw()
1042 __codecvt_utf16_base
<char16_t
>::do_always_noconv() const throw()
1046 __codecvt_utf16_base
<char16_t
>::
1047 do_length(state_type
&, const extern_type
* __from
,
1048 const extern_type
* __end
, size_t __max
) const
1050 auto next
= reinterpret_cast<const char16_t
*>(__from
);
1051 next
= ucs2_span(next
, reinterpret_cast<const char16_t
*>(__end
), __max
,
1052 _M_maxcode
, _M_mode
);
1053 return reinterpret_cast<const char*>(next
) - __from
;
1057 __codecvt_utf16_base
<char16_t
>::do_max_length() const throw()
1060 // Define members of codecvt_utf16<char32_t> base class implementation.
1061 // Converts from UTF-16 to UTF-32 (aka UCS-4).
1063 __codecvt_utf16_base
<char32_t
>::~__codecvt_utf16_base() { }
1065 codecvt_base::result
1066 __codecvt_utf16_base
<char32_t
>::
1067 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1068 const intern_type
*& __from_next
,
1069 extern_type
* __to
, extern_type
* __to_end
,
1070 extern_type
*& __to_next
) const
1072 range
<const char32_t
> from
{ __from
, __from_end
};
1074 reinterpret_cast<char16_t
*>(__to
),
1075 reinterpret_cast<char16_t
*>(__to_end
)
1077 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1078 __from_next
= from
.next
;
1079 __to_next
= reinterpret_cast<char*>(to
.next
);
1083 codecvt_base::result
1084 __codecvt_utf16_base
<char32_t
>::
1085 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1086 extern_type
*& __to_next
) const
1092 codecvt_base::result
1093 __codecvt_utf16_base
<char32_t
>::
1094 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1095 const extern_type
*& __from_next
,
1096 intern_type
* __to
, intern_type
* __to_end
,
1097 intern_type
*& __to_next
) const
1099 range
<const char16_t
> from
{
1100 reinterpret_cast<const char16_t
*>(__from
),
1101 reinterpret_cast<const char16_t
*>(__from_end
)
1103 range
<char32_t
> to
{ __to
, __to_end
};
1104 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
1105 __from_next
= reinterpret_cast<const char*>(from
.next
);
1106 __to_next
= to
.next
;
1111 __codecvt_utf16_base
<char32_t
>::do_encoding() const throw()
1115 __codecvt_utf16_base
<char32_t
>::do_always_noconv() const throw()
1119 __codecvt_utf16_base
<char32_t
>::
1120 do_length(state_type
&, const extern_type
* __from
,
1121 const extern_type
* __end
, size_t __max
) const
1123 auto next
= reinterpret_cast<const char16_t
*>(__from
);
1124 next
= ucs4_span(next
, reinterpret_cast<const char16_t
*>(__end
), __max
,
1125 _M_maxcode
, _M_mode
);
1126 return reinterpret_cast<const char*>(next
) - __from
;
1130 __codecvt_utf16_base
<char32_t
>::do_max_length() const throw()
1133 #ifdef _GLIBCXX_USE_WCHAR_T
1134 // Define members of codecvt_utf16<wchar_t> base class implementation.
1135 // Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1137 __codecvt_utf16_base
<wchar_t>::~__codecvt_utf16_base() { }
1139 codecvt_base::result
1140 __codecvt_utf16_base
<wchar_t>::
1141 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1142 const intern_type
*& __from_next
,
1143 extern_type
* __to
, extern_type
* __to_end
,
1144 extern_type
*& __to_next
) const
1146 range
<char> to
{ __to
, __to_end
};
1147 #if __SIZEOF_WCHAR_T__ == 2
1148 range
<const char16_t
> from
{
1149 reinterpret_cast<const char16_t
*>(__from
),
1150 reinterpret_cast<const char16_t
*>(__from_end
)
1152 auto res
= ucs2_out(from
, to
, _M_maxcode
, _M_mode
);
1153 #elif __SIZEOF_WCHAR_T__ == 4
1154 range
<const char32_t
> from
{
1155 reinterpret_cast<const char32_t
*>(__from
),
1156 reinterpret_cast<const char32_t
*>(__from_end
)
1158 auto res
= ucs4_out(from
, to
, _M_maxcode
, _M_mode
);
1160 return codecvt_base::error
;
1162 __from_next
= reinterpret_cast<const wchar_t*>(from
.next
);
1163 __to_next
= to
.next
;
1167 codecvt_base::result
1168 __codecvt_utf16_base
<wchar_t>::
1169 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1170 extern_type
*& __to_next
) const
1176 codecvt_base::result
1177 __codecvt_utf16_base
<wchar_t>::
1178 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1179 const extern_type
*& __from_next
,
1180 intern_type
* __to
, intern_type
* __to_end
,
1181 intern_type
*& __to_next
) const
1183 range
<const char> from
{ __from
, __from_end
};
1184 #if __SIZEOF_WCHAR_T__ == 2
1186 reinterpret_cast<char16_t
*>(__to
),
1187 reinterpret_cast<char16_t
*>(__to_end
)
1189 auto res
= ucs2_in(from
, to
, _M_maxcode
, _M_mode
);
1190 #elif __SIZEOF_WCHAR_T__ == 4
1192 reinterpret_cast<char32_t
*>(__to
),
1193 reinterpret_cast<char32_t
*>(__to_end
)
1195 auto res
= ucs4_in(from
, to
, _M_maxcode
, _M_mode
);
1197 return codecvt_base::error
;
1199 __from_next
= from
.next
;
1200 __to_next
= reinterpret_cast<wchar_t*>(to
.next
);
1205 __codecvt_utf16_base
<wchar_t>::do_encoding() const throw()
1209 __codecvt_utf16_base
<wchar_t>::do_always_noconv() const throw()
1213 __codecvt_utf16_base
<wchar_t>::
1214 do_length(state_type
&, const extern_type
* __from
,
1215 const extern_type
* __end
, size_t __max
) const
1217 auto next
= reinterpret_cast<const char16_t
*>(__from
);
1218 #if __SIZEOF_WCHAR_T__ == 2
1219 next
= ucs2_span(next
, reinterpret_cast<const char16_t
*>(__end
), __max
,
1220 _M_maxcode
, _M_mode
);
1221 #elif __SIZEOF_WCHAR_T__ == 4
1222 next
= ucs4_span(next
, reinterpret_cast<const char16_t
*>(__end
), __max
,
1223 _M_maxcode
, _M_mode
);
1225 return reinterpret_cast<const char*>(next
) - __from
;
1229 __codecvt_utf16_base
<wchar_t>::do_max_length() const throw()
1233 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1234 // Converts from UTF-8 to UTF-16.
1236 __codecvt_utf8_utf16_base
<char16_t
>::~__codecvt_utf8_utf16_base() { }
// __codecvt_utf8_utf16_base<char16_t>::do_out
// Wraps the internal input [__from, __from_end) as range<const char16_t>
// and the external output [__to, __to_end) as range<char>, calls utf16_out
// with _M_maxcode and _M_mode, then reports how far each range advanced
// through __from_next and __to_next.
// NOTE(review): braces and the final return are not present in this
// listing; `res` is presumably what gets returned -- confirm upstream.
1238 codecvt_base::result
1239 __codecvt_utf8_utf16_base
<char16_t
>::
1240 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1241 const intern_type
*& __from_next
,
1242 extern_type
* __to
, extern_type
* __to_end
,
1243 extern_type
*& __to_next
) const
1245 range
<const char16_t
> from
{ __from
, __from_end
};
1246 range
<char> to
{ __to
, __to_end
};
1247 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1248 __from_next
= from
.next
;
1249 __to_next
= to
.next
;
// __codecvt_utf8_utf16_base<char16_t>::do_unshift
// Signature only: the state argument and the second extern_type* argument
// are unnamed, so only __to and __to_next can be used by the body.
// NOTE(review): the body is not present in this listing.
1253 codecvt_base::result
1254 __codecvt_utf8_utf16_base
<char16_t
>::
1255 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1256 extern_type
*& __to_next
) const
// __codecvt_utf8_utf16_base<char16_t>::do_in
// Wraps the external input [__from, __from_end) as range<const char> and
// the internal output [__to, __to_end) as range<char16_t>, then calls
// utf16_in with _M_maxcode and a locally computed mode.  The positions
// reached are reported through __from_next and __to_next.
// NOTE(review): braces, the closing #endif and the final return are not
// present in this listing.
1262 codecvt_base::result
1263 __codecvt_utf8_utf16_base
<char16_t
>::
1264 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1265 const extern_type
*& __from_next
,
1266 intern_type
* __to
, intern_type
* __to_end
,
1267 intern_type
*& __to_next
) const
1269 range
<const char> from
{ __from
, __from_end
};
1270 range
<char16_t
> to
{ __to
, __to_end
};
// Keep only the header-related bits of _M_mode; any little_endian bit the
// user supplied is dropped here.
1271 codecvt_mode mode
= codecvt_mode(_M_mode
& (consume_header
|generate_header
));
// On hosts that are not big-endian, little_endian is set explicitly --
// presumably so the char16_t units come out in host byte order; confirm
// against utf16_in.
1272 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1273 mode
= codecvt_mode(mode
| little_endian
);
1275 auto res
= utf16_in(from
, to
, _M_maxcode
, mode
);
1276 __from_next
= from
.next
;
1277 __to_next
= to
.next
;
// __codecvt_utf8_utf16_base<char16_t>::do_encoding -- a const member
// declared with an empty exception specification (throw()).
// NOTE(review): the return type and body are not present in this listing.
1282 __codecvt_utf8_utf16_base
<char16_t
>::do_encoding() const throw()
// __codecvt_utf8_utf16_base<char16_t>::do_always_noconv -- a const member
// declared with an empty exception specification (throw()).
// NOTE(review): the return type and body are not present in this listing.
1286 __codecvt_utf8_utf16_base
<char16_t
>::do_always_noconv() const throw()
// __codecvt_utf8_utf16_base<char16_t>::do_length
// Calls utf16_span over [__from, __end) with __max, _M_maxcode and _M_mode,
// stores the returned position back into the __end parameter, and returns
// the number of chars between __from and that position.
// NOTE(review): the return-type line and braces are not present in this
// listing.
1290 __codecvt_utf8_utf16_base
<char16_t
>::
1291 do_length(state_type
&, const extern_type
* __from
,
1292 const extern_type
* __end
, size_t __max
) const
1294 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1295 return __end
- __from
;
// __codecvt_utf8_utf16_base<char16_t>::do_max_length -- a const member
// declared with an empty exception specification (throw()).  The retained
// comment below gives the rationale relating UTF-8 sequence lengths to
// 16-bit code units.
// NOTE(review): the return type and the return statement are not present
// in this listing.
1299 __codecvt_utf8_utf16_base
<char16_t
>::do_max_length() const throw()
1301 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
1302 // whereas 4 byte sequences require two 16-bit code units.
1306 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1307 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor of __codecvt_utf8_utf16_base<char32_t>; the body
// is empty.
1309 __codecvt_utf8_utf16_base
<char32_t
>::~__codecvt_utf8_utf16_base() { }
// __codecvt_utf8_utf16_base<char32_t>::do_out
// Wraps the internal input [__from, __from_end) as range<const char32_t>
// and the external output [__to, __to_end) as range<char>, calls utf16_out
// with _M_maxcode and _M_mode, then reports how far each range advanced
// through __from_next and __to_next.
// NOTE(review): braces and the final return are not present in this
// listing; `res` is presumably what gets returned -- confirm upstream.
1311 codecvt_base::result
1312 __codecvt_utf8_utf16_base
<char32_t
>::
1313 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1314 const intern_type
*& __from_next
,
1315 extern_type
* __to
, extern_type
* __to_end
,
1316 extern_type
*& __to_next
) const
1318 range
<const char32_t
> from
{ __from
, __from_end
};
1319 range
<char> to
{ __to
, __to_end
};
1320 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1321 __from_next
= from
.next
;
1322 __to_next
= to
.next
;
// __codecvt_utf8_utf16_base<char32_t>::do_unshift
// Signature only: the state argument and the second extern_type* argument
// are unnamed, so only __to and __to_next can be used by the body.
// NOTE(review): the body is not present in this listing.
1326 codecvt_base::result
1327 __codecvt_utf8_utf16_base
<char32_t
>::
1328 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1329 extern_type
*& __to_next
) const
1335 codecvt_base::result
1336 __codecvt_utf8_utf16_base
<char32_t
>::
1337 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1338 const extern_type
*& __from_next
,
1339 intern_type
* __to
, intern_type
* __to_end
,
1340 intern_type
*& __to_next
) const
1342 range
<const char> from
{ __from
, __from_end
};
1343 range
<char32_t
> to
{ __to
, __to_end
};
1344 auto res
= utf16_in(from
, to
, _M_maxcode
, _M_mode
);
1345 __from_next
= from
.next
;
1346 __to_next
= to
.next
;
// __codecvt_utf8_utf16_base<char32_t>::do_encoding -- a const member
// declared with an empty exception specification (throw()).
// NOTE(review): the return type and body are not present in this listing.
1351 __codecvt_utf8_utf16_base
<char32_t
>::do_encoding() const throw()
// __codecvt_utf8_utf16_base<char32_t>::do_always_noconv -- a const member
// declared with an empty exception specification (throw()).
// NOTE(review): the return type and body are not present in this listing.
1355 __codecvt_utf8_utf16_base
<char32_t
>::do_always_noconv() const throw()
// __codecvt_utf8_utf16_base<char32_t>::do_length
// Calls utf16_span over [__from, __end) with __max, _M_maxcode and _M_mode,
// stores the returned position back into the __end parameter, and returns
// the number of chars between __from and that position.
// NOTE(review): the return-type line and braces are not present in this
// listing.
1359 __codecvt_utf8_utf16_base
<char32_t
>::
1360 do_length(state_type
&, const extern_type
* __from
,
1361 const extern_type
* __end
, size_t __max
) const
1363 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1364 return __end
- __from
;
// __codecvt_utf8_utf16_base<char32_t>::do_max_length -- a const member
// declared with an empty exception specification (throw()).  The retained
// comment below gives the rationale relating UTF-8 sequence lengths to
// 16-bit code units.
// NOTE(review): the return type and the return statement are not present
// in this listing.
1368 __codecvt_utf8_utf16_base
<char32_t
>::do_max_length() const throw()
1370 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
1371 // whereas 4 byte sequences require two 16-bit code units.
1375 #ifdef _GLIBCXX_USE_WCHAR_T
1376 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1377 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor of __codecvt_utf8_utf16_base<wchar_t>; the body
// is empty.
1379 __codecvt_utf8_utf16_base
<wchar_t>::~__codecvt_utf8_utf16_base() { }
// __codecvt_utf8_utf16_base<wchar_t>::do_out
// Wraps the internal input [__from, __from_end) as range<const wchar_t>
// and the external output [__to, __to_end) as range<char>, calls utf16_out
// with _M_maxcode and _M_mode, then reports how far each range advanced
// through __from_next and __to_next.
// NOTE(review): braces and the final return are not present in this
// listing; `res` is presumably what gets returned -- confirm upstream.
1381 codecvt_base::result
1382 __codecvt_utf8_utf16_base
<wchar_t>::
1383 do_out(state_type
&, const intern_type
* __from
, const intern_type
* __from_end
,
1384 const intern_type
*& __from_next
,
1385 extern_type
* __to
, extern_type
* __to_end
,
1386 extern_type
*& __to_next
) const
1388 range
<const wchar_t> from
{ __from
, __from_end
};
1389 range
<char> to
{ __to
, __to_end
};
1390 auto res
= utf16_out(from
, to
, _M_maxcode
, _M_mode
);
1391 __from_next
= from
.next
;
1392 __to_next
= to
.next
;
// __codecvt_utf8_utf16_base<wchar_t>::do_unshift
// Signature only: the state argument and the second extern_type* argument
// are unnamed, so only __to and __to_next can be used by the body.
// NOTE(review): the body is not present in this listing.
1396 codecvt_base::result
1397 __codecvt_utf8_utf16_base
<wchar_t>::
1398 do_unshift(state_type
&, extern_type
* __to
, extern_type
*,
1399 extern_type
*& __to_next
) const
1405 codecvt_base::result
1406 __codecvt_utf8_utf16_base
<wchar_t>::
1407 do_in(state_type
&, const extern_type
* __from
, const extern_type
* __from_end
,
1408 const extern_type
*& __from_next
,
1409 intern_type
* __to
, intern_type
* __to_end
,
1410 intern_type
*& __to_next
) const
1412 range
<const char> from
{ __from
, __from_end
};
1413 range
<wchar_t> to
{ __to
, __to_end
};
1414 auto res
= utf16_in(from
, to
, _M_maxcode
, _M_mode
);
1415 __from_next
= from
.next
;
1416 __to_next
= to
.next
;
// __codecvt_utf8_utf16_base<wchar_t>::do_encoding -- a const member
// declared with an empty exception specification (throw()).
// NOTE(review): the return type and body are not present in this listing.
1421 __codecvt_utf8_utf16_base
<wchar_t>::do_encoding() const throw()
// __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv -- a const member
// declared with an empty exception specification (throw()).
// NOTE(review): the return type and body are not present in this listing.
1425 __codecvt_utf8_utf16_base
<wchar_t>::do_always_noconv() const throw()
// __codecvt_utf8_utf16_base<wchar_t>::do_length
// Calls utf16_span over [__from, __end) with __max, _M_maxcode and _M_mode,
// stores the returned position back into the __end parameter, and returns
// the number of chars between __from and that position.
// NOTE(review): the return-type line and braces are not present in this
// listing.
1429 __codecvt_utf8_utf16_base
<wchar_t>::
1430 do_length(state_type
&, const extern_type
* __from
,
1431 const extern_type
* __end
, size_t __max
) const
1433 __end
= utf16_span(__from
, __end
, __max
, _M_maxcode
, _M_mode
);
1434 return __end
- __from
;
// __codecvt_utf8_utf16_base<wchar_t>::do_max_length -- a const member
// declared with an empty exception specification (throw()).  The retained
// comment below gives the rationale relating UTF-8 sequence lengths to
// 16-bit code units.
// NOTE(review): the return type and the return statement are not present
// in this listing.
1438 __codecvt_utf8_utf16_base
<wchar_t>::do_max_length() const throw()
1440 // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
1441 // whereas 4 byte sequences require two 16-bit code units.
// Explicit instantiations for the char16_t and char32_t conversion facets
// (external type char, state type mbstate_t).
// The first two use the GNU `inline template` extension, which according
// to the GCC manual instantiates only the compiler-generated support data
// for the class (such as its vtable) and none of its members.
1446 inline template class __codecvt_abstract_base
<char16_t
, char, mbstate_t>;
1447 inline template class __codecvt_abstract_base
<char32_t
, char, mbstate_t>;
// Standard explicit instantiation definitions of codecvt_byname.
1448 template class codecvt_byname
<char16_t
, char, mbstate_t>;
1449 template class codecvt_byname
<char32_t
, char, mbstate_t>;
1451 _GLIBCXX_END_NAMESPACE_VERSION
1453 #endif // _GLIBCXX_USE_C99_STDINT_TR1