Daily bump.
[official-gcc.git] / libstdc++-v3 / src / c++11 / codecvt.cc
blob3e45159ea079f5a7b1c7ae23e0d1f84ae6ba65aa
1 // Locale support (codecvt) -*- C++ -*-
3 // Copyright (C) 2015-2024 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
25 #include <codecvt>
26 #include <cstring> // std::memcpy, std::memcmp
27 #include <bits/stl_algobase.h> // std::min
29 namespace std _GLIBCXX_VISIBILITY(default)
31 _GLIBCXX_BEGIN_NAMESPACE_VERSION
// The standard doesn't define bitwise operators for codecvt_mode, which is
// annoying, so this file provides the ones it needs.  to_integer returns
// the value of a mode as its underlying integer type.
static underlying_type<codecvt_mode>::type
to_integer(codecvt_mode m)
{
  using int_type = underlying_type<codecvt_mode>::type;
  return static_cast<int_type>(m);
}
38 static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
39 { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
41 static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
42 { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
44 static codecvt_mode operator~(codecvt_mode m)
45 { return codecvt_mode(~to_integer(m)); }
47 namespace
// Largest code point that fits in a single UTF-16 code unit.
const char32_t max_single_utf16_unit = 0xFFFF;

// Default upper bound on code points accepted by the conversions below.
const char32_t max_code_point = 0x10FFFF;

// Sentinel values returned by the read_*_code_point functions.
// The functions below rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction).
const char32_t incomplete_mb_character = char32_t(-2);
const char32_t invalid_mb_sequence = char32_t(-1);
// Utility type for reading and writing code units of type Elem from
// a range defined by a pair of pointers.
template<typename Elem, bool Aligned = true>
struct range
{
  Elem* next;  // Current position.
  Elem* end;   // One past the last code unit.

  // Write a code unit and advance past it.
  range& operator=(Elem e)
  {
    *next = e;
    ++next;
    return *this;
  }

  // Read the next code unit.
  Elem operator*() const { return *next; }

  // Read the Nth code unit.
  Elem operator[](size_t n) const { return next[n]; }

  // Move to the next code unit.
  range& operator++()
  {
    ++next;
    return *this;
  }

  // Move to the Nth code unit.
  range& operator+=(size_t n)
  {
    next += n;
    return *this;
  }

  // The number of code units remaining.
  size_t size() const { return end - next; }

  // The number of bytes remaining.
  size_t nbytes() const
  {
    return reinterpret_cast<const char*>(end)
      - reinterpret_cast<const char*>(next);
  }
};
101 // This specialization is used when accessing char16_t values through
102 // pointers to char, which might not be correctly aligned for char16_t.
103 template<typename Elem>
104 struct range<Elem, false>
106 using value_type = typename remove_const<Elem>::type;
108 using char_pointer = typename
109 conditional<is_const<Elem>::value, const char*, char*>::type;
111 char_pointer next;
112 char_pointer end;
114 // Write a code unit.
115 range& operator=(Elem e)
117 memcpy(next, &e, sizeof(Elem));
118 ++*this;
119 return *this;
122 // Read the next code unit.
123 Elem operator*() const
125 value_type e;
126 memcpy(&e, next, sizeof(Elem));
127 return e;
130 // Read the Nth code unit.
131 Elem operator[](size_t n) const
133 value_type e;
134 memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
135 return e;
138 // Move to the next code unit.
139 range& operator++()
141 next += sizeof(Elem);
142 return *this;
145 // Move to the Nth code unit.
146 range& operator+=(size_t n)
148 next += n * sizeof(Elem);
149 return *this;
152 // The number of code units remaining.
153 size_t size() const { return nbytes() / sizeof(Elem); }
155 // The number of bytes remaining.
156 size_t nbytes() const { return end - next; }
// Multibyte sequences can start with a "header" consisting of a
// Byte Order Mark (BOM).
const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
164 // Write a BOM (space permitting).
165 template<typename C, bool A, size_t N>
166 bool
167 write_bom(range<C, A>& to, const unsigned char (&bom)[N])
169 static_assert( (N / sizeof(C)) != 0, "" );
170 static_assert( (N % sizeof(C)) == 0, "" );
172 if (to.nbytes() < N)
173 return false;
174 memcpy(to.next, bom, N);
175 to += (N / sizeof(C));
176 return true;
179 // Try to read a BOM.
180 template<typename C, bool A, size_t N>
181 bool
182 read_bom(range<C, A>& from, const unsigned char (&bom)[N])
184 static_assert( (N / sizeof(C)) != 0, "" );
185 static_assert( (N % sizeof(C)) == 0, "" );
187 if (from.nbytes() >= N && !memcmp(from.next, bom, N))
189 from += (N / sizeof(C));
190 return true;
192 return false;
195 // If generate_header is set in mode write out UTF-8 BOM.
196 template<typename C>
197 bool
198 write_utf8_bom(range<C>& to, codecvt_mode mode)
200 if (mode & generate_header)
201 return write_bom(to, utf8_bom);
202 return true;
205 // If generate_header is set in mode write out the UTF-16 BOM indicated
206 // by whether little_endian is set in mode.
207 template<bool Aligned>
208 bool
209 write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
211 if (mode & generate_header)
213 if (mode & little_endian)
214 return write_bom(to, utf16le_bom);
215 else
216 return write_bom(to, utf16_bom);
218 return true;
221 // If consume_header is set in mode update from.next to after any BOM.
222 template<typename C>
223 void
224 read_utf8_bom(range<const C>& from, codecvt_mode mode)
226 if (mode & consume_header)
227 read_bom(from, utf8_bom);
230 // If consume_header is not set in mode, no effects.
231 // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
232 // - if the UTF-16BE BOM was found unset little_endian in mode, or
233 // - if the UTF-16LE BOM was found set little_endian in mode.
234 template<bool Aligned>
235 void
236 read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
238 if (mode & consume_header)
240 if (read_bom(from, utf16_bom))
241 mode &= ~little_endian;
242 else if (read_bom(from, utf16le_bom))
243 mode |= little_endian;
247 // Read a codepoint from a UTF-8 multibyte sequence.
248 // Updates from.next if the codepoint is not greater than maxcode.
249 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
250 template<typename C>
251 char32_t
252 read_utf8_code_point(range<const C>& from, unsigned long maxcode)
254 const size_t avail = from.size();
255 if (avail == 0)
256 return incomplete_mb_character;
257 char32_t c1 = (unsigned char) from[0];
258 // https://en.wikipedia.org/wiki/UTF-8#Sample_code
259 if (c1 < 0x80) [[likely]]
261 ++from;
262 return c1;
264 else if (c1 < 0xC2) [[unlikely]] // continuation or overlong 2-byte sequence
265 return invalid_mb_sequence;
266 else if (c1 < 0xE0) // 2-byte sequence
268 if (avail < 2) [[unlikely]]
269 return incomplete_mb_character;
270 char32_t c2 = (unsigned char) from[1];
271 if ((c2 & 0xC0) != 0x80) [[unlikely]]
272 return invalid_mb_sequence;
273 char32_t c = (c1 << 6) + c2 - 0x3080;
274 if (c <= maxcode)
275 from += 2;
276 return c;
278 else if (c1 < 0xF0) // 3-byte sequence
280 if (avail < 2) [[unlikely]]
281 return incomplete_mb_character;
282 char32_t c2 = (unsigned char) from[1];
283 if ((c2 & 0xC0) != 0x80) [[unlikely]]
284 return invalid_mb_sequence;
285 if (c1 == 0xE0 && c2 < 0xA0) [[unlikely]] // overlong
286 return invalid_mb_sequence;
287 if (c1 == 0xED && c2 >= 0xA0) [[unlikely]] // surrogate
288 return invalid_mb_sequence;
289 if (avail < 3) [[unlikely]]
290 return incomplete_mb_character;
291 char32_t c3 = (unsigned char) from[2];
292 if ((c3 & 0xC0) != 0x80) [[unlikely]]
293 return invalid_mb_sequence;
294 char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
295 if (c <= maxcode)
296 from += 3;
297 return c;
299 else if (c1 < 0xF5 && maxcode > 0xFFFF) // 4-byte sequence
301 if (avail < 2) [[unlikely]]
302 return incomplete_mb_character;
303 char32_t c2 = (unsigned char) from[1];
304 if ((c2 & 0xC0) != 0x80) [[unlikely]]
305 return invalid_mb_sequence;
306 if (c1 == 0xF0 && c2 < 0x90) [[unlikely]] // overlong
307 return invalid_mb_sequence;
308 if (c1 == 0xF4 && c2 >= 0x90) [[unlikely]] // > U+10FFFF
309 return invalid_mb_sequence;
310 if (avail < 3) [[unlikely]]
311 return incomplete_mb_character;
312 char32_t c3 = (unsigned char) from[2];
313 if ((c3 & 0xC0) != 0x80) [[unlikely]]
314 return invalid_mb_sequence;
315 if (avail < 4) [[unlikely]]
316 return incomplete_mb_character;
317 char32_t c4 = (unsigned char) from[3];
318 if ((c4 & 0xC0) != 0x80) [[unlikely]]
319 return invalid_mb_sequence;
320 char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
321 if (c <= maxcode)
322 from += 4;
323 return c;
325 else [[unlikely]] // > U+10FFFF
326 return invalid_mb_sequence;
329 template<typename C>
330 bool
331 write_utf8_code_point(range<C>& to, char32_t code_point)
333 if (code_point < 0x80)
335 if (to.size() < 1) [[unlikely]]
336 return false;
337 to = code_point;
339 else if (code_point <= 0x7FF)
341 if (to.size() < 2) [[unlikely]]
342 return false;
343 to = (code_point >> 6) + 0xC0;
344 to = (code_point & 0x3F) + 0x80;
346 else if (code_point <= 0xFFFF)
348 if (to.size() < 3) [[unlikely]]
349 return false;
350 to = (code_point >> 12) + 0xE0;
351 to = ((code_point >> 6) & 0x3F) + 0x80;
352 to = (code_point & 0x3F) + 0x80;
354 else if (code_point <= 0x10FFFF)
356 if (to.size() < 4) [[unlikely]]
357 return false;
358 to = (code_point >> 18) + 0xF0;
359 to = ((code_point >> 12) & 0x3F) + 0x80;
360 to = ((code_point >> 6) & 0x3F) + 0x80;
361 to = (code_point & 0x3F) + 0x80;
363 else [[unlikely]]
364 return false;
365 return true;
// Convert c between the host byte order and the byte order selected by
// (mode & little_endian): the bytes are swapped only when the two differ.
inline char16_t
adjust_byte_order(char16_t c, codecvt_mode mode)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return (mode & little_endian) ? __builtin_bswap16(c) : c;
#else
  return (mode & little_endian) ? c : __builtin_bswap16(c);
#endif
}
// Return true if c is a high-surrogate (aka leading) code point,
// i.e. in the range [0xD800, 0xDBFF].
inline bool
is_high_surrogate(char32_t c)
{
  return c >= 0xD800 && c <= 0xDBFF;
}
// Return true if c is a low-surrogate (aka trailing) code point,
// i.e. in the range [0xDC00, 0xDFFF].
inline bool
is_low_surrogate(char32_t c)
{
  return c >= 0xDC00 && c <= 0xDFFF;
}
// Combine a high and a low surrogate into the code point they encode.
// The constant is (0xD800 << 10) + 0xDC00 - 0x10000, so the pair
// (0xD800, 0xDC00) yields 0x10000.  The arguments are not validated.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  return (high << 10) + low - 0x35FDC00;
}
398 // Read a codepoint from a UTF-16 multibyte sequence.
399 // The sequence's endianness is indicated by (mode & little_endian).
400 // Updates from.next if the codepoint is not greater than maxcode.
401 // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
402 template<bool Aligned>
403 char32_t
404 read_utf16_code_point(range<const char16_t, Aligned>& from,
405 unsigned long maxcode, codecvt_mode mode)
407 const size_t avail = from.size();
408 if (avail == 0) [[unlikely]]
409 return incomplete_mb_character;
410 int inc = 1;
411 char32_t c = adjust_byte_order(from[0], mode);
412 if (is_high_surrogate(c))
414 if (avail < 2) [[unlikely]]
415 return incomplete_mb_character;
416 const char16_t c2 = adjust_byte_order(from[1], mode);
417 if (is_low_surrogate(c2)) [[likely]]
419 c = surrogate_pair_to_code_point(c, c2);
420 inc = 2;
422 else
423 return invalid_mb_sequence;
425 else if (is_low_surrogate(c)) [[unlikely]]
426 return invalid_mb_sequence;
427 if (c <= maxcode)
428 from += inc;
429 return c;
432 template<typename C, bool A>
433 bool
434 write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
436 static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
438 if (codepoint <= max_single_utf16_unit)
440 if (to.size() > 0)
442 to = adjust_byte_order(codepoint, mode);
443 return true;
446 else if (to.size() > 1)
448 // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
449 const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
450 char16_t lead = LEAD_OFFSET + (codepoint >> 10);
451 char16_t trail = 0xDC00 + (codepoint & 0x3FF);
452 to = adjust_byte_order(lead, mode);
453 to = adjust_byte_order(trail, mode);
454 return true;
456 return false;
459 // utf8 -> ucs4
460 template<typename C>
461 codecvt_base::result
462 ucs4_in(range<const C>& from, range<char32_t>& to,
463 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
465 read_utf8_bom(from, mode);
466 while (from.size() && to.size())
468 const char32_t codepoint = read_utf8_code_point(from, maxcode);
469 if (codepoint == incomplete_mb_character) [[unlikely]]
470 return codecvt_base::partial;
471 if (codepoint > maxcode) [[unlikely]]
472 return codecvt_base::error;
473 to = codepoint;
475 return from.size() ? codecvt_base::partial : codecvt_base::ok;
478 // ucs4 -> utf8
479 template<typename C>
480 codecvt_base::result
481 ucs4_out(range<const char32_t>& from, range<C>& to,
482 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
484 if (!write_utf8_bom(to, mode)) [[unlikely]]
485 return codecvt_base::partial;
486 while (from.size())
488 const char32_t c = from[0];
489 if (0xD800 <= c && c <= 0xDFFF) [[unlikely]]
490 return codecvt_base::error;
491 if (c > maxcode) [[unlikely]]
492 return codecvt_base::error;
493 if (!write_utf8_code_point(to, c)) [[unlikely]]
494 return codecvt_base::partial;
495 ++from;
497 return codecvt_base::ok;
500 // utf16 -> ucs4
501 codecvt_base::result
502 ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
503 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
505 read_utf16_bom(from, mode);
506 while (from.size() && to.size())
508 const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
509 if (codepoint == incomplete_mb_character) [[unlikely]]
510 return codecvt_base::partial;
511 if (codepoint > maxcode) [[unlikely]]
512 return codecvt_base::error;
513 to = codepoint;
515 return from.nbytes() ? codecvt_base::partial : codecvt_base::ok;
518 // ucs4 -> utf16
519 codecvt_base::result
520 ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
521 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
523 if (!write_utf16_bom(to, mode)) [[unlikely]]
524 return codecvt_base::partial;
525 while (from.size())
527 const char32_t c = from[0];
528 if (0xD800 <= c && c <= 0xDFFF) [[unlikely]]
529 return codecvt_base::error;
530 if (c > maxcode) [[unlikely]]
531 return codecvt_base::error;
532 if (!write_utf16_code_point(to, c, mode)) [[unlikely]]
533 return codecvt_base::partial;
534 ++from;
536 return codecvt_base::ok;
// Flag indicating whether to process UTF-16 (surrogates allowed)
// or UCS2 (surrogates disallowed).
enum class surrogates { allowed, disallowed };
542 // utf8 -> utf16 (or utf8 -> ucs2 if maxcode <= 0xFFFF)
543 template <typename C8, typename C16>
544 codecvt_base::result
545 utf16_in(range<const C8> &from, range<C16> &to,
546 unsigned long maxcode = max_code_point, codecvt_mode mode = {})
548 read_utf8_bom(from, mode);
549 while (from.size() && to.size())
551 auto orig = from;
552 const char32_t codepoint = read_utf8_code_point(from, maxcode);
553 if (codepoint == incomplete_mb_character) [[unlikely]]
554 return codecvt_base::partial;
555 if (codepoint > maxcode)
556 return codecvt_base::error;
557 if (!write_utf16_code_point(to, codepoint, mode)) [[unlikely]]
559 from = orig; // rewind to previous position
560 return codecvt_base::partial;
563 return from.size() ? codecvt_base::partial : codecvt_base::ok;
566 // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
567 template<typename C16, typename C8>
568 codecvt_base::result
569 utf16_out(range<const C16>& from, range<C8>& to,
570 unsigned long maxcode = max_code_point, codecvt_mode mode = {},
571 surrogates s = surrogates::allowed)
573 if (!write_utf8_bom(to, mode)) [[unlikely]]
574 return codecvt_base::partial;
575 while (from.size())
577 char32_t c = from[0];
578 int inc = 1;
579 if (is_high_surrogate(c))
581 if (s == surrogates::disallowed) [[unlikely]]
582 return codecvt_base::error; // No surrogates in UCS-2
584 if (from.size() < 2) [[unlikely]]
585 return codecvt_base::partial; // stop converting at this point
587 const char32_t c2 = from[1];
588 if (is_low_surrogate(c2)) [[likely]]
590 c = surrogate_pair_to_code_point(c, c2);
591 inc = 2;
593 else
594 return codecvt_base::error;
596 else if (is_low_surrogate(c)) [[unlikely]]
597 return codecvt_base::error;
598 if (c > maxcode) [[unlikely]]
599 return codecvt_base::error;
600 if (!write_utf8_code_point(to, c)) [[unlikely]]
601 return codecvt_base::partial;
602 from += inc;
604 return codecvt_base::ok;
607 // return pos such that [begin,pos) is valid UTF-16 string no longer than max
608 template<typename C>
609 const C*
610 utf16_span(const C* begin, const C* end, size_t max,
611 char32_t maxcode = max_code_point, codecvt_mode mode = {})
613 range<const C> from{ begin, end };
614 read_utf8_bom(from, mode);
615 size_t count = 0;
616 while (count+1 < max)
618 char32_t c = read_utf8_code_point(from, maxcode);
619 if (c > maxcode)
620 return from.next;
621 else if (c > max_single_utf16_unit)
622 ++count;
623 ++count;
625 if (count+1 == max) // take one more character if it fits in a single unit
626 read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
627 return from.next;
630 // utf8 -> ucs2
631 template<typename C>
632 codecvt_base::result
633 ucs2_in(range<const C>& from, range<char16_t>& to,
634 char32_t maxcode = max_code_point, codecvt_mode mode = {})
636 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
637 maxcode = std::min(max_single_utf16_unit, maxcode);
638 return utf16_in(from, to, maxcode, mode);
641 // ucs2 -> utf8
642 template<typename C>
643 codecvt_base::result
644 ucs2_out(range<const char16_t>& from, range<C>& to,
645 char32_t maxcode = max_code_point, codecvt_mode mode = {})
647 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
648 maxcode = std::min(max_single_utf16_unit, maxcode);
649 return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
652 // ucs2 -> utf16
653 codecvt_base::result
654 ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
655 char32_t maxcode = max_code_point, codecvt_mode mode = {})
657 if (!write_utf16_bom(to, mode))
658 return codecvt_base::partial;
659 while (from.size() && to.size())
661 char16_t c = from[0];
662 if (0xD800 <= c && c <= 0xDFFF)
663 return codecvt_base::error;
664 if (c > maxcode)
665 return codecvt_base::error;
666 to = adjust_byte_order(c, mode);
667 ++from;
669 return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
672 // utf16 -> ucs2
673 codecvt_base::result
674 ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
675 char32_t maxcode = max_code_point, codecvt_mode mode = {})
677 read_utf16_bom(from, mode);
678 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
679 maxcode = std::min(max_single_utf16_unit, maxcode);
680 while (from.size() && to.size())
682 const char32_t c = read_utf16_code_point(from, maxcode, mode);
683 if (c == incomplete_mb_character)
684 return codecvt_base::error; // UCS-2 only supports single units.
685 if (c > maxcode)
686 return codecvt_base::error;
687 to = c;
689 return from.nbytes() == 0 ? codecvt_base::ok : codecvt_base::partial;
692 const char16_t*
693 ucs2_span(range<const char16_t, false>& from, size_t max,
694 char32_t maxcode, codecvt_mode mode)
696 read_utf16_bom(from, mode);
697 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
698 maxcode = std::min(max_single_utf16_unit, maxcode);
699 char32_t c = 0;
700 while (max-- && c <= maxcode)
701 c = read_utf16_code_point(from, maxcode, mode);
702 return reinterpret_cast<const char16_t*>(from.next);
705 template<typename C>
706 const C*
707 ucs2_span(const C* begin, const C* end, size_t max,
708 char32_t maxcode, codecvt_mode mode)
710 range<const C> from{ begin, end };
711 read_utf8_bom(from, mode);
712 // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
713 maxcode = std::min(max_single_utf16_unit, maxcode);
714 char32_t c = 0;
715 while (max-- && c <= maxcode)
716 c = read_utf8_code_point(from, maxcode);
717 return from.next;
720 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
721 template<typename C>
722 const C*
723 ucs4_span(const C* begin, const C* end, size_t max,
724 char32_t maxcode = max_code_point, codecvt_mode mode = {})
726 range<const C> from{ begin, end };
727 read_utf8_bom(from, mode);
728 char32_t c = 0;
729 while (max-- && c <= maxcode)
730 c = read_utf8_code_point(from, maxcode);
731 return from.next;
734 // return pos such that [begin,pos) is valid UCS-4 string no longer than max
735 const char16_t*
736 ucs4_span(range<const char16_t, false>& from, size_t max,
737 char32_t maxcode = max_code_point, codecvt_mode mode = {})
739 read_utf16_bom(from, mode);
740 char32_t c = 0;
741 while (max-- && c <= maxcode)
742 c = read_utf16_code_point(from, maxcode, mode);
743 return reinterpret_cast<const char16_t*>(from.next);
// Define members of codecvt<char16_t, char, mbstate_t> specialization.
// Converts from UTF-8 to UTF-16.

locale::id codecvt<char16_t, char, mbstate_t>::id;

codecvt<char16_t, char, mbstate_t>::~codecvt() { }
754 codecvt_base::result
755 codecvt<char16_t, char, mbstate_t>::
756 do_out(state_type&,
757 const intern_type* __from,
758 const intern_type* __from_end, const intern_type*& __from_next,
759 extern_type* __to, extern_type* __to_end,
760 extern_type*& __to_next) const
762 range<const char16_t> from{ __from, __from_end };
763 range<char> to{ __to, __to_end };
764 auto res = utf16_out(from, to);
765 __from_next = from.next;
766 __to_next = to.next;
767 return res;
// Nothing is ever written: sets __to_next to __to and returns noconv.
codecvt_base::result
codecvt<char16_t, char, mbstate_t>::
do_unshift(state_type&, extern_type* __to, extern_type*,
	   extern_type*& __to_next) const
{
  __to_next = __to;
  return noconv; // we don't use mbstate_t for the unicode facets
}
779 codecvt_base::result
780 codecvt<char16_t, char, mbstate_t>::
781 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
782 const extern_type*& __from_next,
783 intern_type* __to, intern_type* __to_end,
784 intern_type*& __to_next) const
786 range<const char> from{ __from, __from_end };
787 range<char16_t> to{ __to, __to_end };
788 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
789 codecvt_mode mode = {};
790 #else
791 codecvt_mode mode = little_endian;
792 #endif
793 auto res = utf16_in(from, to, max_code_point, mode);
794 __from_next = from.next;
795 __to_next = to.next;
796 return res;
int
codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
{ return 0; } // UTF-8 is not a fixed-width encoding
// Always false: this facet never reports that no conversion is needed.
bool
codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
{ return false; }
808 codecvt<char16_t, char, mbstate_t>::
809 do_length(state_type&, const extern_type* __from,
810 const extern_type* __end, size_t __max) const
812 __end = utf16_span(__from, __end, __max);
813 return __end - __from;
int
codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
{
  // A single character (one or two UTF-16 code units) requires
  // up to four UTF-8 code units.
  return 4;
}
// Define members of codecvt<char32_t, char, mbstate_t> specialization.
// Converts from UTF-8 to UTF-32 (aka UCS-4).

locale::id codecvt<char32_t, char, mbstate_t>::id;

codecvt<char32_t, char, mbstate_t>::~codecvt() { }
831 codecvt_base::result
832 codecvt<char32_t, char, mbstate_t>::
833 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
834 const intern_type*& __from_next,
835 extern_type* __to, extern_type* __to_end,
836 extern_type*& __to_next) const
838 range<const char32_t> from{ __from, __from_end };
839 range<char> to{ __to, __to_end };
840 auto res = ucs4_out(from, to);
841 __from_next = from.next;
842 __to_next = to.next;
843 return res;
// Nothing is ever written: sets __to_next to __to and returns noconv.
codecvt_base::result
codecvt<char32_t, char, mbstate_t>::
do_unshift(state_type&, extern_type* __to, extern_type*,
	   extern_type*& __to_next) const
{
  __to_next = __to;
  return noconv;
}
855 codecvt_base::result
856 codecvt<char32_t, char, mbstate_t>::
857 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
858 const extern_type*& __from_next,
859 intern_type* __to, intern_type* __to_end,
860 intern_type*& __to_next) const
862 range<const char> from{ __from, __from_end };
863 range<char32_t> to{ __to, __to_end };
864 auto res = ucs4_in(from, to);
865 __from_next = from.next;
866 __to_next = to.next;
867 return res;
int
codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
{ return 0; } // UTF-8 is not a fixed-width encoding
// Always false: this facet never reports that no conversion is needed.
bool
codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
{ return false; }
879 codecvt<char32_t, char, mbstate_t>::
880 do_length(state_type&, const extern_type* __from,
881 const extern_type* __end, size_t __max) const
883 __end = ucs4_span(__from, __end, __max);
884 return __end - __from;
int
codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
{
  // A single character (one UTF-32 code unit) requires
  // up to 4 UTF-8 code units.
  return 4;
}
895 #if defined(_GLIBCXX_USE_CHAR8_T)
896 // Define members of codecvt<char16_t, char8_t, mbstate_t> specialization.
897 // Converts from UTF-8 to UTF-16.
899 locale::id codecvt<char16_t, char8_t, mbstate_t>::id;
901 codecvt<char16_t, char8_t, mbstate_t>::~codecvt() { }
903 codecvt_base::result
904 codecvt<char16_t, char8_t, mbstate_t>::
905 do_out(state_type&,
906 const intern_type* __from,
907 const intern_type* __from_end, const intern_type*& __from_next,
908 extern_type* __to, extern_type* __to_end,
909 extern_type*& __to_next) const
911 range<const char16_t> from{ __from, __from_end };
912 range<char8_t> to{ __to, __to_end };
913 auto res = utf16_out(from, to);
914 __from_next = from.next;
915 __to_next = to.next;
916 return res;
919 codecvt_base::result
920 codecvt<char16_t, char8_t, mbstate_t>::
921 do_unshift(state_type&, extern_type* __to, extern_type*,
922 extern_type*& __to_next) const
924 __to_next = __to;
925 return noconv; // we don't use mbstate_t for the unicode facets
928 codecvt_base::result
929 codecvt<char16_t, char8_t, mbstate_t>::
930 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
931 const extern_type*& __from_next,
932 intern_type* __to, intern_type* __to_end,
933 intern_type*& __to_next) const
935 range<const char8_t> from{ __from, __from_end };
936 range<char16_t> to{ __to, __to_end };
937 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
938 codecvt_mode mode = {};
939 #else
940 codecvt_mode mode = little_endian;
941 #endif
942 auto res = utf16_in(from, to, max_code_point, mode);
943 __from_next = from.next;
944 __to_next = to.next;
945 return res;
949 codecvt<char16_t, char8_t, mbstate_t>::do_encoding() const throw()
950 { return 0; } // UTF-8 is not a fixed-width encoding
952 bool
953 codecvt<char16_t, char8_t, mbstate_t>::do_always_noconv() const throw()
954 { return false; }
957 codecvt<char16_t, char8_t, mbstate_t>::
958 do_length(state_type&, const extern_type* __from,
959 const extern_type* __end, size_t __max) const
961 __end = utf16_span(__from, __end, __max);
962 return __end - __from;
966 codecvt<char16_t, char8_t, mbstate_t>::do_max_length() const throw()
968 // A single character (one or two UTF-16 code units) requires
969 // up to four UTF-8 code units.
970 return 4;
973 // Define members of codecvt<char32_t, char8_t, mbstate_t> specialization.
974 // Converts from UTF-8 to UTF-32 (aka UCS-4).
976 locale::id codecvt<char32_t, char8_t, mbstate_t>::id;
978 codecvt<char32_t, char8_t, mbstate_t>::~codecvt() { }
980 codecvt_base::result
981 codecvt<char32_t, char8_t, mbstate_t>::
982 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
983 const intern_type*& __from_next,
984 extern_type* __to, extern_type* __to_end,
985 extern_type*& __to_next) const
987 range<const char32_t> from{ __from, __from_end };
988 range<char8_t> to{ __to, __to_end };
989 auto res = ucs4_out(from, to);
990 __from_next = from.next;
991 __to_next = to.next;
992 return res;
995 codecvt_base::result
996 codecvt<char32_t, char8_t, mbstate_t>::
997 do_unshift(state_type&, extern_type* __to, extern_type*,
998 extern_type*& __to_next) const
1000 __to_next = __to;
1001 return noconv;
1004 codecvt_base::result
1005 codecvt<char32_t, char8_t, mbstate_t>::
1006 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1007 const extern_type*& __from_next,
1008 intern_type* __to, intern_type* __to_end,
1009 intern_type*& __to_next) const
1011 range<const char8_t> from{ __from, __from_end };
1012 range<char32_t> to{ __to, __to_end };
1013 auto res = ucs4_in(from, to);
1014 __from_next = from.next;
1015 __to_next = to.next;
1016 return res;
1020 codecvt<char32_t, char8_t, mbstate_t>::do_encoding() const throw()
1021 { return 0; } // UTF-8 is not a fixed-width encoding
1023 bool
1024 codecvt<char32_t, char8_t, mbstate_t>::do_always_noconv() const throw()
1025 { return false; }
1028 codecvt<char32_t, char8_t, mbstate_t>::
1029 do_length(state_type&, const extern_type* __from,
1030 const extern_type* __end, size_t __max) const
1032 __end = ucs4_span(__from, __end, __max);
1033 return __end - __from;
1037 codecvt<char32_t, char8_t, mbstate_t>::do_max_length() const throw()
1039 // A single character (one UTF-32 code unit) requires
1040 // up to 4 UTF-8 code units.
1041 return 4;
1043 #endif // _GLIBCXX_USE_CHAR8_T
// Define members of codecvt_utf8<char16_t> base class implementation.
// Converts from UTF-8 to UCS-2.

__codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
1050 codecvt_base::result
1051 __codecvt_utf8_base<char16_t>::
1052 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1053 const intern_type*& __from_next,
1054 extern_type* __to, extern_type* __to_end,
1055 extern_type*& __to_next) const
1057 range<const char16_t> from{ __from, __from_end };
1058 range<char> to{ __to, __to_end };
1059 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1060 __from_next = from.next;
1061 __to_next = to.next;
1062 return res;
// Nothing is ever written: sets __to_next to __to and returns noconv.
codecvt_base::result
__codecvt_utf8_base<char16_t>::
do_unshift(state_type&, extern_type* __to, extern_type*,
	   extern_type*& __to_next) const
{
  __to_next = __to;
  return noconv;
}
1074 codecvt_base::result
1075 __codecvt_utf8_base<char16_t>::
1076 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1077 const extern_type*& __from_next,
1078 intern_type* __to, intern_type* __to_end,
1079 intern_type*& __to_next) const
1081 range<const char> from{ __from, __from_end };
1082 range<char16_t> to{ __to, __to_end };
1083 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1084 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1085 mode = codecvt_mode(mode | little_endian);
1086 #endif
1087 auto res = ucs2_in(from, to, _M_maxcode, mode);
1088 __from_next = from.next;
1089 __to_next = to.next;
1090 return res;
int
__codecvt_utf8_base<char16_t>::do_encoding() const throw()
{ return 0; } // UTF-8 is not a fixed-width encoding
// Always false: this facet never reports that no conversion is needed.
bool
__codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
{ return false; }
// Returns the distance, in chars, from __from to the pointer returned by
// ucs2_span(__from, __end, __max, _M_maxcode, _M_mode).  The local copy of
// __end is reused to hold that pointer.  The state argument is unused.
1102 __codecvt_utf8_base<char16_t>::
1103 do_length(state_type&, const extern_type* __from,
1104 const extern_type* __end, size_t __max) const
1106 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1107 return __end - __from;
// Returns 3, plus sizeof(utf8_bom) when _M_mode has consume_header set.
1111 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
1113 // A single UCS-2 character requires up to three UTF-8 code units.
1114 // (UCS-2 cannot represent characters that use four UTF-8 code units).
1115 int max = 3;
1116 if (_M_mode & consume_header)
1117 max += sizeof(utf8_bom);
1118 return max;
1121 // Define members of codecvt_utf8<char32_t> base class implementation.
1122 // Converts from UTF-8 to UTF-32 (aka UCS-4).
// Out-of-line destructor definition; the body is empty.
1124 __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
// Convert the internal char32_t sequence [__from, __from_end) into chars in
// [__to, __to_end) via ucs4_out, passing _M_maxcode and _M_mode.
// __from_next / __to_next receive the ranges' `next` members after the call
// and the result of ucs4_out is returned unchanged.  The state is unused.
1126 codecvt_base::result
1127 __codecvt_utf8_base<char32_t>::
1128 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1129 const intern_type*& __from_next,
1130 extern_type* __to, extern_type* __to_end,
1131 extern_type*& __to_next) const
1133 range<const char32_t> from{ __from, __from_end };
1134 range<char> to{ __to, __to_end };
1135 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1136 __from_next = from.next;
1137 __to_next = to.next;
1138 return res;
// Writes nothing: sets __to_next to __to and returns noconv.
1141 codecvt_base::result
1142 __codecvt_utf8_base<char32_t>::
1143 do_unshift(state_type&, extern_type* __to, extern_type*,
1144 extern_type*& __to_next) const
1146 __to_next = __to;
1147 return noconv;
// Convert the external chars [__from, __from_end) into char32_t values in
// [__to, __to_end) via ucs4_in.  Unlike the char16_t counterpart, _M_mode is
// forwarded as-is with no byte-order adjustment.
// __from_next / __to_next receive the ranges' `next` members after the call
// and the result of ucs4_in is returned unchanged.
1150 codecvt_base::result
1151 __codecvt_utf8_base<char32_t>::
1152 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1153 const extern_type*& __from_next,
1154 intern_type* __to, intern_type* __to_end,
1155 intern_type*& __to_next) const
1157 range<const char> from{ __from, __from_end };
1158 range<char32_t> to{ __to, __to_end };
1159 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1160 __from_next = from.next;
1161 __to_next = to.next;
1162 return res;
// Always returns 0 (see the trailing comment for the reason).
1166 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
1167 { return 0; } // UTF-8 is not a fixed-width encoding
// Always returns false.
1169 bool
1170 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1171 { return false; }
// Returns the distance, in chars, from __from to the pointer returned by
// ucs4_span(__from, __end, __max, _M_maxcode, _M_mode).  The state argument
// is unused.
1174 __codecvt_utf8_base<char32_t>::
1175 do_length(state_type&, const extern_type* __from,
1176 const extern_type* __end, size_t __max) const
1178 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1179 return __end - __from;
// Returns 4, plus sizeof(utf8_bom) when _M_mode has consume_header set.
1183 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
1185 // A single UCS-4 character requires up to four UTF-8 code units.
1186 int max = 4;
1187 if (_M_mode & consume_header)
1188 max += sizeof(utf8_bom);
1189 return max;
1192 #ifdef _GLIBCXX_USE_WCHAR_T
// Compile-time check that wchar_t has the same size as the character type
// (char16_t or char32_t) that the wchar_t members below reinterpret_cast
// their buffers to.  No check is made when wchar_t is neither 2 nor 4 bytes.
1194 #if __SIZEOF_WCHAR_T__ == 2
1195 static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
1196 #elif __SIZEOF_WCHAR_T__ == 4
1197 static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
1198 #endif
1200 // Define members of codecvt_utf8<wchar_t> base class implementation.
1201 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
// Out-of-line destructor definition; the body is empty.
1203 __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
// Convert the internal wchar_t sequence [__from, __from_end) into chars in
// [__to, __to_end).  The wchar_t pointers are reinterpret_cast to char16_t
// (2-byte wchar_t, using ucs2_out) or char32_t (4-byte wchar_t, using
// ucs4_out); any other wchar_t size takes the branch that returns
// codecvt_base::error.
// __from_next (cast back to wchar_t) and __to_next receive the ranges'
// `next` members after the call, and the converter's result is returned.
1205 codecvt_base::result
1206 __codecvt_utf8_base<wchar_t>::
1207 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1208 const intern_type*& __from_next,
1209 extern_type* __to, extern_type* __to_end,
1210 extern_type*& __to_next) const
1212 range<char> to{ __to, __to_end };
1213 #if __SIZEOF_WCHAR_T__ == 2
1214 range<const char16_t> from{
1215 reinterpret_cast<const char16_t*>(__from),
1216 reinterpret_cast<const char16_t*>(__from_end)
1218 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1219 #elif __SIZEOF_WCHAR_T__ == 4
1220 range<const char32_t> from{
1221 reinterpret_cast<const char32_t*>(__from),
1222 reinterpret_cast<const char32_t*>(__from_end)
1224 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1225 #else
1226 return codecvt_base::error;
1227 #endif
1228 __from_next = reinterpret_cast<const wchar_t*>(from.next);
1229 __to_next = to.next;
1230 return res;
// Writes nothing: sets __to_next to __to and returns noconv.
1233 codecvt_base::result
1234 __codecvt_utf8_base<wchar_t>::
1235 do_unshift(state_type&, extern_type* __to, extern_type*,
1236 extern_type*& __to_next) const
1238 __to_next = __to;
1239 return noconv;
// Convert the external chars [__from, __from_end) into wchar_t values in
// [__to, __to_end).  The wchar_t output buffer is reinterpret_cast to
// char16_t (2-byte wchar_t, using ucs2_in) or char32_t (4-byte wchar_t,
// using ucs4_in); any other wchar_t size takes the branch that returns
// codecvt_base::error.
// __from_next and __to_next (cast back to wchar_t) receive the ranges'
// `next` members after the call, and the converter's result is returned.
// The state_type argument is unnamed and unused.
1242 codecvt_base::result
1243 __codecvt_utf8_base<wchar_t>::
1244 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1245 const extern_type*& __from_next,
1246 intern_type* __to, intern_type* __to_end,
1247 intern_type*& __to_next) const
1249 range<const char> from{ __from, __from_end };
1250 #if __SIZEOF_WCHAR_T__ == 2
1251 range<char16_t> to{
1252 reinterpret_cast<char16_t*>(__to),
1253 reinterpret_cast<char16_t*>(__to_end)
// The byte-order bit is derived from the host, not taken from _M_mode.
1255 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1256 codecvt_mode mode = {};
1257 #else
1258 codecvt_mode mode = little_endian;
1259 #endif
// Carry over the user's header flags, as __codecvt_utf8_base<char16_t>::do_in
// does.  Previously they were dropped here, so consume_header had no effect
// on in() for 2-byte wchar_t even though do_length() and the 4-byte branch
// both pass _M_mode through.
mode = codecvt_mode(mode | (_M_mode & (consume_header|generate_header)));
1260 auto res = ucs2_in(from, to, _M_maxcode, mode);
1261 #elif __SIZEOF_WCHAR_T__ == 4
1262 range<char32_t> to{
1263 reinterpret_cast<char32_t*>(__to),
1264 reinterpret_cast<char32_t*>(__to_end)
1266 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1267 #else
1268 return codecvt_base::error;
1269 #endif
1270 __from_next = from.next;
1271 __to_next = reinterpret_cast<wchar_t*>(to.next);
1272 return res;
// Always returns 0 (see the trailing comment for the reason).
1276 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
1277 { return 0; } // UTF-8 is not a fixed-width encoding
// Always returns false.
1279 bool
1280 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1281 { return false; }
// Returns the distance, in chars, from __from to the pointer produced by
// ucs2_span (2-byte wchar_t) or ucs4_span (4-byte wchar_t), each called with
// __max, _M_maxcode and _M_mode.  For any other wchar_t size __end is reset
// to __from, so the result is 0.
1284 __codecvt_utf8_base<wchar_t>::
1285 do_length(state_type&, const extern_type* __from,
1286 const extern_type* __end, size_t __max) const
1288 #if __SIZEOF_WCHAR_T__ == 2
1289 __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1290 #elif __SIZEOF_WCHAR_T__ == 4
1291 __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1292 #else
1293 __end = __from;
1294 #endif
1295 return __end - __from;
// Returns 3 when wchar_t is 2 bytes and 4 otherwise, plus sizeof(utf8_bom)
// when _M_mode has consume_header set.
1299 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
1301 #if __SIZEOF_WCHAR_T__ == 2
1302 int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1303 #else
1304 int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1305 #endif
1306 if (_M_mode & consume_header)
1307 max += sizeof(utf8_bom);
1308 return max;
1310 #endif
1312 // Define members of codecvt_utf16<char16_t> base class implementation.
1313 // Converts from UTF-16 to UCS-2.
// Out-of-line destructor definition; the body is empty.
1315 __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
// Convert the internal char16_t sequence [__from, __from_end) into the
// external char buffer [__to, __to_end) via ucs2_out, passing _M_maxcode and
// _M_mode.  The char buffer is wrapped in range<char16_t, false>, so its
// `next` pointer is cast back to char* for __to_next.
// NOTE(review): the `false` argument presumably selects the range variant
// for char storage that may be misaligned for char16_t -- confirm at the
// range definition.
1317 codecvt_base::result
1318 __codecvt_utf16_base<char16_t>::
1319 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1320 const intern_type*& __from_next,
1321 extern_type* __to, extern_type* __to_end,
1322 extern_type*& __to_next) const
1324 range<const char16_t> from{ __from, __from_end };
1325 range<char16_t, false> to{ __to, __to_end };
1326 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1327 __from_next = from.next;
1328 __to_next = reinterpret_cast<char*>(to.next);
1329 return res;
// Writes nothing: sets __to_next to __to and returns noconv.
1332 codecvt_base::result
1333 __codecvt_utf16_base<char16_t>::
1334 do_unshift(state_type&, extern_type* __to, extern_type*,
1335 extern_type*& __to_next) const
1337 __to_next = __to;
1338 return noconv;
// Convert the external char buffer [__from, __from_end), viewed through
// range<const char16_t, false>, into char16_t values in [__to, __to_end)
// via ucs2_in with _M_maxcode and _M_mode (forwarded unmodified).
// The input range's `next` is cast back to const char* for __from_next;
// the result of ucs2_in is returned unchanged.
1341 codecvt_base::result
1342 __codecvt_utf16_base<char16_t>::
1343 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1344 const extern_type*& __from_next,
1345 intern_type* __to, intern_type* __to_end,
1346 intern_type*& __to_next) const
1348 range<const char16_t, false> from{ __from, __from_end };
1349 range<char16_t> to{ __to, __to_end };
1350 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1351 __from_next = reinterpret_cast<const char*>(from.next);
1352 __to_next = to.next;
1353 return res;
// Always returns 0 (see the trailing comment for the reason).
1357 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
1358 { return 0; } // UTF-16 is not a fixed-width encoding
// Always returns false.
1360 bool
1361 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1362 { return false; }
// Views [__from, __end) through range<const char16_t, false>, asks ucs2_span
// (with __max, _M_maxcode, _M_mode) for a stopping position, and returns its
// distance from __from measured in chars.
1365 __codecvt_utf16_base<char16_t>::
1366 do_length(state_type&, const extern_type* __from,
1367 const extern_type* __end, size_t __max) const
1369 range<const char16_t, false> from{ __from, __end };
1370 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1371 return reinterpret_cast<const char*>(next) - __from;
// Returns 2, plus sizeof(utf16_bom) when _M_mode has consume_header set.
1375 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
1377 // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1378 // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1379 int max = 2;
1380 if (_M_mode & consume_header)
1381 max += sizeof(utf16_bom);
1382 return max;
1385 // Define members of codecvt_utf16<char32_t> base class implementation.
1386 // Converts from UTF-16 to UTF-32 (aka UCS-4).
// Out-of-line destructor definition; the body is empty.
1388 __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
// Convert the internal char32_t sequence [__from, __from_end) into the
// external char buffer [__to, __to_end), viewed through
// range<char16_t, false>, via ucs4_out with _M_maxcode and _M_mode.
// The output range's `next` is cast back to char* for __to_next; the result
// of ucs4_out is returned unchanged.
1390 codecvt_base::result
1391 __codecvt_utf16_base<char32_t>::
1392 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1393 const intern_type*& __from_next,
1394 extern_type* __to, extern_type* __to_end,
1395 extern_type*& __to_next) const
1397 range<const char32_t> from{ __from, __from_end };
1398 range<char16_t, false> to{ __to, __to_end };
1399 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1400 __from_next = from.next;
1401 __to_next = reinterpret_cast<char*>(to.next);
1402 return res;
// Writes nothing: sets __to_next to __to and returns noconv.
1405 codecvt_base::result
1406 __codecvt_utf16_base<char32_t>::
1407 do_unshift(state_type&, extern_type* __to, extern_type*,
1408 extern_type*& __to_next) const
1410 __to_next = __to;
1411 return noconv;
// Convert the external char buffer [__from, __from_end), viewed through
// range<const char16_t, false>, into char32_t values in [__to, __to_end)
// via ucs4_in with _M_maxcode and _M_mode (forwarded unmodified).
// The input range's `next` is cast back to const char* for __from_next;
// the result of ucs4_in is returned unchanged.
1414 codecvt_base::result
1415 __codecvt_utf16_base<char32_t>::
1416 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1417 const extern_type*& __from_next,
1418 intern_type* __to, intern_type* __to_end,
1419 intern_type*& __to_next) const
1421 range<const char16_t, false> from{ __from, __from_end };
1422 range<char32_t> to{ __to, __to_end };
1423 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1424 __from_next = reinterpret_cast<const char*>(from.next);
1425 __to_next = to.next;
1426 return res;
// Always returns 0 (see the trailing comment for the reason).
1430 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
1431 { return 0; } // UTF-16 is not a fixed-width encoding
// Always returns false.
1433 bool
1434 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1435 { return false; }
// Views [__from, __end) through range<const char16_t, false>, asks ucs4_span
// (with __max, _M_maxcode, _M_mode) for a stopping position, and returns its
// distance from __from measured in chars.
1438 __codecvt_utf16_base<char32_t>::
1439 do_length(state_type&, const extern_type* __from,
1440 const extern_type* __end, size_t __max) const
1442 range<const char16_t, false> from{ __from, __end };
1443 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1444 return reinterpret_cast<const char*>(next) - __from;
// Returns 4, plus sizeof(utf16_bom) when _M_mode has consume_header set.
1448 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
1450 // A single UCS-4 character requires one or two UTF-16 code units
1451 // (so up to four chars).
1452 int max = 4;
1453 if (_M_mode & consume_header)
1454 max += sizeof(utf16_bom);
1455 return max;
1458 #ifdef _GLIBCXX_USE_WCHAR_T
1459 // Define members of codecvt_utf16<wchar_t> base class implementation.
1460 // Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
// Out-of-line destructor definition; the body is empty.
1462 __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
// Convert the internal wchar_t sequence [__from, __from_end) into the
// external char buffer [__to, __to_end), viewed through
// range<char16_t, false>.  The wchar_t pointers are reinterpret_cast to
// char16_t (2-byte wchar_t, using ucs2_out) or char32_t (4-byte wchar_t,
// using ucs4_out); any other wchar_t size takes the branch that returns
// codecvt_base::error.
// Both `next` pointers are cast back to the caller's pointer types before
// being stored in __from_next / __to_next.
1464 codecvt_base::result
1465 __codecvt_utf16_base<wchar_t>::
1466 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1467 const intern_type*& __from_next,
1468 extern_type* __to, extern_type* __to_end,
1469 extern_type*& __to_next) const
1471 range<char16_t, false> to{ __to, __to_end };
1472 #if __SIZEOF_WCHAR_T__ == 2
1473 range<const char16_t> from{
1474 reinterpret_cast<const char16_t*>(__from),
1475 reinterpret_cast<const char16_t*>(__from_end),
1477 auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1478 #elif __SIZEOF_WCHAR_T__ == 4
1479 range<const char32_t> from{
1480 reinterpret_cast<const char32_t*>(__from),
1481 reinterpret_cast<const char32_t*>(__from_end),
1483 auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1484 #else
1485 return codecvt_base::error;
1486 #endif
1487 __from_next = reinterpret_cast<const wchar_t*>(from.next);
1488 __to_next = reinterpret_cast<char*>(to.next);
1489 return res;
// Writes nothing: sets __to_next to __to and returns noconv.
1492 codecvt_base::result
1493 __codecvt_utf16_base<wchar_t>::
1494 do_unshift(state_type&, extern_type* __to, extern_type*,
1495 extern_type*& __to_next) const
1497 __to_next = __to;
1498 return noconv;
// Convert the external char buffer [__from, __from_end), viewed through
// range<const char16_t, false>, into wchar_t values in [__to, __to_end).
// The wchar_t output pointers are reinterpret_cast to char16_t (2-byte
// wchar_t, using ucs2_in) or char32_t (4-byte wchar_t, using ucs4_in), with
// _M_mode forwarded unmodified; any other wchar_t size takes the branch that
// returns codecvt_base::error.
// Both `next` pointers are cast back to the caller's pointer types before
// being stored in __from_next / __to_next.
1501 codecvt_base::result
1502 __codecvt_utf16_base<wchar_t>::
1503 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1504 const extern_type*& __from_next,
1505 intern_type* __to, intern_type* __to_end,
1506 intern_type*& __to_next) const
1508 range<const char16_t, false> from{ __from, __from_end };
1509 #if __SIZEOF_WCHAR_T__ == 2
1510 range<char16_t> to{
1511 reinterpret_cast<char16_t*>(__to),
1512 reinterpret_cast<char16_t*>(__to_end),
1514 auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1515 #elif __SIZEOF_WCHAR_T__ == 4
1516 range<char32_t> to{
1517 reinterpret_cast<char32_t*>(__to),
1518 reinterpret_cast<char32_t*>(__to_end),
1520 auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1521 #else
1522 return codecvt_base::error;
1523 #endif
1524 __from_next = reinterpret_cast<const char*>(from.next);
1525 __to_next = reinterpret_cast<wchar_t*>(to.next);
1526 return res;
// Always returns 0 (see the trailing comment for the reason).
1530 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1531 { return 0; } // UTF-16 is not a fixed-width encoding
// Always returns false.
1533 bool
1534 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1535 { return false; }
// Views [__from, __end) through range<const char16_t, false>, asks ucs2_span
// (2-byte wchar_t) or ucs4_span (4-byte wchar_t) for a stopping position,
// and returns its distance from __from measured in chars.
// NOTE(review): unlike do_out/do_in there is no #else branch, so `next` is
// not declared when wchar_t is neither 2 nor 4 bytes.
1538 __codecvt_utf16_base<wchar_t>::
1539 do_length(state_type&, const extern_type* __from,
1540 const extern_type* __end, size_t __max) const
1542 range<const char16_t, false> from{ __from, __end };
1543 #if __SIZEOF_WCHAR_T__ == 2
1544 const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1545 #elif __SIZEOF_WCHAR_T__ == 4
1546 const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1547 #endif
1548 return reinterpret_cast<const char*>(next) - __from;
// Returns 2 when wchar_t is 2 bytes and 4 otherwise, plus sizeof(utf16_bom)
// when _M_mode has consume_header set.
1552 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1554 #if __SIZEOF_WCHAR_T__ == 2
1555 int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1556 #else
1557 int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1558 #endif
1559 if (_M_mode & consume_header)
1560 max += sizeof(utf16_bom);
1561 return max;
1563 #endif
1565 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1566 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor definition; the body is empty.
1568 __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
// Convert the internal char16_t sequence [__from, __from_end) into chars in
// [__to, __to_end) via utf16_out, passing _M_maxcode and _M_mode.
// __from_next / __to_next receive the ranges' `next` members after the call
// and the result of utf16_out is returned unchanged.  The state is unused.
1570 codecvt_base::result
1571 __codecvt_utf8_utf16_base<char16_t>::
1572 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1573 const intern_type*& __from_next,
1574 extern_type* __to, extern_type* __to_end,
1575 extern_type*& __to_next) const
1577 range<const char16_t> from{ __from, __from_end };
1578 range<char> to{ __to, __to_end };
1579 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1580 __from_next = from.next;
1581 __to_next = to.next;
1582 return res;
// Writes nothing: sets __to_next to __to and returns noconv.
1585 codecvt_base::result
1586 __codecvt_utf8_utf16_base<char16_t>::
1587 do_unshift(state_type&, extern_type* __to, extern_type*,
1588 extern_type*& __to_next) const
1590 __to_next = __to;
1591 return noconv;
// Convert the external chars [__from, __from_end) into char16_t code units
// in [__to, __to_end) via utf16_in, passing _M_maxcode and a locally built
// mode.  __from_next / __to_next receive the ranges' `next` members after
// the call; the result of utf16_in is returned unchanged.
1594 codecvt_base::result
1595 __codecvt_utf8_utf16_base<char16_t>::
1596 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1597 const extern_type*& __from_next,
1598 intern_type* __to, intern_type* __to_end,
1599 intern_type*& __to_next) const
1601 range<const char> from{ __from, __from_end };
1602 range<char16_t> to{ __to, __to_end };
// Keep only the header flags of _M_mode; the little_endian bit is taken from
// the host byte order instead of from the user's mode.
1603 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1604 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1605 mode = codecvt_mode(mode | little_endian);
1606 #endif
1607 auto res = utf16_in(from, to, _M_maxcode, mode);
1608 __from_next = from.next;
1609 __to_next = to.next;
1610 return res;
// Always returns 0 (see the trailing comment for the reason).
1614 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
1615 { return 0; } // UTF-8 is not a fixed-width encoding
// Always returns false.
1617 bool
1618 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1619 { return false; }
// Returns the distance, in chars, from __from to the pointer returned by
// utf16_span(__from, __end, __max, _M_maxcode, _M_mode).  The state argument
// is unused.
1622 __codecvt_utf8_utf16_base<char16_t>::
1623 do_length(state_type&, const extern_type* __from,
1624 const extern_type* __end, size_t __max) const
1626 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1627 return __end - __from;
// Returns 4, plus sizeof(utf8_bom) when _M_mode has consume_header set.
1631 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1633 // A single character can be 1 or 2 UTF-16 code units,
1634 // requiring up to 4 UTF-8 code units.
1635 int max = 4;
1636 if (_M_mode & consume_header)
1637 max += sizeof(utf8_bom);
1638 return max;
1641 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1642 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor definition; the body is empty.
1644 __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
// Convert the internal sequence [__from, __from_end), stored in char32_t
// elements, into chars in [__to, __to_end) via utf16_out, passing _M_maxcode
// and _M_mode.  __from_next / __to_next receive the ranges' `next` members
// after the call and the result of utf16_out is returned unchanged.
1646 codecvt_base::result
1647 __codecvt_utf8_utf16_base<char32_t>::
1648 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1649 const intern_type*& __from_next,
1650 extern_type* __to, extern_type* __to_end,
1651 extern_type*& __to_next) const
1653 range<const char32_t> from{ __from, __from_end };
1654 range<char> to{ __to, __to_end };
1655 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1656 __from_next = from.next;
1657 __to_next = to.next;
1658 return res;
// Writes nothing: sets __to_next to __to and returns noconv.
1661 codecvt_base::result
1662 __codecvt_utf8_utf16_base<char32_t>::
1663 do_unshift(state_type&, extern_type* __to, extern_type*,
1664 extern_type*& __to_next) const
1666 __to_next = __to;
1667 return noconv;
// Convert the external chars [__from, __from_end) into the char32_t buffer
// [__to, __to_end) via utf16_in, passing _M_maxcode and a locally built mode.
// __from_next / __to_next receive the ranges' `next` members after the call;
// the result of utf16_in is returned unchanged.
1670 codecvt_base::result
1671 __codecvt_utf8_utf16_base<char32_t>::
1672 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1673 const extern_type*& __from_next,
1674 intern_type* __to, intern_type* __to_end,
1675 intern_type*& __to_next) const
1677 range<const char> from{ __from, __from_end };
1678 range<char32_t> to{ __to, __to_end };
// Keep only the header flags of _M_mode; the little_endian bit is taken from
// the host byte order instead of from the user's mode.
1679 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1680 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1681 mode = codecvt_mode(mode | little_endian);
1682 #endif
1683 auto res = utf16_in(from, to, _M_maxcode, mode);
1684 __from_next = from.next;
1685 __to_next = to.next;
1686 return res;
// Always returns 0 (see the trailing comment for the reason).
1690 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
1691 { return 0; } // UTF-8 is not a fixed-width encoding
// Always returns false.
1693 bool
1694 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1695 { return false; }
// Returns the distance, in chars, from __from to the pointer returned by
// utf16_span(__from, __end, __max, _M_maxcode, _M_mode).  The state argument
// is unused.
1698 __codecvt_utf8_utf16_base<char32_t>::
1699 do_length(state_type&, const extern_type* __from,
1700 const extern_type* __end, size_t __max) const
1702 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1703 return __end - __from;
// Returns 4, plus sizeof(utf8_bom) when _M_mode has consume_header set.
1707 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1709 // A single character can be 1 or 2 UTF-16 code units,
1710 // requiring up to 4 UTF-8 code units.
1711 int max = 4;
1712 if (_M_mode & consume_header)
1713 max += sizeof(utf8_bom);
1714 return max;
1717 #ifdef _GLIBCXX_USE_WCHAR_T
1718 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1719 // Converts from UTF-8 to UTF-16.
// Out-of-line destructor definition; the body is empty.
1721 __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
// Convert the internal wchar_t sequence [__from, __from_end) into chars in
// [__to, __to_end) via utf16_out, passing _M_maxcode and _M_mode.  Unlike
// the other wchar_t facets in this file, the input is wrapped directly in
// range<const wchar_t> with no reinterpret_cast and no size dispatch.
// __from_next / __to_next receive the ranges' `next` members after the call.
1723 codecvt_base::result
1724 __codecvt_utf8_utf16_base<wchar_t>::
1725 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1726 const intern_type*& __from_next,
1727 extern_type* __to, extern_type* __to_end,
1728 extern_type*& __to_next) const
1730 range<const wchar_t> from{ __from, __from_end };
1731 range<char> to{ __to, __to_end };
1732 auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1733 __from_next = from.next;
1734 __to_next = to.next;
1735 return res;
// Writes nothing: sets __to_next to __to and returns noconv.
1738 codecvt_base::result
1739 __codecvt_utf8_utf16_base<wchar_t>::
1740 do_unshift(state_type&, extern_type* __to, extern_type*,
1741 extern_type*& __to_next) const
1743 __to_next = __to;
1744 return noconv;
// Convert the external chars [__from, __from_end) into the wchar_t buffer
// [__to, __to_end) via utf16_in, passing _M_maxcode and a locally built mode.
// The output is wrapped directly in range<wchar_t> (no reinterpret_cast).
// __from_next / __to_next receive the ranges' `next` members after the call;
// the result of utf16_in is returned unchanged.
1747 codecvt_base::result
1748 __codecvt_utf8_utf16_base<wchar_t>::
1749 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1750 const extern_type*& __from_next,
1751 intern_type* __to, intern_type* __to_end,
1752 intern_type*& __to_next) const
1754 range<const char> from{ __from, __from_end };
1755 range<wchar_t> to{ __to, __to_end };
// Keep only the header flags of _M_mode; the little_endian bit is taken from
// the host byte order instead of from the user's mode.
1756 codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1757 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1758 mode = codecvt_mode(mode | little_endian);
1759 #endif
1760 auto res = utf16_in(from, to, _M_maxcode, mode);
1761 __from_next = from.next;
1762 __to_next = to.next;
1763 return res;
// Always returns 0 (see the trailing comment for the reason).
1767 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
1768 { return 0; } // UTF-8 is not a fixed-width encoding
// Always returns false.
1770 bool
1771 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1772 { return false; }
// Returns the distance, in chars, from __from to the pointer returned by
// utf16_span(__from, __end, __max, _M_maxcode, _M_mode).  The state argument
// is unused.
1775 __codecvt_utf8_utf16_base<wchar_t>::
1776 do_length(state_type&, const extern_type* __from,
1777 const extern_type* __end, size_t __max) const
1779 __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1780 return __end - __from;
// Returns 4, plus sizeof(utf8_bom) when _M_mode has consume_header set.
1784 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1786 // A single character can be 1 or 2 UTF-16 code units,
1787 // requiring up to 4 UTF-8 code units.
1788 int max = 4;
1789 if (_M_mode & consume_header)
1790 max += sizeof(utf8_bom);
1791 return max;
1793 #endif
// Explicit instantiations emitted from this translation unit:
// __codecvt_abstract_base and codecvt_byname for char16_t/char32_t internal
// types with char as the external type, and the same set with char8_t as
// the external type when _GLIBCXX_USE_CHAR8_T is defined.
// NOTE(review): `inline template class` is not standard C++ syntax;
// presumably a GNU extension -- confirm against the GCC manual.
1795 inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1796 inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
1797 template class codecvt_byname<char16_t, char, mbstate_t>;
1798 template class codecvt_byname<char32_t, char, mbstate_t>;
1800 #if defined(_GLIBCXX_USE_CHAR8_T)
1801 inline template class __codecvt_abstract_base<char16_t, char8_t, mbstate_t>;
1802 inline template class __codecvt_abstract_base<char32_t, char8_t, mbstate_t>;
1803 template class codecvt_byname<char16_t, char8_t, mbstate_t>;
1804 template class codecvt_byname<char32_t, char8_t, mbstate_t>;
1805 #endif
1807 _GLIBCXX_END_NAMESPACE_VERSION