libstdc++-v3/src/c++11/codecvt.cc

   1 // Locale support (codecvt) -*- C++ -*-
   2
   3 // Copyright (C) 2015-2024 Free Software Foundation, Inc.
   4 //
   5 // This file is part of the GNU ISO C++ Library.  This library is free
   6 // software; you can redistribute it and/or modify it under the
   7 // terms of the GNU General Public License as published by the
   8 // Free Software Foundation; either version 3, or (at your option)
   9 // any later version.
  10
  11 // This library is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // Under Section 7 of GPL version 3, you are granted additional
  17 // permissions described in the GCC Runtime Library Exception, version
  18 // 3.1, as published by the Free Software Foundation.
  19
  20 // You should have received a copy of the GNU General Public License and
  21 // a copy of the GCC Runtime Library Exception along with this program;
  22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 // <http://www.gnu.org/licenses/>.
  24
  25 #include <codecvt>
  26 #include <cstring>              // std::memcpy, std::memcmp
  27 #include <bits/stl_algobase.h>  // std::min
  28
  29 namespace std _GLIBCXX_VISIBILITY(default)
  30 {
  31 _GLIBCXX_BEGIN_NAMESPACE_VERSION
  32
  33   // The standard doesn't define these operators, which is annoying.
  // Return the value of m converted to the underlying integer type of
  // codecvt_mode, so that bitwise arithmetic can be applied to it.
  static underlying_type<codecvt_mode>::type
  to_integer(codecvt_mode m)
  {
    using mode_int = underlying_type<codecvt_mode>::type;
    return static_cast<mode_int>(m);
  }
  37
  38   static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
  39   { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
  40
  41   static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
  42   { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
  43
  44   static codecvt_mode operator~(codecvt_mode m)
  45   { return codecvt_mode(~to_integer(m)); }
  46
  47 namespace
  48 {
  // The greatest code point representable by one UTF-16 code unit.
  constexpr char32_t max_single_utf16_unit = 0xFFFF;

  // The greatest code point accepted by default (U+10FFFF).
  constexpr char32_t max_code_point = 0x10FFFF;

  // Sentinel results returned by the read_*_code_point functions below.
  // Those functions depend on maxcode < incomplete_mb_character, which
  // the codecvt_utf* classes enforce on construction.
  constexpr char32_t incomplete_mb_character = static_cast<char32_t>(-2);
  constexpr char32_t invalid_mb_sequence = static_cast<char32_t>(-1);
  58
  // Cursor over the pointer range [next, end), used for reading and
  // writing code units of type Elem.
  template<typename Elem, bool Aligned = true>
    struct range
    {
      Elem* next;
      Elem* end;

      // Store one code unit at the cursor, then step past it.
      range& operator=(Elem e)
      {
        *next = e;
        ++next;
        return *this;
      }

      // The code unit at the cursor.
      Elem operator*() const { return next[0]; }

      // The code unit n positions after the cursor.
      Elem operator[](size_t n) const { return *(next + n); }

      // Step the cursor forward by one code unit.
      range& operator++()
      {
        next += 1;
        return *this;
      }

      // Step the cursor forward by n code units.
      range& operator+=(size_t n)
      {
        next = next + n;
        return *this;
      }

      // How many code units are left between the cursor and the end.
      size_t size() const { return static_cast<size_t>(end - next); }

      // How many bytes are left between the cursor and the end.
      size_t nbytes() const
      {
        const char* const first = reinterpret_cast<const char*>(next);
        const char* const last = reinterpret_cast<const char*>(end);
        return last - first;
      }
    };
 100
 101   // This specialization is used when accessing char16_t values through
 102   // pointers to char, which might not be correctly aligned for char16_t.
 103   template<typename Elem>
 104     struct range<Elem, false>
 105     {
 106       using value_type = typename remove_const<Elem>::type;
 107
 108       using char_pointer = typename
 109         conditional<is_const<Elem>::value, const char*, char*>::type;
 110
 111       char_pointer next;
 112       char_pointer end;
 113
 114       // Write a code unit.
 115       range& operator=(Elem e)
 116       {
 117         memcpy(next, &e, sizeof(Elem));
 118         ++*this;
 119         return *this;
 120       }
 121
 122       // Read the next code unit.
 123       Elem operator*() const
 124       {
 125         value_type e;
 126         memcpy(&e, next, sizeof(Elem));
 127         return e;
 128       }
 129
 130       // Read the Nth code unit.
 131       Elem operator[](size_t n) const
 132       {
 133         value_type e;
 134         memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
 135         return e;
 136       }
 137
 138       // Move to the next code unit.
 139       range& operator++()
 140       {
 141         next += sizeof(Elem);
 142         return *this;
 143       }
 144
 145       // Move to the Nth code unit.
 146       range& operator+=(size_t n)
 147       {
 148         next += n * sizeof(Elem);
 149         return *this;
 150       }
 151
 152       // The number of code units remaining.
 153       size_t size() const { return nbytes() / sizeof(Elem); }
 154
 155       // The number of bytes remaining.
 156       size_t nbytes() const { return end - next; }
 157     };
 158
 159   // Multibyte sequences can have "header" consisting of Byte Order Mark
 160   const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
 161   const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
 162   const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
 163
 164   // Write a BOM (space permitting).
 165   template<typename C, bool A, size_t N>
 166     bool
 167     write_bom(range<C, A>& to, const unsigned char (&bom)[N])
 168     {
 169       static_assert( (N / sizeof(C)) != 0, "" );
 170       static_assert( (N % sizeof(C)) == 0, "" );
 171
 172       if (to.nbytes() < N)
 173         return false;
 174       memcpy(to.next, bom, N);
 175       to += (N / sizeof(C));
 176       return true;
 177     }
 178
 179   // Try to read a BOM.
 180   template<typename C, bool A, size_t N>
 181     bool
 182     read_bom(range<C, A>& from, const unsigned char (&bom)[N])
 183     {
 184       static_assert( (N / sizeof(C)) != 0, "" );
 185       static_assert( (N % sizeof(C)) == 0, "" );
 186
 187       if (from.nbytes() >= N && !memcmp(from.next, bom, N))
 188         {
 189           from += (N / sizeof(C));
 190           return true;
 191         }
 192       return false;
 193     }
 194
 195   // If generate_header is set in mode write out UTF-8 BOM.
 196   template<typename C>
 197   bool
 198   write_utf8_bom(range<C>& to, codecvt_mode mode)
 199   {
 200     if (mode & generate_header)
 201       return write_bom(to, utf8_bom);
 202     return true;
 203   }
 204
 205   // If generate_header is set in mode write out the UTF-16 BOM indicated
 206   // by whether little_endian is set in mode.
 207   template<bool Aligned>
 208   bool
 209   write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
 210   {
 211     if (mode & generate_header)
 212     {
 213       if (mode & little_endian)
 214         return write_bom(to, utf16le_bom);
 215       else
 216         return write_bom(to, utf16_bom);
 217     }
 218     return true;
 219   }
 220
 221   // If consume_header is set in mode update from.next to after any BOM.
 222   template<typename C>
 223   void
 224   read_utf8_bom(range<const C>& from, codecvt_mode mode)
 225   {
 226     if (mode & consume_header)
 227       read_bom(from, utf8_bom);
 228   }
 229
 230   // If consume_header is not set in mode, no effects.
 231   // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
 232   // - if the UTF-16BE BOM was found unset little_endian in mode, or
 233   // - if the UTF-16LE BOM was found set little_endian in mode.
 234   template<bool Aligned>
 235   void
 236   read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
 237   {
 238     if (mode & consume_header)
 239       {
 240         if (read_bom(from, utf16_bom))
 241           mode &= ~little_endian;
 242         else if (read_bom(from, utf16le_bom))
 243           mode |= little_endian;
 244       }
 245   }
 246
 247   // Read a codepoint from a UTF-8 multibyte sequence.
 248   // Updates from.next if the codepoint is not greater than maxcode.
 249   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
 250   template<typename C>
 251   char32_t
 252   read_utf8_code_point(range<const C>& from, unsigned long maxcode)
 253   {
 254     const size_t avail = from.size();
 255     if (avail == 0)
 256       return incomplete_mb_character;
 257     char32_t c1 = (unsigned char) from[0];
 258     // https://en.wikipedia.org/wiki/UTF-8#Sample_code
 259     if (c1 < 0x80) [[likely]]
 260     {
 261       ++from;
 262       return c1;
 263     }
 264     else if (c1 < 0xC2) [[unlikely]] // continuation or overlong 2-byte sequence
 265       return invalid_mb_sequence;
 266     else if (c1 < 0xE0) // 2-byte sequence
 267     {
 268       if (avail < 2) [[unlikely]]
 269         return incomplete_mb_character;
 270       char32_t c2 = (unsigned char) from[1];
 271       if ((c2 & 0xC0) != 0x80) [[unlikely]]
 272         return invalid_mb_sequence;
 273       char32_t c = (c1 << 6) + c2 - 0x3080;
 274       if (c <= maxcode)
 275         from += 2;
 276       return c;
 277     }
 278     else if (c1 < 0xF0) // 3-byte sequence
 279     {
 280       if (avail < 2) [[unlikely]]
 281         return incomplete_mb_character;
 282       char32_t c2 = (unsigned char) from[1];
 283       if ((c2 & 0xC0) != 0x80) [[unlikely]]
 284         return invalid_mb_sequence;
 285       if (c1 == 0xE0 && c2 < 0xA0) [[unlikely]] // overlong
 286         return invalid_mb_sequence;
 287       if (c1 == 0xED && c2 >= 0xA0) [[unlikely]] // surrogate
 288         return invalid_mb_sequence;
 289       if (avail < 3) [[unlikely]]
 290         return incomplete_mb_character;
 291       char32_t c3 = (unsigned char) from[2];
 292       if ((c3 & 0xC0) != 0x80) [[unlikely]]
 293         return invalid_mb_sequence;
 294       char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
 295       if (c <= maxcode)
 296         from += 3;
 297       return c;
 298     }
 299     else if (c1 < 0xF5 && maxcode > 0xFFFF) // 4-byte sequence
 300     {
 301       if (avail < 2) [[unlikely]]
 302         return incomplete_mb_character;
 303       char32_t c2 = (unsigned char) from[1];
 304       if ((c2 & 0xC0) != 0x80) [[unlikely]]
 305         return invalid_mb_sequence;
 306       if (c1 == 0xF0 && c2 < 0x90) [[unlikely]] // overlong
 307         return invalid_mb_sequence;
 308       if (c1 == 0xF4 && c2 >= 0x90) [[unlikely]] // > U+10FFFF
 309         return invalid_mb_sequence;
 310       if (avail < 3) [[unlikely]]
 311         return incomplete_mb_character;
 312       char32_t c3 = (unsigned char) from[2];
 313       if ((c3 & 0xC0) != 0x80) [[unlikely]]
 314         return invalid_mb_sequence;
 315       if (avail < 4) [[unlikely]]
 316         return incomplete_mb_character;
 317       char32_t c4 = (unsigned char) from[3];
 318       if ((c4 & 0xC0) != 0x80) [[unlikely]]
 319         return invalid_mb_sequence;
 320       char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
 321       if (c <= maxcode)
 322         from += 4;
 323       return c;
 324     }
 325     else [[unlikely]] // > U+10FFFF
 326       return invalid_mb_sequence;
 327   }
 328
 329   template<typename C>
 330   bool
 331   write_utf8_code_point(range<C>& to, char32_t code_point)
 332   {
 333     if (code_point < 0x80)
 334       {
 335         if (to.size() < 1) [[unlikely]]
 336           return false;
 337         to = code_point;
 338       }
 339     else if (code_point <= 0x7FF)
 340       {
 341         if (to.size() < 2) [[unlikely]]
 342           return false;
 343         to = (code_point >> 6) + 0xC0;
 344         to = (code_point & 0x3F) + 0x80;
 345       }
 346     else if (code_point <= 0xFFFF)
 347       {
 348         if (to.size() < 3) [[unlikely]]
 349           return false;
 350         to = (code_point >> 12) + 0xE0;
 351         to = ((code_point >> 6) & 0x3F) + 0x80;
 352         to = (code_point & 0x3F) + 0x80;
 353       }
 354     else if (code_point <= 0x10FFFF)
 355       {
 356         if (to.size() < 4) [[unlikely]]
 357           return false;
 358         to = (code_point >> 18) + 0xF0;
 359         to = ((code_point >> 12) & 0x3F) + 0x80;
 360         to = ((code_point >> 6) & 0x3F) + 0x80;
 361         to = (code_point & 0x3F) + 0x80;
 362       }
 363     else [[unlikely]]
 364       return false;
 365     return true;
 366   }
 367
 368   inline char16_t
 369   adjust_byte_order(char16_t c, codecvt_mode mode)
 370   {
 371 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 372     return (mode & little_endian) ? __builtin_bswap16(c) : c;
 373 #else
 374     return (mode & little_endian) ? c : __builtin_bswap16(c);
 375 #endif
 376   }
 377
 378   // Return true if c is a high-surrogate (aka leading) code point.
 379   inline bool
 380   is_high_surrogate(char32_t c)
 381   {
 382     return c >= 0xD800 && c <= 0xDBFF;
 383   }
 384
 385   // Return true if c is a low-surrogate (aka trailing) code point.
 386   inline bool
 387   is_low_surrogate(char32_t c)
 388   {
 389     return c >= 0xDC00 && c <= 0xDFFF;
 390   }
 391
 392   inline char32_t
 393   surrogate_pair_to_code_point(char32_t high, char32_t low)
 394   {
 395     return (high << 10) + low - 0x35FDC00;
 396   }
 397
 398   // Read a codepoint from a UTF-16 multibyte sequence.
 399   // The sequence's endianness is indicated by (mode & little_endian).
 400   // Updates from.next if the codepoint is not greater than maxcode.
 401   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
 402   template<bool Aligned>
 403     char32_t
 404     read_utf16_code_point(range<const char16_t, Aligned>& from,
 405                           unsigned long maxcode, codecvt_mode mode)
 406     {
 407       const size_t avail = from.size();
 408       if (avail == 0) [[unlikely]]
 409         return incomplete_mb_character;
 410       int inc = 1;
 411       char32_t c = adjust_byte_order(from[0], mode);
 412       if (is_high_surrogate(c))
 413         {
 414           if (avail < 2) [[unlikely]]
 415             return incomplete_mb_character;
 416           const char16_t c2 = adjust_byte_order(from[1], mode);
 417           if (is_low_surrogate(c2)) [[likely]]
 418             {
 419               c = surrogate_pair_to_code_point(c, c2);
 420               inc = 2;
 421             }
 422           else
 423             return invalid_mb_sequence;
 424         }
 425       else if (is_low_surrogate(c)) [[unlikely]]
 426         return invalid_mb_sequence;
 427       if (c <= maxcode)
 428         from += inc;
 429       return c;
 430     }
 431
 432   template<typename C, bool A>
 433   bool
 434   write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
 435   {
 436     static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
 437
 438     if (codepoint <= max_single_utf16_unit)
 439       {
 440         if (to.size() > 0)
 441           {
 442             to = adjust_byte_order(codepoint, mode);
 443             return true;
 444           }
 445       }
 446     else if (to.size() > 1)
 447       {
 448         // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
 449         const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
 450         char16_t lead = LEAD_OFFSET + (codepoint >> 10);
 451         char16_t trail = 0xDC00 + (codepoint & 0x3FF);
 452         to = adjust_byte_order(lead, mode);
 453         to = adjust_byte_order(trail, mode);
 454         return true;
 455       }
 456     return false;
 457   }
 458
 459   // utf8 -> ucs4
 460   template<typename C>
 461   codecvt_base::result
 462   ucs4_in(range<const C>& from, range<char32_t>& to,
 463           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 464   {
 465     read_utf8_bom(from, mode);
 466     while (from.size() && to.size())
 467       {
 468         const char32_t codepoint = read_utf8_code_point(from, maxcode);
 469         if (codepoint == incomplete_mb_character) [[unlikely]]
 470           return codecvt_base::partial;
 471         if (codepoint > maxcode) [[unlikely]]
 472           return codecvt_base::error;
 473         to = codepoint;
 474       }
 475     return from.size() ? codecvt_base::partial : codecvt_base::ok;
 476   }
 477
 478   // ucs4 -> utf8
 479   template<typename C>
 480   codecvt_base::result
 481   ucs4_out(range<const char32_t>& from, range<C>& to,
 482            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 483   {
 484     if (!write_utf8_bom(to, mode)) [[unlikely]]
 485       return codecvt_base::partial;
 486     while (from.size())
 487       {
 488         const char32_t c = from[0];
 489         if (0xD800 <= c && c <= 0xDFFF) [[unlikely]]
 490           return codecvt_base::error;
 491         if (c > maxcode) [[unlikely]]
 492           return codecvt_base::error;
 493         if (!write_utf8_code_point(to, c)) [[unlikely]]
 494           return codecvt_base::partial;
 495         ++from;
 496       }
 497     return codecvt_base::ok;
 498   }
 499
 500   // utf16 -> ucs4
 501   codecvt_base::result
 502   ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
 503           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 504   {
 505     read_utf16_bom(from, mode);
 506     while (from.size() && to.size())
 507       {
 508         const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
 509         if (codepoint == incomplete_mb_character) [[unlikely]]
 510           return codecvt_base::partial;
 511         if (codepoint > maxcode) [[unlikely]]
 512           return codecvt_base::error;
 513         to = codepoint;
 514       }
 515     return from.nbytes() ? codecvt_base::partial : codecvt_base::ok;
 516   }
 517
 518   // ucs4 -> utf16
 519   codecvt_base::result
 520   ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
 521            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 522   {
 523     if (!write_utf16_bom(to, mode)) [[unlikely]]
 524       return codecvt_base::partial;
 525     while (from.size())
 526       {
 527         const char32_t c = from[0];
 528         if (0xD800 <= c && c <= 0xDFFF) [[unlikely]]
 529           return codecvt_base::error;
 530         if (c > maxcode) [[unlikely]]
 531           return codecvt_base::error;
 532         if (!write_utf16_code_point(to, c, mode)) [[unlikely]]
 533           return codecvt_base::partial;
 534         ++from;
 535       }
 536     return codecvt_base::ok;
 537   }
 538
 539   // Flag indicating whether to process UTF-16 or UCS2
 540   enum class surrogates { allowed, disallowed };
 541
 542   // utf8 -> utf16 (or utf8 -> ucs2 if maxcode <= 0xFFFF)
 543   template <typename C8, typename C16>
 544   codecvt_base::result
 545   utf16_in(range<const C8> &from, range<C16> &to,
 546            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 547   {
 548     read_utf8_bom(from, mode);
 549     while (from.size() && to.size())
 550       {
 551         auto orig = from;
 552         const char32_t codepoint = read_utf8_code_point(from, maxcode);
 553         if (codepoint == incomplete_mb_character) [[unlikely]]
 554           return codecvt_base::partial;
 555         if (codepoint > maxcode)
 556           return codecvt_base::error;
 557         if (!write_utf16_code_point(to, codepoint, mode)) [[unlikely]]
 558           {
 559             from = orig; // rewind to previous position
 560             return codecvt_base::partial;
 561           }
 562       }
 563     return from.size() ? codecvt_base::partial : codecvt_base::ok;
 564   }
 565
 566   // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
 567   template<typename C16, typename C8>
 568   codecvt_base::result
 569   utf16_out(range<const C16>& from, range<C8>& to,
 570             unsigned long maxcode = max_code_point, codecvt_mode mode = {},
 571             surrogates s = surrogates::allowed)
 572   {
 573     if (!write_utf8_bom(to, mode)) [[unlikely]]
 574       return codecvt_base::partial;
 575     while (from.size())
 576       {
 577         char32_t c = from[0];
 578         int inc = 1;
 579         if (is_high_surrogate(c))
 580           {
 581             if (s == surrogates::disallowed) [[unlikely]]
 582               return codecvt_base::error; // No surrogates in UCS-2
 583
 584             if (from.size() < 2) [[unlikely]]
 585               return codecvt_base::partial; // stop converting at this point
 586
 587             const char32_t c2 = from[1];
 588             if (is_low_surrogate(c2)) [[likely]]
 589               {
 590                 c = surrogate_pair_to_code_point(c, c2);
 591                 inc = 2;
 592               }
 593             else
 594               return codecvt_base::error;
 595           }
 596         else if (is_low_surrogate(c)) [[unlikely]]
 597           return codecvt_base::error;
 598         if (c > maxcode) [[unlikely]]
 599           return codecvt_base::error;
 600         if (!write_utf8_code_point(to, c)) [[unlikely]]
 601           return codecvt_base::partial;
 602         from += inc;
 603       }
 604     return codecvt_base::ok;
 605   }
 606
 607   // return pos such that [begin,pos) is valid UTF-16 string no longer than max
 608   template<typename C>
 609   const C*
 610   utf16_span(const C* begin, const C* end, size_t max,
 611              char32_t maxcode = max_code_point, codecvt_mode mode = {})
 612   {
 613     range<const C> from{ begin, end };
 614     read_utf8_bom(from, mode);
 615     size_t count = 0;
 616     while (count+1 < max)
 617       {
 618         char32_t c = read_utf8_code_point(from, maxcode);
 619         if (c > maxcode)
 620           return from.next;
 621         else if (c > max_single_utf16_unit)
 622           ++count;
 623         ++count;
 624       }
 625     if (count+1 == max) // take one more character if it fits in a single unit
 626       read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
 627     return from.next;
 628   }
 629
 630   // utf8 -> ucs2
 631   template<typename C>
 632   codecvt_base::result
 633   ucs2_in(range<const C>& from, range<char16_t>& to,
 634           char32_t maxcode = max_code_point, codecvt_mode mode = {})
 635   {
 636     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 637     maxcode = std::min(max_single_utf16_unit, maxcode);
 638     return utf16_in(from, to, maxcode, mode);
 639   }
 640
 641   // ucs2 -> utf8
 642   template<typename C>
 643   codecvt_base::result
 644   ucs2_out(range<const char16_t>& from, range<C>& to,
 645            char32_t maxcode = max_code_point, codecvt_mode mode = {})
 646   {
 647     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 648     maxcode = std::min(max_single_utf16_unit, maxcode);
 649     return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
 650   }
 651
 652   // ucs2 -> utf16
 653   codecvt_base::result
 654   ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
 655            char32_t maxcode = max_code_point, codecvt_mode mode = {})
 656   {
 657     if (!write_utf16_bom(to, mode))
 658       return codecvt_base::partial;
 659     while (from.size() && to.size())
 660       {
 661         char16_t c = from[0];
 662         if (0xD800 <= c && c <= 0xDFFF)
 663           return codecvt_base::error;
 664         if (c > maxcode)
 665           return codecvt_base::error;
 666         to = adjust_byte_order(c, mode);
 667         ++from;
 668       }
 669     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
 670   }
 671
 672   // utf16 -> ucs2
 673   codecvt_base::result
 674   ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
 675           char32_t maxcode = max_code_point, codecvt_mode mode = {})
 676   {
 677     read_utf16_bom(from, mode);
 678     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 679     maxcode = std::min(max_single_utf16_unit, maxcode);
 680     while (from.size() && to.size())
 681       {
 682         const char32_t c = read_utf16_code_point(from, maxcode, mode);
 683         if (c == incomplete_mb_character)
 684           return codecvt_base::error; // UCS-2 only supports single units.
 685         if (c > maxcode)
 686           return codecvt_base::error;
 687         to = c;
 688       }
 689     return from.nbytes() == 0 ? codecvt_base::ok : codecvt_base::partial;
 690   }
 691
 692   const char16_t*
 693   ucs2_span(range<const char16_t, false>& from, size_t max,
 694             char32_t maxcode, codecvt_mode mode)
 695   {
 696     read_utf16_bom(from, mode);
 697     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 698     maxcode = std::min(max_single_utf16_unit, maxcode);
 699     char32_t c = 0;
 700     while (max-- && c <= maxcode)
 701       c = read_utf16_code_point(from, maxcode, mode);
 702     return reinterpret_cast<const char16_t*>(from.next);
 703   }
 704
 705   template<typename C>
 706   const C*
 707   ucs2_span(const C* begin, const C* end, size_t max,
 708             char32_t maxcode, codecvt_mode mode)
 709   {
 710     range<const C> from{ begin, end };
 711     read_utf8_bom(from, mode);
 712     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 713     maxcode = std::min(max_single_utf16_unit, maxcode);
 714     char32_t c = 0;
 715     while (max-- && c <= maxcode)
 716       c = read_utf8_code_point(from, maxcode);
 717     return from.next;
 718   }
 719
 720   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
 721   template<typename C>
 722   const C*
 723   ucs4_span(const C* begin, const C* end, size_t max,
 724             char32_t maxcode = max_code_point, codecvt_mode mode = {})
 725   {
 726     range<const C> from{ begin, end };
 727     read_utf8_bom(from, mode);
 728     char32_t c = 0;
 729     while (max-- && c <= maxcode)
 730       c = read_utf8_code_point(from, maxcode);
 731     return from.next;
 732   }
 733
 734   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
 735   const char16_t*
 736   ucs4_span(range<const char16_t, false>& from, size_t max,
 737             char32_t maxcode = max_code_point, codecvt_mode mode = {})
 738   {
 739     read_utf16_bom(from, mode);
 740     char32_t c = 0;
 741     while (max-- && c <= maxcode)
 742       c = read_utf16_code_point(from, maxcode, mode);
 743     return reinterpret_cast<const char16_t*>(from.next);
 744   }
 745 }
 746
 747 // Define members of codecvt<char16_t, char, mbstate_t> specialization.
 748 // Converts from UTF-8 to UTF-16.
 749
 750 locale::id codecvt<char16_t, char, mbstate_t>::id;
 751
 752 codecvt<char16_t, char, mbstate_t>::~codecvt() { }
 753
 754 codecvt_base::result
 755 codecvt<char16_t, char, mbstate_t>::
 756 do_out(state_type&,
 757        const intern_type* __from,
 758        const intern_type* __from_end, const intern_type*& __from_next,
 759        extern_type* __to, extern_type* __to_end,
 760        extern_type*& __to_next) const
 761 {
 762   range<const char16_t> from{ __from, __from_end };
 763   range<char> to{ __to, __to_end };
 764   auto res = utf16_out(from, to);
 765   __from_next = from.next;
 766   __to_next = to.next;
 767   return res;
 768 }
 769
 770 codecvt_base::result
 771 codecvt<char16_t, char, mbstate_t>::
 772 do_unshift(state_type&, extern_type* __to, extern_type*,
 773            extern_type*& __to_next) const
 774 {
 775   __to_next = __to;
 776   return noconv; // we don't use mbstate_t for the unicode facets
 777 }
 778
 779 codecvt_base::result
 780 codecvt<char16_t, char, mbstate_t>::
 781 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 782       const extern_type*& __from_next,
 783       intern_type* __to, intern_type* __to_end,
 784       intern_type*& __to_next) const
 785 {
 786   range<const char> from{ __from, __from_end };
 787   range<char16_t> to{ __to, __to_end };
 788 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 789   codecvt_mode mode = {};
 790 #else
 791   codecvt_mode mode = little_endian;
 792 #endif
 793   auto res = utf16_in(from, to, max_code_point, mode);
 794   __from_next = from.next;
 795   __to_next = to.next;
 796   return res;
 797 }
 798
 799 int
 800 codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
 801 { return 0; } // UTF-8 is not a fixed-width encoding
 802
 803 bool
 804 codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
 805 { return false; }
 806
 807 int
 808 codecvt<char16_t, char, mbstate_t>::
 809 do_length(state_type&, const extern_type* __from,
 810           const extern_type* __end, size_t __max) const
 811 {
 812   __end = utf16_span(__from, __end, __max);
 813   return __end - __from;
 814 }
 815
 816 int
 817 codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
 818 {
 819   // A single character (one or two UTF-16 code units) requires
 820   // up to four UTF-8 code units.
 821   return 4;
 822 }
 823
 824 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
 825 // Converts from UTF-8 to UTF-32 (aka UCS-4).
 826
 827 locale::id codecvt<char32_t, char, mbstate_t>::id;
 828
 829 codecvt<char32_t, char, mbstate_t>::~codecvt() { }
 830
 831 codecvt_base::result
 832 codecvt<char32_t, char, mbstate_t>::
 833 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
 834        const intern_type*& __from_next,
 835        extern_type* __to, extern_type* __to_end,
 836        extern_type*& __to_next) const
 837 {
 838   range<const char32_t> from{ __from, __from_end };
 839   range<char> to{ __to, __to_end };
 840   auto res = ucs4_out(from, to);
 841   __from_next = from.next;
 842   __to_next = to.next;
 843   return res;
 844 }
 845
 846 codecvt_base::result
 847 codecvt<char32_t, char, mbstate_t>::
 848 do_unshift(state_type&, extern_type* __to, extern_type*,
 849            extern_type*& __to_next) const
 850 {
 851   __to_next = __to;
 852   return noconv;
 853 }
 854
 855 codecvt_base::result
 856 codecvt<char32_t, char, mbstate_t>::
 857 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 858       const extern_type*& __from_next,
 859       intern_type* __to, intern_type* __to_end,
 860       intern_type*& __to_next) const
 861 {
 862   range<const char> from{ __from, __from_end };
 863   range<char32_t> to{ __to, __to_end };
 864   auto res = ucs4_in(from, to);
 865   __from_next = from.next;
 866   __to_next = to.next;
 867   return res;
 868 }
 869
 870 int
 871 codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
 872 { return 0; } // UTF-8 is not a fixed-width encoding
 873
 874 bool
 875 codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
 876 { return false; }
 877
 878 int
 879 codecvt<char32_t, char, mbstate_t>::
 880 do_length(state_type&, const extern_type* __from,
 881           const extern_type* __end, size_t __max) const
 882 {
 883   __end = ucs4_span(__from, __end, __max);
 884   return __end - __from;
 885 }
 886
 887 int
 888 codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
 889 {
 890   // A single character (one UTF-32 code unit) requires
 891   // up to 4 UTF-8 code units.
 892   return 4;
 893 }
 894
 895 #if defined(_GLIBCXX_USE_CHAR8_T)
 896 // Define members of codecvt<char16_t, char8_t, mbstate_t> specialization.
 897 // Converts from UTF-8 to UTF-16.
 898
 899 locale::id codecvt<char16_t, char8_t, mbstate_t>::id;
 900
 901 codecvt<char16_t, char8_t, mbstate_t>::~codecvt() { }
 902
 903 codecvt_base::result
 904 codecvt<char16_t, char8_t, mbstate_t>::
 905 do_out(state_type&,
 906        const intern_type* __from,
 907        const intern_type* __from_end, const intern_type*& __from_next,
 908        extern_type* __to, extern_type* __to_end,
 909        extern_type*& __to_next) const
 910 {
 911   range<const char16_t> from{ __from, __from_end };
 912   range<char8_t> to{ __to, __to_end };
 913   auto res = utf16_out(from, to);
 914   __from_next = from.next;
 915   __to_next = to.next;
 916   return res;
 917 }
 918
 919 codecvt_base::result
 920 codecvt<char16_t, char8_t, mbstate_t>::
 921 do_unshift(state_type&, extern_type* __to, extern_type*,
 922            extern_type*& __to_next) const
 923 {
 924   __to_next = __to;
 925   return noconv; // we don't use mbstate_t for the unicode facets
 926 }
 927
 928 codecvt_base::result
 929 codecvt<char16_t, char8_t, mbstate_t>::
 930 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 931       const extern_type*& __from_next,
 932       intern_type* __to, intern_type* __to_end,
 933       intern_type*& __to_next) const
 934 {
 935   range<const char8_t> from{ __from, __from_end };
 936   range<char16_t> to{ __to, __to_end };
 937 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 938   codecvt_mode mode = {};
 939 #else
 940   codecvt_mode mode = little_endian;
 941 #endif
 942   auto res = utf16_in(from, to, max_code_point, mode);
 943   __from_next = from.next;
 944   __to_next = to.next;
 945   return res;
 946 }
 947
 948 int
 949 codecvt<char16_t, char8_t, mbstate_t>::do_encoding() const throw()
 950 { return 0; } // UTF-8 is not a fixed-width encoding
 951
 952 bool
 953 codecvt<char16_t, char8_t, mbstate_t>::do_always_noconv() const throw()
 954 { return false; }
 955
 956 int
 957 codecvt<char16_t, char8_t, mbstate_t>::
 958 do_length(state_type&, const extern_type* __from,
 959           const extern_type* __end, size_t __max) const
 960 {
 961   __end = utf16_span(__from, __end, __max);
 962   return __end - __from;
 963 }
 964
 965 int
 966 codecvt<char16_t, char8_t, mbstate_t>::do_max_length() const throw()
 967 {
 968   // A single character (one or two UTF-16 code units) requires
 969   // up to four UTF-8 code units.
 970   return 4;
 971 }
 972
 973 // Define members of codecvt<char32_t, char8_t, mbstate_t> specialization.
 974 // Converts from UTF-8 to UTF-32 (aka UCS-4).
 975
 976 locale::id codecvt<char32_t, char8_t, mbstate_t>::id;
 977
 978 codecvt<char32_t, char8_t, mbstate_t>::~codecvt() { }
 979
 980 codecvt_base::result
 981 codecvt<char32_t, char8_t, mbstate_t>::
 982 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
 983        const intern_type*& __from_next,
 984        extern_type* __to, extern_type* __to_end,
 985        extern_type*& __to_next) const
 986 {
 987   range<const char32_t> from{ __from, __from_end };
 988   range<char8_t> to{ __to, __to_end };
 989   auto res = ucs4_out(from, to);
 990   __from_next = from.next;
 991   __to_next = to.next;
 992   return res;
 993 }
 994
 995 codecvt_base::result
 996 codecvt<char32_t, char8_t, mbstate_t>::
 997 do_unshift(state_type&, extern_type* __to, extern_type*,
 998            extern_type*& __to_next) const
 999 {
1000   __to_next = __to;
1001   return noconv;
1002 }
1003
1004 codecvt_base::result
1005 codecvt<char32_t, char8_t, mbstate_t>::
1006 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1007       const extern_type*& __from_next,
1008       intern_type* __to, intern_type* __to_end,
1009       intern_type*& __to_next) const
1010 {
1011   range<const char8_t> from{ __from, __from_end };
1012   range<char32_t> to{ __to, __to_end };
1013   auto res = ucs4_in(from, to);
1014   __from_next = from.next;
1015   __to_next = to.next;
1016   return res;
1017 }
1018
1019 int
1020 codecvt<char32_t, char8_t, mbstate_t>::do_encoding() const throw()
1021 { return 0; } // UTF-8 is not a fixed-width encoding
1022
1023 bool
1024 codecvt<char32_t, char8_t, mbstate_t>::do_always_noconv() const throw()
1025 { return false; }
1026
1027 int
1028 codecvt<char32_t, char8_t, mbstate_t>::
1029 do_length(state_type&, const extern_type* __from,
1030           const extern_type* __end, size_t __max) const
1031 {
1032   __end = ucs4_span(__from, __end, __max);
1033   return __end - __from;
1034 }
1035
1036 int
1037 codecvt<char32_t, char8_t, mbstate_t>::do_max_length() const throw()
1038 {
1039   // A single character (one UTF-32 code unit) requires
1040   // up to 4 UTF-8 code units.
1041   return 4;
1042 }
1043 #endif // _GLIBCXX_USE_CHAR8_T
1044
1045 // Define members of codecvt_utf8<char16_t> base class implementation.
1046 // Converts from UTF-8 to UCS-2.
1047
1048 __codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
1049
1050 codecvt_base::result
1051 __codecvt_utf8_base<char16_t>::
1052 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1053        const intern_type*& __from_next,
1054        extern_type* __to, extern_type* __to_end,
1055        extern_type*& __to_next) const
1056 {
1057   range<const char16_t> from{ __from, __from_end };
1058   range<char> to{ __to, __to_end };
1059   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1060   __from_next = from.next;
1061   __to_next = to.next;
1062   return res;
1063 }
1064
1065 codecvt_base::result
1066 __codecvt_utf8_base<char16_t>::
1067 do_unshift(state_type&, extern_type* __to, extern_type*,
1068            extern_type*& __to_next) const
1069 {
1070   __to_next = __to;
1071   return noconv;
1072 }
1073
1074 codecvt_base::result
1075 __codecvt_utf8_base<char16_t>::
1076 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1077       const extern_type*& __from_next,
1078       intern_type* __to, intern_type* __to_end,
1079       intern_type*& __to_next) const
1080 {
1081   range<const char> from{ __from, __from_end };
1082   range<char16_t> to{ __to, __to_end };
1083   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1084 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1085   mode = codecvt_mode(mode | little_endian);
1086 #endif
1087   auto res = ucs2_in(from, to, _M_maxcode, mode);
1088   __from_next = from.next;
1089   __to_next = to.next;
1090   return res;
1091 }
1092
1093 int
1094 __codecvt_utf8_base<char16_t>::do_encoding() const throw()
1095 { return 0; } // UTF-8 is not a fixed-width encoding
1096
1097 bool
1098 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
1099 { return false; }
1100
1101 int
1102 __codecvt_utf8_base<char16_t>::
1103 do_length(state_type&, const extern_type* __from,
1104           const extern_type* __end, size_t __max) const
1105 {
1106   __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1107   return __end - __from;
1108 }
1109
1110 int
1111 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
1112 {
1113   // A single UCS-2 character requires up to three UTF-8 code units.
1114   // (UCS-2 cannot represent characters that use four UTF-8 code units).
1115   int max = 3;
1116   if (_M_mode & consume_header)
1117     max += sizeof(utf8_bom);
1118   return max;
1119 }
1120
1121 // Define members of codecvt_utf8<char32_t> base class implementation.
1122 // Converts from UTF-8 to UTF-32 (aka UCS-4).
1123
1124 __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
1125
1126 codecvt_base::result
1127 __codecvt_utf8_base<char32_t>::
1128 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1129        const intern_type*& __from_next,
1130        extern_type* __to, extern_type* __to_end,
1131        extern_type*& __to_next) const
1132 {
1133   range<const char32_t> from{ __from, __from_end };
1134   range<char> to{ __to, __to_end };
1135   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1136   __from_next = from.next;
1137   __to_next = to.next;
1138   return res;
1139 }
1140
1141 codecvt_base::result
1142 __codecvt_utf8_base<char32_t>::
1143 do_unshift(state_type&, extern_type* __to, extern_type*,
1144            extern_type*& __to_next) const
1145 {
1146   __to_next = __to;
1147   return noconv;
1148 }
1149
1150 codecvt_base::result
1151 __codecvt_utf8_base<char32_t>::
1152 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1153       const extern_type*& __from_next,
1154       intern_type* __to, intern_type* __to_end,
1155       intern_type*& __to_next) const
1156 {
1157   range<const char> from{ __from, __from_end };
1158   range<char32_t> to{ __to, __to_end };
1159   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1160   __from_next = from.next;
1161   __to_next = to.next;
1162   return res;
1163 }
1164
1165 int
1166 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
1167 { return 0; } // UTF-8 is not a fixed-width encoding
1168
1169 bool
1170 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1171 { return false; }
1172
1173 int
1174 __codecvt_utf8_base<char32_t>::
1175 do_length(state_type&, const extern_type* __from,
1176           const extern_type* __end, size_t __max) const
1177 {
1178   __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1179   return __end - __from;
1180 }
1181
1182 int
1183 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
1184 {
1185   // A single UCS-4 character requires up to four UTF-8 code units.
1186   int max = 4;
1187   if (_M_mode & consume_header)
1188     max += sizeof(utf8_bom);
1189   return max;
1190 }
1191
1192 #ifdef _GLIBCXX_USE_WCHAR_T
1193
// Check that wchar_t has the same size as the fixed-width character type
// selected by __SIZEOF_WCHAR_T__.
#if __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(char32_t) == sizeof(wchar_t), "");
#elif __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(char16_t) == sizeof(wchar_t), "");
#endif
1199
1200 // Define members of codecvt_utf8<wchar_t> base class implementation.
1201 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1202
1203 __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1204
1205 codecvt_base::result
1206 __codecvt_utf8_base<wchar_t>::
1207 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1208        const intern_type*& __from_next,
1209        extern_type* __to, extern_type* __to_end,
1210        extern_type*& __to_next) const
1211 {
1212   range<char> to{ __to, __to_end };
1213 #if __SIZEOF_WCHAR_T__ == 2
1214   range<const char16_t> from{
1215     reinterpret_cast<const char16_t*>(__from),
1216     reinterpret_cast<const char16_t*>(__from_end)
1217   };
1218   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1219 #elif __SIZEOF_WCHAR_T__ == 4
1220   range<const char32_t> from{
1221     reinterpret_cast<const char32_t*>(__from),
1222     reinterpret_cast<const char32_t*>(__from_end)
1223   };
1224   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1225 #else
1226   return codecvt_base::error;
1227 #endif
1228   __from_next = reinterpret_cast<const wchar_t*>(from.next);
1229   __to_next = to.next;
1230   return res;
1231 }
1232
1233 codecvt_base::result
1234 __codecvt_utf8_base<wchar_t>::
1235 do_unshift(state_type&, extern_type* __to, extern_type*,
1236            extern_type*& __to_next) const
1237 {
1238   __to_next = __to;
1239   return noconv;
1240 }
1241
1242 codecvt_base::result
1243 __codecvt_utf8_base<wchar_t>::
1244 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1245       const extern_type*& __from_next,
1246       intern_type* __to, intern_type* __to_end,
1247       intern_type*& __to_next) const
1248 {
1249   range<const char> from{ __from, __from_end };
1250 #if __SIZEOF_WCHAR_T__ == 2
1251   range<char16_t> to{
1252     reinterpret_cast<char16_t*>(__to),
1253     reinterpret_cast<char16_t*>(__to_end)
1254   };
1255 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1256   codecvt_mode mode = {};
1257 #else
1258   codecvt_mode mode = little_endian;
1259 #endif
1260   auto res = ucs2_in(from, to, _M_maxcode, mode);
1261 #elif __SIZEOF_WCHAR_T__ == 4
1262   range<char32_t> to{
1263     reinterpret_cast<char32_t*>(__to),
1264     reinterpret_cast<char32_t*>(__to_end)
1265   };
1266   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1267 #else
1268   return codecvt_base::error;
1269 #endif
1270   __from_next = from.next;
1271   __to_next = reinterpret_cast<wchar_t*>(to.next);
1272   return res;
1273 }
1274
1275 int
1276 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
1277 { return 0; } // UTF-8 is not a fixed-width encoding
1278
1279 bool
1280 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1281 { return false; }
1282
1283 int
1284 __codecvt_utf8_base<wchar_t>::
1285 do_length(state_type&, const extern_type* __from,
1286           const extern_type* __end, size_t __max) const
1287 {
1288 #if __SIZEOF_WCHAR_T__ == 2
1289   __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1290 #elif __SIZEOF_WCHAR_T__ == 4
1291   __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1292 #else
1293   __end = __from;
1294 #endif
1295   return __end - __from;
1296 }
1297
1298 int
1299 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
1300 {
1301 #if __SIZEOF_WCHAR_T__ == 2
1302   int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1303 #else
1304   int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1305 #endif
1306   if (_M_mode & consume_header)
1307     max += sizeof(utf8_bom);
1308   return max;
1309 }
1310 #endif
1311
1312 // Define members of codecvt_utf16<char16_t> base class implementation.
1313 // Converts from UTF-16 to UCS-2.
1314
1315 __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1316
1317 codecvt_base::result
1318 __codecvt_utf16_base<char16_t>::
1319 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1320        const intern_type*& __from_next,
1321        extern_type* __to, extern_type* __to_end,
1322        extern_type*& __to_next) const
1323 {
1324   range<const char16_t> from{ __from, __from_end };
1325   range<char16_t, false> to{ __to, __to_end };
1326   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1327   __from_next = from.next;
1328   __to_next = reinterpret_cast<char*>(to.next);
1329   return res;
1330 }
1331
1332 codecvt_base::result
1333 __codecvt_utf16_base<char16_t>::
1334 do_unshift(state_type&, extern_type* __to, extern_type*,
1335            extern_type*& __to_next) const
1336 {
1337   __to_next = __to;
1338   return noconv;
1339 }
1340
1341 codecvt_base::result
1342 __codecvt_utf16_base<char16_t>::
1343 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1344       const extern_type*& __from_next,
1345       intern_type* __to, intern_type* __to_end,
1346       intern_type*& __to_next) const
1347 {
1348   range<const char16_t, false> from{ __from, __from_end };
1349   range<char16_t> to{ __to, __to_end };
1350   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1351   __from_next = reinterpret_cast<const char*>(from.next);
1352   __to_next = to.next;
1353   return res;
1354 }
1355
1356 int
1357 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
1358 { return 0; } // UTF-16 is not a fixed-width encoding
1359
1360 bool
1361 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1362 { return false; }
1363
1364 int
1365 __codecvt_utf16_base<char16_t>::
1366 do_length(state_type&, const extern_type* __from,
1367           const extern_type* __end, size_t __max) const
1368 {
1369   range<const char16_t, false> from{ __from, __end };
1370   const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1371   return reinterpret_cast<const char*>(next) - __from;
1372 }
1373
1374 int
1375 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
1376 {
1377   // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1378   // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1379   int max = 2;
1380   if (_M_mode & consume_header)
1381     max += sizeof(utf16_bom);
1382   return max;
1383 }
1384
1385 // Define members of codecvt_utf16<char32_t> base class implementation.
1386 // Converts from UTF-16 to UTF-32 (aka UCS-4).
1387
1388 __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1389
1390 codecvt_base::result
1391 __codecvt_utf16_base<char32_t>::
1392 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1393        const intern_type*& __from_next,
1394        extern_type* __to, extern_type* __to_end,
1395        extern_type*& __to_next) const
1396 {
1397   range<const char32_t> from{ __from, __from_end };
1398   range<char16_t, false> to{ __to, __to_end };
1399   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1400   __from_next = from.next;
1401   __to_next = reinterpret_cast<char*>(to.next);
1402   return res;
1403 }
1404
1405 codecvt_base::result
1406 __codecvt_utf16_base<char32_t>::
1407 do_unshift(state_type&, extern_type* __to, extern_type*,
1408            extern_type*& __to_next) const
1409 {
1410   __to_next = __to;
1411   return noconv;
1412 }
1413
1414 codecvt_base::result
1415 __codecvt_utf16_base<char32_t>::
1416 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1417       const extern_type*& __from_next,
1418       intern_type* __to, intern_type* __to_end,
1419       intern_type*& __to_next) const
1420 {
1421   range<const char16_t, false> from{ __from, __from_end };
1422   range<char32_t> to{ __to, __to_end };
1423   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1424   __from_next = reinterpret_cast<const char*>(from.next);
1425   __to_next = to.next;
1426   return res;
1427 }
1428
1429 int
1430 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
1431 { return 0; } // UTF-16 is not a fixed-width encoding
1432
1433 bool
1434 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1435 { return false; }
1436
1437 int
1438 __codecvt_utf16_base<char32_t>::
1439 do_length(state_type&, const extern_type* __from,
1440           const extern_type* __end, size_t __max) const
1441 {
1442   range<const char16_t, false> from{ __from, __end };
1443   const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1444   return reinterpret_cast<const char*>(next) - __from;
1445 }
1446
1447 int
1448 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
1449 {
1450   // A single UCS-4 character requires one or two UTF-16 code units
1451   // (so up to four chars).
1452   int max = 4;
1453   if (_M_mode & consume_header)
1454     max += sizeof(utf16_bom);
1455   return max;
1456 }
1457
1458 #ifdef _GLIBCXX_USE_WCHAR_T
1459 // Define members of codecvt_utf16<wchar_t> base class implementation.
// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1461
1462 __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1463
1464 codecvt_base::result
1465 __codecvt_utf16_base<wchar_t>::
1466 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1467        const intern_type*& __from_next,
1468        extern_type* __to, extern_type* __to_end,
1469        extern_type*& __to_next) const
1470 {
1471   range<char16_t, false> to{ __to, __to_end };
1472 #if __SIZEOF_WCHAR_T__ == 2
1473   range<const char16_t> from{
1474     reinterpret_cast<const char16_t*>(__from),
1475     reinterpret_cast<const char16_t*>(__from_end),
1476   };
1477   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1478 #elif __SIZEOF_WCHAR_T__ == 4
1479   range<const char32_t> from{
1480     reinterpret_cast<const char32_t*>(__from),
1481     reinterpret_cast<const char32_t*>(__from_end),
1482   };
1483   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1484 #else
1485   return codecvt_base::error;
1486 #endif
1487   __from_next = reinterpret_cast<const wchar_t*>(from.next);
1488   __to_next = reinterpret_cast<char*>(to.next);
1489   return res;
1490 }
1491
1492 codecvt_base::result
1493 __codecvt_utf16_base<wchar_t>::
1494 do_unshift(state_type&, extern_type* __to, extern_type*,
1495            extern_type*& __to_next) const
1496 {
1497   __to_next = __to;
1498   return noconv;
1499 }
1500
1501 codecvt_base::result
1502 __codecvt_utf16_base<wchar_t>::
1503 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1504       const extern_type*& __from_next,
1505       intern_type* __to, intern_type* __to_end,
1506       intern_type*& __to_next) const
1507 {
1508   range<const char16_t, false> from{ __from, __from_end };
1509 #if __SIZEOF_WCHAR_T__ == 2
1510   range<char16_t> to{
1511     reinterpret_cast<char16_t*>(__to),
1512     reinterpret_cast<char16_t*>(__to_end),
1513   };
1514   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1515 #elif __SIZEOF_WCHAR_T__ == 4
1516   range<char32_t> to{
1517     reinterpret_cast<char32_t*>(__to),
1518     reinterpret_cast<char32_t*>(__to_end),
1519   };
1520   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1521 #else
1522   return codecvt_base::error;
1523 #endif
1524   __from_next = reinterpret_cast<const char*>(from.next);
1525   __to_next = reinterpret_cast<wchar_t*>(to.next);
1526   return res;
1527 }
1528
1529 int
1530 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1531 { return 0; } // UTF-16 is not a fixed-width encoding
1532
1533 bool
1534 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1535 { return false; }
1536
1537 int
1538 __codecvt_utf16_base<wchar_t>::
1539 do_length(state_type&, const extern_type* __from,
1540           const extern_type* __end, size_t __max) const
1541 {
1542   range<const char16_t, false> from{ __from, __end };
1543 #if __SIZEOF_WCHAR_T__ == 2
1544   const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1545 #elif __SIZEOF_WCHAR_T__ == 4
1546   const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1547 #endif
1548   return reinterpret_cast<const char*>(next) - __from;
1549 }
1550
1551 int
1552 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1553 {
1554 #if __SIZEOF_WCHAR_T__ == 2
1555   int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1556 #else
1557   int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1558 #endif
1559   if (_M_mode & consume_header)
1560     max += sizeof(utf16_bom);
1561   return max;
1562 }
1563 #endif
1564
1565 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1566 // Converts from UTF-8 to UTF-16.
1567
1568 __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1569
1570 codecvt_base::result
1571 __codecvt_utf8_utf16_base<char16_t>::
1572 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1573        const intern_type*& __from_next,
1574        extern_type* __to, extern_type* __to_end,
1575        extern_type*& __to_next) const
1576 {
1577   range<const char16_t> from{ __from, __from_end };
1578   range<char> to{ __to, __to_end };
1579   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1580   __from_next = from.next;
1581   __to_next = to.next;
1582   return res;
1583 }
1584
1585 codecvt_base::result
1586 __codecvt_utf8_utf16_base<char16_t>::
1587 do_unshift(state_type&, extern_type* __to, extern_type*,
1588            extern_type*& __to_next) const
1589 {
1590   __to_next = __to;
1591   return noconv;
1592 }
1593
1594 codecvt_base::result
1595 __codecvt_utf8_utf16_base<char16_t>::
1596 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1597       const extern_type*& __from_next,
1598       intern_type* __to, intern_type* __to_end,
1599       intern_type*& __to_next) const
1600 {
1601   range<const char> from{ __from, __from_end };
1602   range<char16_t> to{ __to, __to_end };
1603   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1604 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1605   mode = codecvt_mode(mode | little_endian);
1606 #endif
1607   auto res = utf16_in(from, to, _M_maxcode, mode);
1608   __from_next = from.next;
1609   __to_next = to.next;
1610   return res;
1611 }
1612
1613 int
1614 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
1615 { return 0; } // UTF-8 is not a fixed-width encoding
1616
1617 bool
1618 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1619 { return false; }
1620
1621 int
1622 __codecvt_utf8_utf16_base<char16_t>::
1623 do_length(state_type&, const extern_type* __from,
1624           const extern_type* __end, size_t __max) const
1625 {
1626   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1627   return __end - __from;
1628 }
1629
1630 int
1631 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1632 {
1633   // A single character can be 1 or 2 UTF-16 code units,
1634   // requiring up to 4 UTF-8 code units.
1635   int max = 4;
1636   if (_M_mode & consume_header)
1637     max += sizeof(utf8_bom);
1638   return max;
1639 }
1640
1641 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1642 // Converts from UTF-8 to UTF-16.
1643
1644 __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1645
1646 codecvt_base::result
1647 __codecvt_utf8_utf16_base<char32_t>::
1648 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1649        const intern_type*& __from_next,
1650        extern_type* __to, extern_type* __to_end,
1651        extern_type*& __to_next) const
1652 {
1653   range<const char32_t> from{ __from, __from_end };
1654   range<char> to{ __to, __to_end };
1655   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1656   __from_next = from.next;
1657   __to_next = to.next;
1658   return res;
1659 }
1660
1661 codecvt_base::result
1662 __codecvt_utf8_utf16_base<char32_t>::
1663 do_unshift(state_type&, extern_type* __to, extern_type*,
1664            extern_type*& __to_next) const
1665 {
1666   __to_next = __to;
1667   return noconv;
1668 }
1669
1670 codecvt_base::result
1671 __codecvt_utf8_utf16_base<char32_t>::
1672 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1673       const extern_type*& __from_next,
1674       intern_type* __to, intern_type* __to_end,
1675       intern_type*& __to_next) const
1676 {
1677   range<const char> from{ __from, __from_end };
1678   range<char32_t> to{ __to, __to_end };
1679   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1680 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1681   mode = codecvt_mode(mode | little_endian);
1682 #endif
1683   auto res = utf16_in(from, to, _M_maxcode, mode);
1684   __from_next = from.next;
1685   __to_next = to.next;
1686   return res;
1687 }
1688
1689 int
1690 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
1691 { return 0; } // UTF-8 is not a fixed-width encoding
1692
1693 bool
1694 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1695 { return false; }
1696
1697 int
1698 __codecvt_utf8_utf16_base<char32_t>::
1699 do_length(state_type&, const extern_type* __from,
1700           const extern_type* __end, size_t __max) const
1701 {
1702   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1703   return __end - __from;
1704 }
1705
1706 int
1707 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1708 {
1709   // A single character can be 1 or 2 UTF-16 code units,
1710   // requiring up to 4 UTF-8 code units.
1711   int max = 4;
1712   if (_M_mode & consume_header)
1713     max += sizeof(utf8_bom);
1714   return max;
1715 }
1716
1717 #ifdef _GLIBCXX_USE_WCHAR_T
1718 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1719 // Converts from UTF-8 to UTF-16.
1720
1721 __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1722
1723 codecvt_base::result
1724 __codecvt_utf8_utf16_base<wchar_t>::
1725 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1726        const intern_type*& __from_next,
1727        extern_type* __to, extern_type* __to_end,
1728        extern_type*& __to_next) const
1729 {
1730   range<const wchar_t> from{ __from, __from_end };
1731   range<char> to{ __to, __to_end };
1732   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1733   __from_next = from.next;
1734   __to_next = to.next;
1735   return res;
1736 }
1737
1738 codecvt_base::result
1739 __codecvt_utf8_utf16_base<wchar_t>::
1740 do_unshift(state_type&, extern_type* __to, extern_type*,
1741            extern_type*& __to_next) const
1742 {
1743   __to_next = __to;
1744   return noconv;
1745 }
1746
1747 codecvt_base::result
1748 __codecvt_utf8_utf16_base<wchar_t>::
1749 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1750       const extern_type*& __from_next,
1751       intern_type* __to, intern_type* __to_end,
1752       intern_type*& __to_next) const
1753 {
1754   range<const char> from{ __from, __from_end };
1755   range<wchar_t> to{ __to, __to_end };
1756   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1757 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1758   mode = codecvt_mode(mode | little_endian);
1759 #endif
1760   auto res = utf16_in(from, to, _M_maxcode, mode);
1761   __from_next = from.next;
1762   __to_next = to.next;
1763   return res;
1764 }
1765
1766 int
1767 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
1768 { return 0; } // UTF-8 is not a fixed-width encoding
1769
1770 bool
1771 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1772 { return false; }
1773
1774 int
1775 __codecvt_utf8_utf16_base<wchar_t>::
1776 do_length(state_type&, const extern_type* __from,
1777           const extern_type* __end, size_t __max) const
1778 {
1779   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1780   return __end - __from;
1781 }
1782
1783 int
1784 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1785 {
1786   // A single character can be 1 or 2 UTF-16 code units,
1787   // requiring up to 4 UTF-8 code units.
1788   int max = 4;
1789   if (_M_mode & consume_header)
1790     max += sizeof(utf8_bom);
1791   return max;
1792 }
1793 #endif
1794
1795 inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1796 inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
1797 template class codecvt_byname<char16_t, char, mbstate_t>;
1798 template class codecvt_byname<char32_t, char, mbstate_t>;
1799
// Explicit instantiations for the facets whose external type is char8_t,
// grouped by internal character type.  Only when char8_t is enabled.
#if defined(_GLIBCXX_USE_CHAR8_T)
inline template class __codecvt_abstract_base<char16_t, char8_t, mbstate_t>;
template class codecvt_byname<char16_t, char8_t, mbstate_t>;

inline template class __codecvt_abstract_base<char32_t, char8_t, mbstate_t>;
template class codecvt_byname<char32_t, char8_t, mbstate_t>;
#endif
1806
1807 _GLIBCXX_END_NAMESPACE_VERSION
1808 }