libstdc++-v3/src/c++11/codecvt.cc

   1 // Locale support (codecvt) -*- C++ -*-
   2
   3 // Copyright (C) 2015-2018 Free Software Foundation, Inc.
   4 //
   5 // This file is part of the GNU ISO C++ Library.  This library is free
   6 // software; you can redistribute it and/or modify it under the
   7 // terms of the GNU General Public License as published by the
   8 // Free Software Foundation; either version 3, or (at your option)
   9 // any later version.
  10
  11 // This library is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // Under Section 7 of GPL version 3, you are granted additional
  17 // permissions described in the GCC Runtime Library Exception, version
  18 // 3.1, as published by the Free Software Foundation.
  19
  20 // You should have received a copy of the GNU General Public License and
  21 // a copy of the GCC Runtime Library Exception along with this program;
  22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 // <http://www.gnu.org/licenses/>.
  24
  25 #include <codecvt>
  26 #include <cstring>              // std::memcpy, std::memcmp
  27 #include <bits/stl_algobase.h>  // std::min
  28
  29 namespace std _GLIBCXX_VISIBILITY(default)
  30 {
  31 _GLIBCXX_BEGIN_NAMESPACE_VERSION
  32
  33   // The standard doesn't define these operators, which is annoying.
  34   static underlying_type<codecvt_mode>::type
  35   to_integer(codecvt_mode m)
  36   { return static_cast<underlying_type<codecvt_mode>::type>(m); }
  37
  38   static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
  39   { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
  40
  41   static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
  42   { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
  43
  44   static codecvt_mode operator~(codecvt_mode m)
  45   { return codecvt_mode(~to_integer(m)); }
  46
  47 namespace
  48 {
  // Largest code point that fits in a single UTF-16 code unit.
  constexpr char32_t max_single_utf16_unit = 0xFFFF;

  // Default upper bound used for the maxcode arguments below.
  constexpr char32_t max_code_point = 0x10FFFF;

  // Sentinel values returned by the read_*_code_point functions.
  // The functions below rely on maxcode < incomplete_mb_character
  // (which is enforced by the codecvt_utf* classes on construction).
  constexpr char32_t incomplete_mb_character = static_cast<char32_t>(-2);
  constexpr char32_t invalid_mb_sequence = static_cast<char32_t>(-1);
  58
  // A [next, end) pair of pointers to code units of type Elem, with
  // helpers for reading, writing and advancing through the units.
  template<typename Elem, bool Aligned = true>
    struct range
    {
      Elem* next;
      Elem* end;

      // Store a code unit at the current position and step past it.
      range& operator=(Elem e)
      {
        *next = e;
        ++next;
        return *this;
      }

      // The code unit at the current position.
      Elem operator*() const
      { return next[0]; }

      // The code unit N positions after the current one.
      Elem operator[](size_t n) const
      { return *(next + n); }

      // Advance by one code unit.
      range& operator++()
      {
        next += 1;
        return *this;
      }

      // Advance by N code units.
      range& operator+=(size_t n)
      {
        next = next + n;
        return *this;
      }

      // How many code units are left before end.
      size_t size() const
      { return end - next; }

      // How many bytes are left before end.
      size_t nbytes() const
      {
        const char* const first = reinterpret_cast<const char*>(next);
        const char* const last = reinterpret_cast<const char*>(end);
        return last - first;
      }
    };
 100
 101   // This specialization is used when accessing char16_t values through
 102   // pointers to char, which might not be correctly aligned for char16_t.
 103   template<typename Elem>
 104     struct range<Elem, false>
 105     {
 106       using value_type = typename remove_const<Elem>::type;
 107
 108       using char_pointer = typename
 109         conditional<is_const<Elem>::value, const char*, char*>::type;
 110
 111       char_pointer next;
 112       char_pointer end;
 113
 114       // Write a code unit.
 115       range& operator=(Elem e)
 116       {
 117         memcpy(next, &e, sizeof(Elem));
 118         ++*this;
 119         return *this;
 120       }
 121
 122       // Read the next code unit.
 123       Elem operator*() const
 124       {
 125         value_type e;
 126         memcpy(&e, next, sizeof(Elem));
 127         return e;
 128       }
 129
 130       // Read the Nth code unit.
 131       Elem operator[](size_t n) const
 132       {
 133         value_type e;
 134         memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
 135         return e;
 136       }
 137
 138       // Move to the next code unit.
 139       range& operator++()
 140       {
 141         next += sizeof(Elem);
 142         return *this;
 143       }
 144
 145       // Move to the Nth code unit.
 146       range& operator+=(size_t n)
 147       {
 148         next += n * sizeof(Elem);
 149         return *this;
 150       }
 151
 152       // The number of code units remaining.
 153       size_t size() const { return nbytes() / sizeof(Elem); }
 154
 155       // The number of bytes remaining.
 156       size_t nbytes() const { return end - next; }
 157     };
 158
 159   // Multibyte sequences can have "header" consisting of Byte Order Mark
 160   const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
 161   const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
 162   const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
 163
 164   // Write a BOM (space permitting).
 165   template<typename C, bool A, size_t N>
 166     bool
 167     write_bom(range<C, A>& to, const unsigned char (&bom)[N])
 168     {
 169       static_assert( (N / sizeof(C)) != 0, "" );
 170       static_assert( (N % sizeof(C)) == 0, "" );
 171
 172       if (to.nbytes() < N)
 173         return false;
 174       memcpy(to.next, bom, N);
 175       to += (N / sizeof(C));
 176       return true;
 177     }
 178
 179   // Try to read a BOM.
 180   template<typename C, bool A, size_t N>
 181     bool
 182     read_bom(range<C, A>& from, const unsigned char (&bom)[N])
 183     {
 184       static_assert( (N / sizeof(C)) != 0, "" );
 185       static_assert( (N % sizeof(C)) == 0, "" );
 186
 187       if (from.nbytes() >= N && !memcmp(from.next, bom, N))
 188         {
 189           from += (N / sizeof(C));
 190           return true;
 191         }
 192       return false;
 193     }
 194
 195   // If generate_header is set in mode write out UTF-8 BOM.
 196   bool
 197   write_utf8_bom(range<char>& to, codecvt_mode mode)
 198   {
 199     if (mode & generate_header)
 200       return write_bom(to, utf8_bom);
 201     return true;
 202   }
 203
 204   // If generate_header is set in mode write out the UTF-16 BOM indicated
 205   // by whether little_endian is set in mode.
 206   template<bool Aligned>
 207   bool
 208   write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
 209   {
 210     if (mode & generate_header)
 211     {
 212       if (mode & little_endian)
 213         return write_bom(to, utf16le_bom);
 214       else
 215         return write_bom(to, utf16_bom);
 216     }
 217     return true;
 218   }
 219
 220   // If consume_header is set in mode update from.next to after any BOM.
 221   void
 222   read_utf8_bom(range<const char>& from, codecvt_mode mode)
 223   {
 224     if (mode & consume_header)
 225       read_bom(from, utf8_bom);
 226   }
 227
 228   // If consume_header is not set in mode, no effects.
 229   // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
 230   // - if the UTF-16BE BOM was found unset little_endian in mode, or
 231   // - if the UTF-16LE BOM was found set little_endian in mode.
 232   template<bool Aligned>
 233   void
 234   read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
 235   {
 236     if (mode & consume_header)
 237       {
 238         if (read_bom(from, utf16_bom))
 239           mode &= ~little_endian;
 240         else if (read_bom(from, utf16le_bom))
 241           mode |= little_endian;
 242       }
 243   }
 244
 245   // Read a codepoint from a UTF-8 multibyte sequence.
 246   // Updates from.next if the codepoint is not greater than maxcode.
 247   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
 248   char32_t
 249   read_utf8_code_point(range<const char>& from, unsigned long maxcode)
 250   {
 251     const size_t avail = from.size();
 252     if (avail == 0)
 253       return incomplete_mb_character;
 254     unsigned char c1 = from[0];
 255     // https://en.wikipedia.org/wiki/UTF-8#Sample_code
 256     if (c1 < 0x80)
 257     {
 258       ++from;
 259       return c1;
 260     }
 261     else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
 262       return invalid_mb_sequence;
 263     else if (c1 < 0xE0) // 2-byte sequence
 264     {
 265       if (avail < 2)
 266         return incomplete_mb_character;
 267       unsigned char c2 = from[1];
 268       if ((c2 & 0xC0) != 0x80)
 269         return invalid_mb_sequence;
 270       char32_t c = (c1 << 6) + c2 - 0x3080;
 271       if (c <= maxcode)
 272         from += 2;
 273       return c;
 274     }
 275     else if (c1 < 0xF0) // 3-byte sequence
 276     {
 277       if (avail < 3)
 278         return incomplete_mb_character;
 279       unsigned char c2 = from[1];
 280       if ((c2 & 0xC0) != 0x80)
 281         return invalid_mb_sequence;
 282       if (c1 == 0xE0 && c2 < 0xA0) // overlong
 283         return invalid_mb_sequence;
 284       unsigned char c3 = from[2];
 285       if ((c3 & 0xC0) != 0x80)
 286         return invalid_mb_sequence;
 287       char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
 288       if (c <= maxcode)
 289         from += 3;
 290       return c;
 291     }
 292     else if (c1 < 0xF5) // 4-byte sequence
 293     {
 294       if (avail < 4)
 295         return incomplete_mb_character;
 296       unsigned char c2 = from[1];
 297       if ((c2 & 0xC0) != 0x80)
 298         return invalid_mb_sequence;
 299       if (c1 == 0xF0 && c2 < 0x90) // overlong
 300         return invalid_mb_sequence;
 301       if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
 302       return invalid_mb_sequence;
 303       unsigned char c3 = from[2];
 304       if ((c3 & 0xC0) != 0x80)
 305         return invalid_mb_sequence;
 306       unsigned char c4 = from[3];
 307       if ((c4 & 0xC0) != 0x80)
 308         return invalid_mb_sequence;
 309       char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
 310       if (c <= maxcode)
 311         from += 4;
 312       return c;
 313     }
 314     else // > U+10FFFF
 315       return invalid_mb_sequence;
 316   }
 317
 318   bool
 319   write_utf8_code_point(range<char>& to, char32_t code_point)
 320   {
 321     if (code_point < 0x80)
 322       {
 323         if (to.size() < 1)
 324           return false;
 325         to = code_point;
 326       }
 327     else if (code_point <= 0x7FF)
 328       {
 329         if (to.size() < 2)
 330           return false;
 331         to = (code_point >> 6) + 0xC0;
 332         to = (code_point & 0x3F) + 0x80;
 333       }
 334     else if (code_point <= 0xFFFF)
 335       {
 336         if (to.size() < 3)
 337           return false;
 338         to = (code_point >> 12) + 0xE0;
 339         to = ((code_point >> 6) & 0x3F) + 0x80;
 340         to = (code_point & 0x3F) + 0x80;
 341       }
 342     else if (code_point <= 0x10FFFF)
 343       {
 344         if (to.size() < 4)
 345           return false;
 346         to = (code_point >> 18) + 0xF0;
 347         to = ((code_point >> 12) & 0x3F) + 0x80;
 348         to = ((code_point >> 6) & 0x3F) + 0x80;
 349         to = (code_point & 0x3F) + 0x80;
 350       }
 351     else
 352       return false;
 353     return true;
 354   }
 355
 356   inline char16_t
 357   adjust_byte_order(char16_t c, codecvt_mode mode)
 358   {
 359 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 360     return (mode & little_endian) ? __builtin_bswap16(c) : c;
 361 #else
 362     return (mode & little_endian) ? c : __builtin_bswap16(c);
 363 #endif
 364   }
 365
 366   // Return true if c is a high-surrogate (aka leading) code point.
 367   inline bool
 368   is_high_surrogate(char32_t c)
 369   {
 370     return c >= 0xD800 && c <= 0xDBFF;
 371   }
 372
 373   // Return true if c is a low-surrogate (aka trailing) code point.
 374   inline bool
 375   is_low_surrogate(char32_t c)
 376   {
 377     return c >= 0xDC00 && c <= 0xDFFF;
 378   }
 379
 380   inline char32_t
 381   surrogate_pair_to_code_point(char32_t high, char32_t low)
 382   {
 383     return (high << 10) + low - 0x35FDC00;
 384   }
 385
 386   // Read a codepoint from a UTF-16 multibyte sequence.
 387   // The sequence's endianness is indicated by (mode & little_endian).
 388   // Updates from.next if the codepoint is not greater than maxcode.
 389   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
 390   template<bool Aligned>
 391     char32_t
 392     read_utf16_code_point(range<const char16_t, Aligned>& from,
 393                           unsigned long maxcode, codecvt_mode mode)
 394     {
 395       const size_t avail = from.size();
 396       if (avail == 0)
 397         return incomplete_mb_character;
 398       int inc = 1;
 399       char32_t c = adjust_byte_order(from[0], mode);
 400       if (is_high_surrogate(c))
 401         {
 402           if (avail < 2)
 403             return incomplete_mb_character;
 404           const char16_t c2 = adjust_byte_order(from[1], mode);
 405           if (is_low_surrogate(c2))
 406             {
 407               c = surrogate_pair_to_code_point(c, c2);
 408               inc = 2;
 409             }
 410           else
 411             return invalid_mb_sequence;
 412         }
 413       else if (is_low_surrogate(c))
 414         return invalid_mb_sequence;
 415       if (c <= maxcode)
 416         from += inc;
 417       return c;
 418     }
 419
 420   template<typename C, bool A>
 421   bool
 422   write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
 423   {
 424     static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
 425
 426     if (codepoint <= max_single_utf16_unit)
 427       {
 428         if (to.size() > 0)
 429           {
 430             to = adjust_byte_order(codepoint, mode);
 431             return true;
 432           }
 433       }
 434     else if (to.size() > 1)
 435       {
 436         // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
 437         const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
 438         char16_t lead = LEAD_OFFSET + (codepoint >> 10);
 439         char16_t trail = 0xDC00 + (codepoint & 0x3FF);
 440         to = adjust_byte_order(lead, mode);
 441         to = adjust_byte_order(trail, mode);
 442         return true;
 443       }
 444     return false;
 445   }
 446
 447   // utf8 -> ucs4
 448   codecvt_base::result
 449   ucs4_in(range<const char>& from, range<char32_t>& to,
 450           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 451   {
 452     read_utf8_bom(from, mode);
 453     while (from.size() && to.size())
 454       {
 455         const char32_t codepoint = read_utf8_code_point(from, maxcode);
 456         if (codepoint == incomplete_mb_character)
 457           return codecvt_base::partial;
 458         if (codepoint > maxcode)
 459           return codecvt_base::error;
 460         to = codepoint;
 461       }
 462     return from.size() ? codecvt_base::partial : codecvt_base::ok;
 463   }
 464
 465   // ucs4 -> utf8
 466   codecvt_base::result
 467   ucs4_out(range<const char32_t>& from, range<char>& to,
 468            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 469   {
 470     if (!write_utf8_bom(to, mode))
 471       return codecvt_base::partial;
 472     while (from.size())
 473       {
 474         const char32_t c = from[0];
 475         if (c > maxcode)
 476           return codecvt_base::error;
 477         if (!write_utf8_code_point(to, c))
 478           return codecvt_base::partial;
 479         ++from;
 480       }
 481     return codecvt_base::ok;
 482   }
 483
 484   // utf16 -> ucs4
 485   codecvt_base::result
 486   ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
 487           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 488   {
 489     read_utf16_bom(from, mode);
 490     while (from.size() && to.size())
 491       {
 492         const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
 493         if (codepoint == incomplete_mb_character)
 494           return codecvt_base::partial;
 495         if (codepoint > maxcode)
 496           return codecvt_base::error;
 497         to = codepoint;
 498       }
 499     return from.size() ? codecvt_base::partial : codecvt_base::ok;
 500   }
 501
 502   // ucs4 -> utf16
 503   codecvt_base::result
 504   ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
 505            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 506   {
 507     if (!write_utf16_bom(to, mode))
 508       return codecvt_base::partial;
 509     while (from.size())
 510       {
 511         const char32_t c = from[0];
 512         if (c > maxcode)
 513           return codecvt_base::error;
 514         if (!write_utf16_code_point(to, c, mode))
 515           return codecvt_base::partial;
 516         ++from;
 517       }
 518     return codecvt_base::ok;
 519   }
 520
 521   // Flag indicating whether to process UTF-16 or UCS2
 522   enum class surrogates { allowed, disallowed };
 523
 524   // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
 525   template<typename C>
 526   codecvt_base::result
 527   utf16_in(range<const char>& from, range<C>& to,
 528            unsigned long maxcode = max_code_point, codecvt_mode mode = {},
 529            surrogates s = surrogates::allowed)
 530   {
 531     read_utf8_bom(from, mode);
 532     while (from.size() && to.size())
 533       {
 534         auto orig = from;
 535         const char32_t codepoint = read_utf8_code_point(from, maxcode);
 536         if (codepoint == incomplete_mb_character)
 537           {
 538             if (s == surrogates::allowed)
 539               return codecvt_base::partial;
 540             else
 541               return codecvt_base::error; // No surrogates in UCS2
 542           }
 543         if (codepoint > maxcode)
 544           return codecvt_base::error;
 545         if (!write_utf16_code_point(to, codepoint, mode))
 546           {
 547             from = orig; // rewind to previous position
 548             return codecvt_base::partial;
 549           }
 550       }
 551     return codecvt_base::ok;
 552   }
 553
 554   // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
 555   template<typename C>
 556   codecvt_base::result
 557   utf16_out(range<const C>& from, range<char>& to,
 558             unsigned long maxcode = max_code_point, codecvt_mode mode = {},
 559             surrogates s = surrogates::allowed)
 560   {
 561     if (!write_utf8_bom(to, mode))
 562       return codecvt_base::partial;
 563     while (from.size())
 564       {
 565         char32_t c = from[0];
 566         int inc = 1;
 567         if (is_high_surrogate(c))
 568           {
 569             if (s == surrogates::disallowed)
 570               return codecvt_base::error; // No surrogates in UCS-2
 571
 572             if (from.size() < 2)
 573               return codecvt_base::ok; // stop converting at this point
 574
 575             const char32_t c2 = from[1];
 576             if (is_low_surrogate(c2))
 577               {
 578                 c = surrogate_pair_to_code_point(c, c2);
 579                 inc = 2;
 580               }
 581             else
 582               return codecvt_base::error;
 583           }
 584         else if (is_low_surrogate(c))
 585           return codecvt_base::error;
 586         if (c > maxcode)
 587           return codecvt_base::error;
 588         if (!write_utf8_code_point(to, c))
 589           return codecvt_base::partial;
 590         from += inc;
 591       }
 592     return codecvt_base::ok;
 593   }
 594
 595   // return pos such that [begin,pos) is valid UTF-16 string no longer than max
 596   const char*
 597   utf16_span(const char* begin, const char* end, size_t max,
 598              char32_t maxcode = max_code_point, codecvt_mode mode = {})
 599   {
 600     range<const char> from{ begin, end };
 601     read_utf8_bom(from, mode);
 602     size_t count = 0;
 603     while (count+1 < max)
 604       {
 605         char32_t c = read_utf8_code_point(from, maxcode);
 606         if (c > maxcode)
 607           return from.next;
 608         else if (c > max_single_utf16_unit)
 609           ++count;
 610         ++count;
 611       }
 612     if (count+1 == max) // take one more character if it fits in a single unit
 613       read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
 614     return from.next;
 615   }
 616
 617   // utf8 -> ucs2
 618   codecvt_base::result
 619   ucs2_in(range<const char>& from, range<char16_t>& to,
 620           char32_t maxcode = max_code_point, codecvt_mode mode = {})
 621   {
 622     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 623     maxcode = std::min(max_single_utf16_unit, maxcode);
 624     return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
 625   }
 626
 627   // ucs2 -> utf8
 628   codecvt_base::result
 629   ucs2_out(range<const char16_t>& from, range<char>& to,
 630            char32_t maxcode = max_code_point, codecvt_mode mode = {})
 631   {
 632     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 633     maxcode = std::min(max_single_utf16_unit, maxcode);
 634     return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
 635   }
 636
 637   // ucs2 -> utf16
 638   codecvt_base::result
 639   ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
 640            char32_t maxcode = max_code_point, codecvt_mode mode = {})
 641   {
 642     if (!write_utf16_bom(to, mode))
 643       return codecvt_base::partial;
 644     while (from.size() && to.size())
 645       {
 646         char16_t c = from[0];
 647         if (is_high_surrogate(c))
 648           return codecvt_base::error;
 649         if (c > maxcode)
 650           return codecvt_base::error;
 651         to = adjust_byte_order(c, mode);
 652         ++from;
 653       }
 654     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
 655   }
 656
 657   // utf16 -> ucs2
 658   codecvt_base::result
 659   ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
 660           char32_t maxcode = max_code_point, codecvt_mode mode = {})
 661   {
 662     read_utf16_bom(from, mode);
 663     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 664     maxcode = std::min(max_single_utf16_unit, maxcode);
 665     while (from.size() && to.size())
 666       {
 667         const char32_t c = read_utf16_code_point(from, maxcode, mode);
 668         if (c == incomplete_mb_character)
 669           return codecvt_base::error; // UCS-2 only supports single units.
 670         if (c > maxcode)
 671           return codecvt_base::error;
 672         to = c;
 673       }
 674     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
 675   }
 676
 677   const char16_t*
 678   ucs2_span(range<const char16_t, false>& from, size_t max,
 679             char32_t maxcode, codecvt_mode mode)
 680   {
 681     read_utf16_bom(from, mode);
 682     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 683     maxcode = std::min(max_single_utf16_unit, maxcode);
 684     char32_t c = 0;
 685     while (max-- && c <= maxcode)
 686       c = read_utf16_code_point(from, maxcode, mode);
 687     return reinterpret_cast<const char16_t*>(from.next);
 688   }
 689
 690   const char*
 691   ucs2_span(const char* begin, const char* end, size_t max,
 692             char32_t maxcode, codecvt_mode mode)
 693   {
 694     range<const char> from{ begin, end };
 695     read_utf8_bom(from, mode);
 696     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 697     maxcode = std::min(max_single_utf16_unit, maxcode);
 698     char32_t c = 0;
 699     while (max-- && c <= maxcode)
 700       c = read_utf8_code_point(from, maxcode);
 701     return from.next;
 702   }
 703
 704   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
 705   const char*
 706   ucs4_span(const char* begin, const char* end, size_t max,
 707             char32_t maxcode = max_code_point, codecvt_mode mode = {})
 708   {
 709     range<const char> from{ begin, end };
 710     read_utf8_bom(from, mode);
 711     char32_t c = 0;
 712     while (max-- && c <= maxcode)
 713       c = read_utf8_code_point(from, maxcode);
 714     return from.next;
 715   }
 716
 717   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
 718   const char16_t*
 719   ucs4_span(range<const char16_t, false>& from, size_t max,
 720             char32_t maxcode = max_code_point, codecvt_mode mode = {})
 721   {
 722     read_utf16_bom(from, mode);
 723     char32_t c = 0;
 724     while (max-- && c <= maxcode)
 725       c = read_utf16_code_point(from, maxcode, mode);
 726     return reinterpret_cast<const char16_t*>(from.next);
 727   }
 728 }
 729
 730 // Define members of codecvt<char16_t, char, mbstate_t> specialization.
 731 // Converts from UTF-8 to UTF-16.
 732
// Definition of the facet's static locale::id member.
locale::id codecvt<char16_t, char, mbstate_t>::id;

// Out-of-line destructor definition; the body is empty.
codecvt<char16_t, char, mbstate_t>::~codecvt() { }
 736
 737 codecvt_base::result
 738 codecvt<char16_t, char, mbstate_t>::
 739 do_out(state_type&,
 740        const intern_type* __from,
 741        const intern_type* __from_end, const intern_type*& __from_next,
 742        extern_type* __to, extern_type* __to_end,
 743        extern_type*& __to_next) const
 744 {
 745   range<const char16_t> from{ __from, __from_end };
 746   range<char> to{ __to, __to_end };
 747   auto res = utf16_out(from, to);
 748   __from_next = from.next;
 749   __to_next = to.next;
 750   return res;
 751 }
 752
 753 codecvt_base::result
 754 codecvt<char16_t, char, mbstate_t>::
 755 do_unshift(state_type&, extern_type* __to, extern_type*,
 756            extern_type*& __to_next) const
 757 {
 758   __to_next = __to;
 759   return noconv; // we don't use mbstate_t for the unicode facets
 760 }
 761
 762 codecvt_base::result
 763 codecvt<char16_t, char, mbstate_t>::
 764 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 765       const extern_type*& __from_next,
 766       intern_type* __to, intern_type* __to_end,
 767       intern_type*& __to_next) const
 768 {
 769   range<const char> from{ __from, __from_end };
 770   range<char16_t> to{ __to, __to_end };
 771 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 772   codecvt_mode mode = {};
 773 #else
 774   codecvt_mode mode = little_endian;
 775 #endif
 776   auto res = utf16_in(from, to, max_code_point, mode);
 777   __from_next = from.next;
 778   __to_next = to.next;
 779   return res;
 780 }
 781
 782 int
 783 codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
 784 { return 0; } // UTF-8 is not a fixed-width encoding
 785
 786 bool
 787 codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
 788 { return false; }
 789
 790 int
 791 codecvt<char16_t, char, mbstate_t>::
 792 do_length(state_type&, const extern_type* __from,
 793           const extern_type* __end, size_t __max) const
 794 {
 795   __end = utf16_span(__from, __end, __max);
 796   return __end - __from;
 797 }
 798
 799 int
 800 codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
 801 {
 802   // A single character (one or two UTF-16 code units) requires
 803   // up to four UTF-8 code units.
 804   return 4;
 805 }
 806
 807 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
 808 // Converts from UTF-8 to UTF-32 (aka UCS-4).
 809
// Definition of the facet's static locale::id member.
locale::id codecvt<char32_t, char, mbstate_t>::id;

// Out-of-line destructor definition; the body is empty.
codecvt<char32_t, char, mbstate_t>::~codecvt() { }
 813
 814 codecvt_base::result
 815 codecvt<char32_t, char, mbstate_t>::
 816 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
 817        const intern_type*& __from_next,
 818        extern_type* __to, extern_type* __to_end,
 819        extern_type*& __to_next) const
 820 {
 821   range<const char32_t> from{ __from, __from_end };
 822   range<char> to{ __to, __to_end };
 823   auto res = ucs4_out(from, to);
 824   __from_next = from.next;
 825   __to_next = to.next;
 826   return res;
 827 }
 828
 829 codecvt_base::result
 830 codecvt<char32_t, char, mbstate_t>::
 831 do_unshift(state_type&, extern_type* __to, extern_type*,
 832            extern_type*& __to_next) const
 833 {
 834   __to_next = __to;
 835   return noconv;
 836 }
 837
 838 codecvt_base::result
 839 codecvt<char32_t, char, mbstate_t>::
 840 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 841       const extern_type*& __from_next,
 842       intern_type* __to, intern_type* __to_end,
 843       intern_type*& __to_next) const
 844 {
 845   range<const char> from{ __from, __from_end };
 846   range<char32_t> to{ __to, __to_end };
 847   auto res = ucs4_in(from, to);
 848   __from_next = from.next;
 849   __to_next = to.next;
 850   return res;
 851 }
 852
 853 int
 854 codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
 855 { return 0; } // UTF-8 is not a fixed-width encoding
 856
 857 bool
 858 codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
 859 { return false; }
 860
 861 int
 862 codecvt<char32_t, char, mbstate_t>::
 863 do_length(state_type&, const extern_type* __from,
 864           const extern_type* __end, size_t __max) const
 865 {
 866   __end = ucs4_span(__from, __end, __max);
 867   return __end - __from;
 868 }
 869
 870 int
 871 codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
 872 {
 873   // A single character (one UTF-32 code unit) requires
 874   // up to 4 UTF-8 code units.
 875   return 4;
 876 }
 877
 878 // Define members of codecvt_utf8<char16_t> base class implementation.
 879 // Converts from UTF-8 to UCS-2.
 880
// Out-of-line destructor definition; the body is empty.
__codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
 882
 883 codecvt_base::result
 884 __codecvt_utf8_base<char16_t>::
 885 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
 886        const intern_type*& __from_next,
 887        extern_type* __to, extern_type* __to_end,
 888        extern_type*& __to_next) const
 889 {
 890   range<const char16_t> from{ __from, __from_end };
 891   range<char> to{ __to, __to_end };
 892   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
 893   __from_next = from.next;
 894   __to_next = to.next;
 895   return res;
 896 }
 897
 898 codecvt_base::result
 899 __codecvt_utf8_base<char16_t>::
 900 do_unshift(state_type&, extern_type* __to, extern_type*,
 901            extern_type*& __to_next) const
 902 {
 903   __to_next = __to;
 904   return noconv;
 905 }
 906
 907 codecvt_base::result
 908 __codecvt_utf8_base<char16_t>::
 909 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 910       const extern_type*& __from_next,
 911       intern_type* __to, intern_type* __to_end,
 912       intern_type*& __to_next) const
 913 {
 914   range<const char> from{ __from, __from_end };
 915   range<char16_t> to{ __to, __to_end };
 916   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
 917 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
 918   mode = codecvt_mode(mode | little_endian);
 919 #endif
 920   auto res = ucs2_in(from, to, _M_maxcode, mode);
 921   __from_next = from.next;
 922   __to_next = to.next;
 923   return res;
 924 }
 925
 926 int
 927 __codecvt_utf8_base<char16_t>::do_encoding() const throw()
 928 { return 0; } // UTF-8 is not a fixed-width encoding
 929
 930 bool
 931 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
 932 { return false; }
 933
 934 int
 935 __codecvt_utf8_base<char16_t>::
 936 do_length(state_type&, const extern_type* __from,
 937           const extern_type* __end, size_t __max) const
 938 {
 939   __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
 940   return __end - __from;
 941 }
 942
 943 int
 944 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
 945 {
 946   // A single UCS-2 character requires up to three UTF-8 code units.
 947   // (UCS-2 cannot represent characters that use four UTF-8 code units).
 948   int max = 3;
 949   if (_M_mode & consume_header)
 950     max += sizeof(utf8_bom);
 951   return max;
 952 }
 953
 954 // Define members of codecvt_utf8<char32_t> base class implementation.
 955 // Converts from UTF-8 to UTF-32 (aka UCS-4).
 956
 957 __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
 958
 959 codecvt_base::result
 960 __codecvt_utf8_base<char32_t>::
 961 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
 962        const intern_type*& __from_next,
 963        extern_type* __to, extern_type* __to_end,
 964        extern_type*& __to_next) const
 965 {
 966   range<const char32_t> from{ __from, __from_end };
 967   range<char> to{ __to, __to_end };
 968   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
 969   __from_next = from.next;
 970   __to_next = to.next;
 971   return res;
 972 }
 973
 974 codecvt_base::result
 975 __codecvt_utf8_base<char32_t>::
 976 do_unshift(state_type&, extern_type* __to, extern_type*,
 977            extern_type*& __to_next) const
 978 {
 979   __to_next = __to;
 980   return noconv;
 981 }
 982
 983 codecvt_base::result
 984 __codecvt_utf8_base<char32_t>::
 985 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 986       const extern_type*& __from_next,
 987       intern_type* __to, intern_type* __to_end,
 988       intern_type*& __to_next) const
 989 {
 990   range<const char> from{ __from, __from_end };
 991   range<char32_t> to{ __to, __to_end };
 992   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
 993   __from_next = from.next;
 994   __to_next = to.next;
 995   return res;
 996 }
 997
 998 int
 999 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
1000 { return 0; } // UTF-8 is not a fixed-width encoding
1001
1002 bool
1003 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1004 { return false; }
1005
1006 int
1007 __codecvt_utf8_base<char32_t>::
1008 do_length(state_type&, const extern_type* __from,
1009           const extern_type* __end, size_t __max) const
1010 {
1011   __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1012   return __end - __from;
1013 }
1014
1015 int
1016 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
1017 {
1018   // A single UCS-4 character requires up to four UTF-8 code units.
1019   int max = 4;
1020   if (_M_mode & consume_header)
1021     max += sizeof(utf8_bom);
1022   return max;
1023 }
1024
1025 #ifdef _GLIBCXX_USE_WCHAR_T
1026
// Check that wchar_t has the same size as the character type whose
// pointers it is reinterpret_cast to in the wchar_t members below.
#if __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
#elif __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
#endif
1032
1033 // Define members of codecvt_utf8<wchar_t> base class implementation.
1034 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1035
1036 __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1037
1038 codecvt_base::result
1039 __codecvt_utf8_base<wchar_t>::
1040 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1041        const intern_type*& __from_next,
1042        extern_type* __to, extern_type* __to_end,
1043        extern_type*& __to_next) const
1044 {
1045   range<char> to{ __to, __to_end };
1046 #if __SIZEOF_WCHAR_T__ == 2
1047   range<const char16_t> from{
1048     reinterpret_cast<const char16_t*>(__from),
1049     reinterpret_cast<const char16_t*>(__from_end)
1050   };
1051   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1052 #elif __SIZEOF_WCHAR_T__ == 4
1053   range<const char32_t> from{
1054     reinterpret_cast<const char32_t*>(__from),
1055     reinterpret_cast<const char32_t*>(__from_end)
1056   };
1057   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1058 #else
1059   return codecvt_base::error;
1060 #endif
1061   __from_next = reinterpret_cast<const wchar_t*>(from.next);
1062   __to_next = to.next;
1063   return res;
1064 }
1065
1066 codecvt_base::result
1067 __codecvt_utf8_base<wchar_t>::
1068 do_unshift(state_type&, extern_type* __to, extern_type*,
1069            extern_type*& __to_next) const
1070 {
1071   __to_next = __to;
1072   return noconv;
1073 }
1074
1075 codecvt_base::result
1076 __codecvt_utf8_base<wchar_t>::
1077 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1078       const extern_type*& __from_next,
1079       intern_type* __to, intern_type* __to_end,
1080       intern_type*& __to_next) const
1081 {
1082   range<const char> from{ __from, __from_end };
1083 #if __SIZEOF_WCHAR_T__ == 2
1084   range<char16_t> to{
1085     reinterpret_cast<char16_t*>(__to),
1086     reinterpret_cast<char16_t*>(__to_end)
1087   };
1088 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1089   codecvt_mode mode = {};
1090 #else
1091   codecvt_mode mode = little_endian;
1092 #endif
1093   auto res = ucs2_in(from, to, _M_maxcode, mode);
1094 #elif __SIZEOF_WCHAR_T__ == 4
1095   range<char32_t> to{
1096     reinterpret_cast<char32_t*>(__to),
1097     reinterpret_cast<char32_t*>(__to_end)
1098   };
1099   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1100 #else
1101   return codecvt_base::error;
1102 #endif
1103   __from_next = from.next;
1104   __to_next = reinterpret_cast<wchar_t*>(to.next);
1105   return res;
1106 }
1107
1108 int
1109 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
1110 { return 0; } // UTF-8 is not a fixed-width encoding
1111
1112 bool
1113 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1114 { return false; }
1115
1116 int
1117 __codecvt_utf8_base<wchar_t>::
1118 do_length(state_type&, const extern_type* __from,
1119           const extern_type* __end, size_t __max) const
1120 {
1121 #if __SIZEOF_WCHAR_T__ == 2
1122   __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1123 #elif __SIZEOF_WCHAR_T__ == 4
1124   __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1125 #else
1126   __end = __from;
1127 #endif
1128   return __end - __from;
1129 }
1130
1131 int
1132 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
1133 {
1134 #if __SIZEOF_WCHAR_T__ == 2
1135   int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1136 #else
1137   int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1138 #endif
1139   if (_M_mode & consume_header)
1140     max += sizeof(utf8_bom);
1141   return max;
1142 }
1143 #endif
1144
1145 // Define members of codecvt_utf16<char16_t> base class implementation.
1146 // Converts from UTF-16 to UCS-2.
1147
1148 __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1149
1150 codecvt_base::result
1151 __codecvt_utf16_base<char16_t>::
1152 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1153        const intern_type*& __from_next,
1154        extern_type* __to, extern_type* __to_end,
1155        extern_type*& __to_next) const
1156 {
1157   range<const char16_t> from{ __from, __from_end };
1158   range<char16_t, false> to{ __to, __to_end };
1159   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1160   __from_next = from.next;
1161   __to_next = reinterpret_cast<char*>(to.next);
1162   return res;
1163 }
1164
1165 codecvt_base::result
1166 __codecvt_utf16_base<char16_t>::
1167 do_unshift(state_type&, extern_type* __to, extern_type*,
1168            extern_type*& __to_next) const
1169 {
1170   __to_next = __to;
1171   return noconv;
1172 }
1173
1174 codecvt_base::result
1175 __codecvt_utf16_base<char16_t>::
1176 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1177       const extern_type*& __from_next,
1178       intern_type* __to, intern_type* __to_end,
1179       intern_type*& __to_next) const
1180 {
1181   range<const char16_t, false> from{ __from, __from_end };
1182   range<char16_t> to{ __to, __to_end };
1183   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1184   __from_next = reinterpret_cast<const char*>(from.next);
1185   __to_next = to.next;
1186   if (res == codecvt_base::ok && __from_next != __from_end)
1187     res = codecvt_base::error;
1188   return res;
1189 }
1190
1191 int
1192 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
1193 { return 0; } // UTF-16 is not a fixed-width encoding
1194
1195 bool
1196 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1197 { return false; }
1198
1199 int
1200 __codecvt_utf16_base<char16_t>::
1201 do_length(state_type&, const extern_type* __from,
1202           const extern_type* __end, size_t __max) const
1203 {
1204   range<const char16_t, false> from{ __from, __end };
1205   const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1206   return reinterpret_cast<const char*>(next) - __from;
1207 }
1208
1209 int
1210 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
1211 {
1212   // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1213   // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1214   int max = 2;
1215   if (_M_mode & consume_header)
1216     max += sizeof(utf16_bom);
1217   return max;
1218 }
1219
1220 // Define members of codecvt_utf16<char32_t> base class implementation.
1221 // Converts from UTF-16 to UTF-32 (aka UCS-4).
1222
1223 __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1224
1225 codecvt_base::result
1226 __codecvt_utf16_base<char32_t>::
1227 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1228        const intern_type*& __from_next,
1229        extern_type* __to, extern_type* __to_end,
1230        extern_type*& __to_next) const
1231 {
1232   range<const char32_t> from{ __from, __from_end };
1233   range<char16_t, false> to{ __to, __to_end };
1234   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1235   __from_next = from.next;
1236   __to_next = reinterpret_cast<char*>(to.next);
1237   return res;
1238 }
1239
1240 codecvt_base::result
1241 __codecvt_utf16_base<char32_t>::
1242 do_unshift(state_type&, extern_type* __to, extern_type*,
1243            extern_type*& __to_next) const
1244 {
1245   __to_next = __to;
1246   return noconv;
1247 }
1248
1249 codecvt_base::result
1250 __codecvt_utf16_base<char32_t>::
1251 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1252       const extern_type*& __from_next,
1253       intern_type* __to, intern_type* __to_end,
1254       intern_type*& __to_next) const
1255 {
1256   range<const char16_t, false> from{ __from, __from_end };
1257   range<char32_t> to{ __to, __to_end };
1258   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1259   __from_next = reinterpret_cast<const char*>(from.next);
1260   __to_next = to.next;
1261   if (res == codecvt_base::ok && __from_next != __from_end)
1262     res = codecvt_base::error;
1263   return res;
1264 }
1265
1266 int
1267 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
1268 { return 0; } // UTF-16 is not a fixed-width encoding
1269
1270 bool
1271 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1272 { return false; }
1273
1274 int
1275 __codecvt_utf16_base<char32_t>::
1276 do_length(state_type&, const extern_type* __from,
1277           const extern_type* __end, size_t __max) const
1278 {
1279   range<const char16_t, false> from{ __from, __end };
1280   const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1281   return reinterpret_cast<const char*>(next) - __from;
1282 }
1283
1284 int
1285 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
1286 {
1287   // A single UCS-4 character requires one or two UTF-16 code units
1288   // (so up to four chars).
1289   int max = 4;
1290   if (_M_mode & consume_header)
1291     max += sizeof(utf16_bom);
1292   return max;
1293 }
1294
1295 #ifdef _GLIBCXX_USE_WCHAR_T
1296 // Define members of codecvt_utf16<wchar_t> base class implementation.
// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1298
1299 __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1300
1301 codecvt_base::result
1302 __codecvt_utf16_base<wchar_t>::
1303 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1304        const intern_type*& __from_next,
1305        extern_type* __to, extern_type* __to_end,
1306        extern_type*& __to_next) const
1307 {
1308   range<char16_t, false> to{ __to, __to_end };
1309 #if __SIZEOF_WCHAR_T__ == 2
1310   range<const char16_t> from{
1311     reinterpret_cast<const char16_t*>(__from),
1312     reinterpret_cast<const char16_t*>(__from_end),
1313   };
1314   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1315 #elif __SIZEOF_WCHAR_T__ == 4
1316   range<const char32_t> from{
1317     reinterpret_cast<const char32_t*>(__from),
1318     reinterpret_cast<const char32_t*>(__from_end),
1319   };
1320   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1321 #else
1322   return codecvt_base::error;
1323 #endif
1324   __from_next = reinterpret_cast<const wchar_t*>(from.next);
1325   __to_next = reinterpret_cast<char*>(to.next);
1326   return res;
1327 }
1328
1329 codecvt_base::result
1330 __codecvt_utf16_base<wchar_t>::
1331 do_unshift(state_type&, extern_type* __to, extern_type*,
1332            extern_type*& __to_next) const
1333 {
1334   __to_next = __to;
1335   return noconv;
1336 }
1337
1338 codecvt_base::result
1339 __codecvt_utf16_base<wchar_t>::
1340 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1341       const extern_type*& __from_next,
1342       intern_type* __to, intern_type* __to_end,
1343       intern_type*& __to_next) const
1344 {
1345   range<const char16_t, false> from{ __from, __from_end };
1346 #if __SIZEOF_WCHAR_T__ == 2
1347   range<char16_t> to{
1348     reinterpret_cast<char16_t*>(__to),
1349     reinterpret_cast<char16_t*>(__to_end),
1350   };
1351   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1352 #elif __SIZEOF_WCHAR_T__ == 4
1353   range<char32_t> to{
1354     reinterpret_cast<char32_t*>(__to),
1355     reinterpret_cast<char32_t*>(__to_end),
1356   };
1357   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1358 #else
1359   return codecvt_base::error;
1360 #endif
1361   __from_next = reinterpret_cast<const char*>(from.next);
1362   __to_next = reinterpret_cast<wchar_t*>(to.next);
1363   if (res == codecvt_base::ok && __from_next != __from_end)
1364     res = codecvt_base::error;
1365   return res;
1366 }
1367
1368 int
1369 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1370 { return 0; } // UTF-16 is not a fixed-width encoding
1371
1372 bool
1373 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1374 { return false; }
1375
1376 int
1377 __codecvt_utf16_base<wchar_t>::
1378 do_length(state_type&, const extern_type* __from,
1379           const extern_type* __end, size_t __max) const
1380 {
1381   range<const char16_t, false> from{ __from, __end };
1382 #if __SIZEOF_WCHAR_T__ == 2
1383   const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1384 #elif __SIZEOF_WCHAR_T__ == 4
1385   const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1386 #endif
1387   return reinterpret_cast<const char*>(next) - __from;
1388 }
1389
1390 int
1391 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1392 {
1393 #if __SIZEOF_WCHAR_T__ == 2
1394   int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1395 #else
1396   int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1397 #endif
1398   if (_M_mode & consume_header)
1399     max += sizeof(utf16_bom);
1400   return max;
1401 }
1402 #endif
1403
1404 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1405 // Converts from UTF-8 to UTF-16.
1406
1407 __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1408
1409 codecvt_base::result
1410 __codecvt_utf8_utf16_base<char16_t>::
1411 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1412        const intern_type*& __from_next,
1413        extern_type* __to, extern_type* __to_end,
1414        extern_type*& __to_next) const
1415 {
1416   range<const char16_t> from{ __from, __from_end };
1417   range<char> to{ __to, __to_end };
1418   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1419   __from_next = from.next;
1420   __to_next = to.next;
1421   return res;
1422 }
1423
1424 codecvt_base::result
1425 __codecvt_utf8_utf16_base<char16_t>::
1426 do_unshift(state_type&, extern_type* __to, extern_type*,
1427            extern_type*& __to_next) const
1428 {
1429   __to_next = __to;
1430   return noconv;
1431 }
1432
1433 codecvt_base::result
1434 __codecvt_utf8_utf16_base<char16_t>::
1435 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1436       const extern_type*& __from_next,
1437       intern_type* __to, intern_type* __to_end,
1438       intern_type*& __to_next) const
1439 {
1440   range<const char> from{ __from, __from_end };
1441   range<char16_t> to{ __to, __to_end };
1442   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1443 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1444   mode = codecvt_mode(mode | little_endian);
1445 #endif
1446   auto res = utf16_in(from, to, _M_maxcode, mode);
1447   __from_next = from.next;
1448   __to_next = to.next;
1449   return res;
1450 }
1451
1452 int
1453 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
1454 { return 0; } // UTF-8 is not a fixed-width encoding
1455
1456 bool
1457 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1458 { return false; }
1459
1460 int
1461 __codecvt_utf8_utf16_base<char16_t>::
1462 do_length(state_type&, const extern_type* __from,
1463           const extern_type* __end, size_t __max) const
1464 {
1465   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1466   return __end - __from;
1467 }
1468
1469 int
1470 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1471 {
1472   // A single character can be 1 or 2 UTF-16 code units,
1473   // requiring up to 4 UTF-8 code units.
1474   int max = 4;
1475   if (_M_mode & consume_header)
1476     max += sizeof(utf8_bom);
1477   return max;
1478 }
1479
1480 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1481 // Converts from UTF-8 to UTF-16.
1482
1483 __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1484
1485 codecvt_base::result
1486 __codecvt_utf8_utf16_base<char32_t>::
1487 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1488        const intern_type*& __from_next,
1489        extern_type* __to, extern_type* __to_end,
1490        extern_type*& __to_next) const
1491 {
1492   range<const char32_t> from{ __from, __from_end };
1493   range<char> to{ __to, __to_end };
1494   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1495   __from_next = from.next;
1496   __to_next = to.next;
1497   return res;
1498 }
1499
1500 codecvt_base::result
1501 __codecvt_utf8_utf16_base<char32_t>::
1502 do_unshift(state_type&, extern_type* __to, extern_type*,
1503            extern_type*& __to_next) const
1504 {
1505   __to_next = __to;
1506   return noconv;
1507 }
1508
1509 codecvt_base::result
1510 __codecvt_utf8_utf16_base<char32_t>::
1511 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1512       const extern_type*& __from_next,
1513       intern_type* __to, intern_type* __to_end,
1514       intern_type*& __to_next) const
1515 {
1516   range<const char> from{ __from, __from_end };
1517   range<char32_t> to{ __to, __to_end };
1518   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1519 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1520   mode = codecvt_mode(mode | little_endian);
1521 #endif
1522   auto res = utf16_in(from, to, _M_maxcode, mode);
1523   __from_next = from.next;
1524   __to_next = to.next;
1525   return res;
1526 }
1527
1528 int
1529 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
1530 { return 0; } // UTF-8 is not a fixed-width encoding
1531
1532 bool
1533 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1534 { return false; }
1535
1536 int
1537 __codecvt_utf8_utf16_base<char32_t>::
1538 do_length(state_type&, const extern_type* __from,
1539           const extern_type* __end, size_t __max) const
1540 {
1541   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1542   return __end - __from;
1543 }
1544
1545 int
1546 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1547 {
1548   // A single character can be 1 or 2 UTF-16 code units,
1549   // requiring up to 4 UTF-8 code units.
1550   int max = 4;
1551   if (_M_mode & consume_header)
1552     max += sizeof(utf8_bom);
1553   return max;
1554 }
1555
1556 #ifdef _GLIBCXX_USE_WCHAR_T
1557 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1558 // Converts from UTF-8 to UTF-16.
1559
1560 __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1561
1562 codecvt_base::result
1563 __codecvt_utf8_utf16_base<wchar_t>::
1564 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1565        const intern_type*& __from_next,
1566        extern_type* __to, extern_type* __to_end,
1567        extern_type*& __to_next) const
1568 {
1569   range<const wchar_t> from{ __from, __from_end };
1570   range<char> to{ __to, __to_end };
1571   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1572   __from_next = from.next;
1573   __to_next = to.next;
1574   return res;
1575 }
1576
1577 codecvt_base::result
1578 __codecvt_utf8_utf16_base<wchar_t>::
1579 do_unshift(state_type&, extern_type* __to, extern_type*,
1580            extern_type*& __to_next) const
1581 {
1582   __to_next = __to;
1583   return noconv;
1584 }
1585
1586 codecvt_base::result
1587 __codecvt_utf8_utf16_base<wchar_t>::
1588 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1589       const extern_type*& __from_next,
1590       intern_type* __to, intern_type* __to_end,
1591       intern_type*& __to_next) const
1592 {
1593   range<const char> from{ __from, __from_end };
1594   range<wchar_t> to{ __to, __to_end };
1595   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1596 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1597   mode = codecvt_mode(mode | little_endian);
1598 #endif
1599   auto res = utf16_in(from, to, _M_maxcode, mode);
1600   __from_next = from.next;
1601   __to_next = to.next;
1602   return res;
1603 }
1604
1605 int
1606 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
1607 { return 0; } // UTF-8 is not a fixed-width encoding
1608
1609 bool
1610 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1611 { return false; }
1612
1613 int
1614 __codecvt_utf8_utf16_base<wchar_t>::
1615 do_length(state_type&, const extern_type* __from,
1616           const extern_type* __end, size_t __max) const
1617 {
1618   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1619   return __end - __from;
1620 }
1621
1622 int
1623 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1624 {
1625   // A single character can be 1 or 2 UTF-16 code units,
1626   // requiring up to 4 UTF-8 code units.
1627   int max = 4;
1628   if (_M_mode & consume_header)
1629     max += sizeof(utf8_bom);
1630   return max;
1631 }
1632 #endif
1633
// Explicit instantiations for the char16_t and char32_t facets.
// NOTE(review): "inline template" is a GNU extension; per the GCC manual
// it instantiates only the compiler support data for the class (e.g. the
// vtable), not its members -- confirm this is the intent here.
inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
template class codecvt_byname<char16_t, char, mbstate_t>;
template class codecvt_byname<char32_t, char, mbstate_t>;
1638
1639 _GLIBCXX_END_NAMESPACE_VERSION
1640 }