libstdc++-v3/src/c++11/codecvt.cc

   1 // Locale support (codecvt) -*- C++ -*-
   2
   3 // Copyright (C) 2015-2018 Free Software Foundation, Inc.
   4 //
   5 // This file is part of the GNU ISO C++ Library.  This library is free
   6 // software; you can redistribute it and/or modify it under the
   7 // terms of the GNU General Public License as published by the
   8 // Free Software Foundation; either version 3, or (at your option)
   9 // any later version.
  10
  11 // This library is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // Under Section 7 of GPL version 3, you are granted additional
  17 // permissions described in the GCC Runtime Library Exception, version
  18 // 3.1, as published by the Free Software Foundation.
  19
  20 // You should have received a copy of the GNU General Public License and
  21 // a copy of the GCC Runtime Library Exception along with this program;
  22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 // <http://www.gnu.org/licenses/>.
  24
  25 #include <codecvt>
  26 #include <cstring>              // std::memcpy, std::memcmp
  27 #include <bits/stl_algobase.h>  // std::min
  28
  29 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
  30 namespace std _GLIBCXX_VISIBILITY(default)
  31 {
  32 _GLIBCXX_BEGIN_NAMESPACE_VERSION
  33
  34   // The standard doesn't define these operators, which is annoying.
  35   static underlying_type<codecvt_mode>::type
  36   to_integer(codecvt_mode m)
  37   { return static_cast<underlying_type<codecvt_mode>::type>(m); }
  38
  39   static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
  40   { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
  41
  42   static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
  43   { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
  44
  45   static codecvt_mode operator~(codecvt_mode m)
  46   { return codecvt_mode(~to_integer(m)); }
  47
  48 namespace
  49 {
  // Largest code point that can be stored in one UTF-16 code unit.
  constexpr char32_t max_single_utf16_unit = 0xFFFF;

  // Default upper limit for the code points the conversions accept.
  constexpr char32_t max_code_point = 0x10FFFF;

  // Sentinel values returned by the code point readers.  The functions
  // below rely on maxcode < incomplete_mb_character (which is enforced by
  // the codecvt_utf* classes on construction), so that both sentinels
  // compare greater than any permitted maxcode.
  constexpr char32_t incomplete_mb_character = char32_t(-2);
  constexpr char32_t invalid_mb_sequence = char32_t(-1);
  59
  // Cursor over the half-open sequence [next, end) of code units of type
  // Elem, used both for reading from and for writing to a buffer.
  // This primary template accesses the units directly through Elem*.
  template<typename Elem, bool Aligned = true>
    struct range
    {
      Elem* next;
      Elem* end;

      // Store e at the current position, then step past it.
      range& operator=(Elem e)
      {
        *next = e;
        ++next;
        return *this;
      }

      // The code unit at the current position.
      Elem operator*() const { return next[0]; }

      // The code unit n positions after the current one.
      Elem operator[](size_t n) const { return *(next + n); }

      // Step over one code unit.
      range& operator++()
      {
        next += 1;
        return *this;
      }

      // Step over n code units.
      range& operator+=(size_t n)
      {
        next = next + n;
        return *this;
      }

      // How many code units are left before end.
      size_t size() const { return end - next; }

      // How many bytes are left before end.
      size_t nbytes() const
      {
        return reinterpret_cast<const char*>(end)
          - reinterpret_cast<const char*>(next);
      }
    };
 101
 102   // This specialization is used when accessing char16_t values through
 103   // pointers to char, which might not be correctly aligned for char16_t.
 104   template<typename Elem>
 105     struct range<Elem, false>
 106     {
 107       using value_type = typename remove_const<Elem>::type;
 108
 109       using char_pointer = typename
 110         conditional<is_const<Elem>::value, const char*, char*>::type;
 111
 112       char_pointer next;
 113       char_pointer end;
 114
 115       // Write a code unit.
 116       range& operator=(Elem e)
 117       {
 118         memcpy(next, &e, sizeof(Elem));
 119         ++*this;
 120         return *this;
 121       }
 122
 123       // Read the next code unit.
 124       Elem operator*() const
 125       {
 126         value_type e;
 127         memcpy(&e, next, sizeof(Elem));
 128         return e;
 129       }
 130
 131       // Read the Nth code unit.
 132       Elem operator[](size_t n) const
 133       {
 134         value_type e;
 135         memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
 136         return e;
 137       }
 138
 139       // Move to the next code unit.
 140       range& operator++()
 141       {
 142         next += sizeof(Elem);
 143         return *this;
 144       }
 145
 146       // Move to the Nth code unit.
 147       range& operator+=(size_t n)
 148       {
 149         next += n * sizeof(Elem);
 150         return *this;
 151       }
 152
 153       // The number of code units remaining.
 154       size_t size() const { return nbytes() / sizeof(Elem); }
 155
 156       // The number of bytes remaining.
 157       size_t nbytes() const { return end - next; }
 158     };
 159
 160   // Multibyte sequences can have "header" consisting of Byte Order Mark
 161   const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
 162   const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
 163   const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
 164
 165   // Write a BOM (space permitting).
 166   template<typename C, bool A, size_t N>
 167     bool
 168     write_bom(range<C, A>& to, const unsigned char (&bom)[N])
 169     {
 170       static_assert( (N / sizeof(C)) != 0, "" );
 171       static_assert( (N % sizeof(C)) == 0, "" );
 172
 173       if (to.nbytes() < N)
 174         return false;
 175       memcpy(to.next, bom, N);
 176       to += (N / sizeof(C));
 177       return true;
 178     }
 179
 180   // Try to read a BOM.
 181   template<typename C, bool A, size_t N>
 182     bool
 183     read_bom(range<C, A>& from, const unsigned char (&bom)[N])
 184     {
 185       static_assert( (N / sizeof(C)) != 0, "" );
 186       static_assert( (N % sizeof(C)) == 0, "" );
 187
 188       if (from.nbytes() >= N && !memcmp(from.next, bom, N))
 189         {
 190           from += (N / sizeof(C));
 191           return true;
 192         }
 193       return false;
 194     }
 195
 196   // If generate_header is set in mode write out UTF-8 BOM.
 197   bool
 198   write_utf8_bom(range<char>& to, codecvt_mode mode)
 199   {
 200     if (mode & generate_header)
 201       return write_bom(to, utf8_bom);
 202     return true;
 203   }
 204
 205   // If generate_header is set in mode write out the UTF-16 BOM indicated
 206   // by whether little_endian is set in mode.
 207   template<bool Aligned>
 208   bool
 209   write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
 210   {
 211     if (mode & generate_header)
 212     {
 213       if (mode & little_endian)
 214         return write_bom(to, utf16le_bom);
 215       else
 216         return write_bom(to, utf16_bom);
 217     }
 218     return true;
 219   }
 220
 221   // If consume_header is set in mode update from.next to after any BOM.
 222   void
 223   read_utf8_bom(range<const char>& from, codecvt_mode mode)
 224   {
 225     if (mode & consume_header)
 226       read_bom(from, utf8_bom);
 227   }
 228
 229   // If consume_header is not set in mode, no effects.
 230   // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
 231   // - if the UTF-16BE BOM was found unset little_endian in mode, or
 232   // - if the UTF-16LE BOM was found set little_endian in mode.
 233   template<bool Aligned>
 234   void
 235   read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
 236   {
 237     if (mode & consume_header)
 238       {
 239         if (read_bom(from, utf16_bom))
 240           mode &= ~little_endian;
 241         else if (read_bom(from, utf16le_bom))
 242           mode |= little_endian;
 243       }
 244   }
 245
 246   // Read a codepoint from a UTF-8 multibyte sequence.
 247   // Updates from.next if the codepoint is not greater than maxcode.
 248   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
 249   char32_t
 250   read_utf8_code_point(range<const char>& from, unsigned long maxcode)
 251   {
 252     const size_t avail = from.size();
 253     if (avail == 0)
 254       return incomplete_mb_character;
 255     unsigned char c1 = from[0];
 256     // https://en.wikipedia.org/wiki/UTF-8#Sample_code
 257     if (c1 < 0x80)
 258     {
 259       ++from;
 260       return c1;
 261     }
 262     else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
 263       return invalid_mb_sequence;
 264     else if (c1 < 0xE0) // 2-byte sequence
 265     {
 266       if (avail < 2)
 267         return incomplete_mb_character;
 268       unsigned char c2 = from[1];
 269       if ((c2 & 0xC0) != 0x80)
 270         return invalid_mb_sequence;
 271       char32_t c = (c1 << 6) + c2 - 0x3080;
 272       if (c <= maxcode)
 273         from += 2;
 274       return c;
 275     }
 276     else if (c1 < 0xF0) // 3-byte sequence
 277     {
 278       if (avail < 3)
 279         return incomplete_mb_character;
 280       unsigned char c2 = from[1];
 281       if ((c2 & 0xC0) != 0x80)
 282         return invalid_mb_sequence;
 283       if (c1 == 0xE0 && c2 < 0xA0) // overlong
 284         return invalid_mb_sequence;
 285       unsigned char c3 = from[2];
 286       if ((c3 & 0xC0) != 0x80)
 287         return invalid_mb_sequence;
 288       char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
 289       if (c <= maxcode)
 290         from += 3;
 291       return c;
 292     }
 293     else if (c1 < 0xF5) // 4-byte sequence
 294     {
 295       if (avail < 4)
 296         return incomplete_mb_character;
 297       unsigned char c2 = from[1];
 298       if ((c2 & 0xC0) != 0x80)
 299         return invalid_mb_sequence;
 300       if (c1 == 0xF0 && c2 < 0x90) // overlong
 301         return invalid_mb_sequence;
 302       if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
 303       return invalid_mb_sequence;
 304       unsigned char c3 = from[2];
 305       if ((c3 & 0xC0) != 0x80)
 306         return invalid_mb_sequence;
 307       unsigned char c4 = from[3];
 308       if ((c4 & 0xC0) != 0x80)
 309         return invalid_mb_sequence;
 310       char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
 311       if (c <= maxcode)
 312         from += 4;
 313       return c;
 314     }
 315     else // > U+10FFFF
 316       return invalid_mb_sequence;
 317   }
 318
 319   bool
 320   write_utf8_code_point(range<char>& to, char32_t code_point)
 321   {
 322     if (code_point < 0x80)
 323       {
 324         if (to.size() < 1)
 325           return false;
 326         to = code_point;
 327       }
 328     else if (code_point <= 0x7FF)
 329       {
 330         if (to.size() < 2)
 331           return false;
 332         to = (code_point >> 6) + 0xC0;
 333         to = (code_point & 0x3F) + 0x80;
 334       }
 335     else if (code_point <= 0xFFFF)
 336       {
 337         if (to.size() < 3)
 338           return false;
 339         to = (code_point >> 12) + 0xE0;
 340         to = ((code_point >> 6) & 0x3F) + 0x80;
 341         to = (code_point & 0x3F) + 0x80;
 342       }
 343     else if (code_point <= 0x10FFFF)
 344       {
 345         if (to.size() < 4)
 346           return false;
 347         to = (code_point >> 18) + 0xF0;
 348         to = ((code_point >> 12) & 0x3F) + 0x80;
 349         to = ((code_point >> 6) & 0x3F) + 0x80;
 350         to = (code_point & 0x3F) + 0x80;
 351       }
 352     else
 353       return false;
 354     return true;
 355   }
 356
 357   inline char16_t
 358   adjust_byte_order(char16_t c, codecvt_mode mode)
 359   {
 360 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 361     return (mode & little_endian) ? __builtin_bswap16(c) : c;
 362 #else
 363     return (mode & little_endian) ? c : __builtin_bswap16(c);
 364 #endif
 365   }
 366
 367   // Return true if c is a high-surrogate (aka leading) code point.
 368   inline bool
 369   is_high_surrogate(char32_t c)
 370   {
 371     return c >= 0xD800 && c <= 0xDBFF;
 372   }
 373
 374   // Return true if c is a low-surrogate (aka trailing) code point.
 375   inline bool
 376   is_low_surrogate(char32_t c)
 377   {
 378     return c >= 0xDC00 && c <= 0xDFFF;
 379   }
 380
 381   inline char32_t
 382   surrogate_pair_to_code_point(char32_t high, char32_t low)
 383   {
 384     return (high << 10) + low - 0x35FDC00;
 385   }
 386
 387   // Read a codepoint from a UTF-16 multibyte sequence.
 388   // The sequence's endianness is indicated by (mode & little_endian).
 389   // Updates from.next if the codepoint is not greater than maxcode.
 390   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
 391   template<bool Aligned>
 392     char32_t
 393     read_utf16_code_point(range<const char16_t, Aligned>& from,
 394                           unsigned long maxcode, codecvt_mode mode)
 395     {
 396       const size_t avail = from.size();
 397       if (avail == 0)
 398         return incomplete_mb_character;
 399       int inc = 1;
 400       char32_t c = adjust_byte_order(from[0], mode);
 401       if (is_high_surrogate(c))
 402         {
 403           if (avail < 2)
 404             return incomplete_mb_character;
 405           const char16_t c2 = adjust_byte_order(from[1], mode);
 406           if (is_low_surrogate(c2))
 407             {
 408               c = surrogate_pair_to_code_point(c, c2);
 409               inc = 2;
 410             }
 411           else
 412             return invalid_mb_sequence;
 413         }
 414       else if (is_low_surrogate(c))
 415         return invalid_mb_sequence;
 416       if (c <= maxcode)
 417         from += inc;
 418       return c;
 419     }
 420
 421   template<typename C, bool A>
 422   bool
 423   write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
 424   {
 425     static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
 426
 427     if (codepoint <= max_single_utf16_unit)
 428       {
 429         if (to.size() > 0)
 430           {
 431             to = adjust_byte_order(codepoint, mode);
 432             return true;
 433           }
 434       }
 435     else if (to.size() > 1)
 436       {
 437         // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
 438         const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
 439         char16_t lead = LEAD_OFFSET + (codepoint >> 10);
 440         char16_t trail = 0xDC00 + (codepoint & 0x3FF);
 441         to = adjust_byte_order(lead, mode);
 442         to = adjust_byte_order(trail, mode);
 443         return true;
 444       }
 445     return false;
 446   }
 447
 448   // utf8 -> ucs4
 449   codecvt_base::result
 450   ucs4_in(range<const char>& from, range<char32_t>& to,
 451           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 452   {
 453     read_utf8_bom(from, mode);
 454     while (from.size() && to.size())
 455       {
 456         const char32_t codepoint = read_utf8_code_point(from, maxcode);
 457         if (codepoint == incomplete_mb_character)
 458           return codecvt_base::partial;
 459         if (codepoint > maxcode)
 460           return codecvt_base::error;
 461         to = codepoint;
 462       }
 463     return from.size() ? codecvt_base::partial : codecvt_base::ok;
 464   }
 465
 466   // ucs4 -> utf8
 467   codecvt_base::result
 468   ucs4_out(range<const char32_t>& from, range<char>& to,
 469            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 470   {
 471     if (!write_utf8_bom(to, mode))
 472       return codecvt_base::partial;
 473     while (from.size())
 474       {
 475         const char32_t c = from[0];
 476         if (c > maxcode)
 477           return codecvt_base::error;
 478         if (!write_utf8_code_point(to, c))
 479           return codecvt_base::partial;
 480         ++from;
 481       }
 482     return codecvt_base::ok;
 483   }
 484
 485   // utf16 -> ucs4
 486   codecvt_base::result
 487   ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
 488           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 489   {
 490     read_utf16_bom(from, mode);
 491     while (from.size() && to.size())
 492       {
 493         const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
 494         if (codepoint == incomplete_mb_character)
 495           return codecvt_base::partial;
 496         if (codepoint > maxcode)
 497           return codecvt_base::error;
 498         to = codepoint;
 499       }
 500     return from.size() ? codecvt_base::partial : codecvt_base::ok;
 501   }
 502
 503   // ucs4 -> utf16
 504   codecvt_base::result
 505   ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
 506            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
 507   {
 508     if (!write_utf16_bom(to, mode))
 509       return codecvt_base::partial;
 510     while (from.size())
 511       {
 512         const char32_t c = from[0];
 513         if (c > maxcode)
 514           return codecvt_base::error;
 515         if (!write_utf16_code_point(to, c, mode))
 516           return codecvt_base::partial;
 517         ++from;
 518       }
 519     return codecvt_base::ok;
 520   }
 521
 522   // Flag indicating whether to process UTF-16 or UCS2
 523   enum class surrogates { allowed, disallowed };
 524
 525   // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
 526   template<typename C>
 527   codecvt_base::result
 528   utf16_in(range<const char>& from, range<C>& to,
 529            unsigned long maxcode = max_code_point, codecvt_mode mode = {},
 530            surrogates s = surrogates::allowed)
 531   {
 532     read_utf8_bom(from, mode);
 533     while (from.size() && to.size())
 534       {
 535         auto orig = from;
 536         const char32_t codepoint = read_utf8_code_point(from, maxcode);
 537         if (codepoint == incomplete_mb_character)
 538           {
 539             if (s == surrogates::allowed)
 540               return codecvt_base::partial;
 541             else
 542               return codecvt_base::error; // No surrogates in UCS2
 543           }
 544         if (codepoint > maxcode)
 545           return codecvt_base::error;
 546         if (!write_utf16_code_point(to, codepoint, mode))
 547           {
 548             from = orig; // rewind to previous position
 549             return codecvt_base::partial;
 550           }
 551       }
 552     return codecvt_base::ok;
 553   }
 554
 555   // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
 556   template<typename C>
 557   codecvt_base::result
 558   utf16_out(range<const C>& from, range<char>& to,
 559             unsigned long maxcode = max_code_point, codecvt_mode mode = {},
 560             surrogates s = surrogates::allowed)
 561   {
 562     if (!write_utf8_bom(to, mode))
 563       return codecvt_base::partial;
 564     while (from.size())
 565       {
 566         char32_t c = from[0];
 567         int inc = 1;
 568         if (is_high_surrogate(c))
 569           {
 570             if (s == surrogates::disallowed)
 571               return codecvt_base::error; // No surrogates in UCS-2
 572
 573             if (from.size() < 2)
 574               return codecvt_base::ok; // stop converting at this point
 575
 576             const char32_t c2 = from[1];
 577             if (is_low_surrogate(c2))
 578               {
 579                 c = surrogate_pair_to_code_point(c, c2);
 580                 inc = 2;
 581               }
 582             else
 583               return codecvt_base::error;
 584           }
 585         else if (is_low_surrogate(c))
 586           return codecvt_base::error;
 587         if (c > maxcode)
 588           return codecvt_base::error;
 589         if (!write_utf8_code_point(to, c))
 590           return codecvt_base::partial;
 591         from += inc;
 592       }
 593     return codecvt_base::ok;
 594   }
 595
 596   // return pos such that [begin,pos) is valid UTF-16 string no longer than max
 597   const char*
 598   utf16_span(const char* begin, const char* end, size_t max,
 599              char32_t maxcode = max_code_point, codecvt_mode mode = {})
 600   {
 601     range<const char> from{ begin, end };
 602     read_utf8_bom(from, mode);
 603     size_t count = 0;
 604     while (count+1 < max)
 605       {
 606         char32_t c = read_utf8_code_point(from, maxcode);
 607         if (c > maxcode)
 608           return from.next;
 609         else if (c > max_single_utf16_unit)
 610           ++count;
 611         ++count;
 612       }
 613     if (count+1 == max) // take one more character if it fits in a single unit
 614       read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
 615     return from.next;
 616   }
 617
 618   // utf8 -> ucs2
 619   codecvt_base::result
 620   ucs2_in(range<const char>& from, range<char16_t>& to,
 621           char32_t maxcode = max_code_point, codecvt_mode mode = {})
 622   {
 623     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 624     maxcode = std::min(max_single_utf16_unit, maxcode);
 625     return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
 626   }
 627
 628   // ucs2 -> utf8
 629   codecvt_base::result
 630   ucs2_out(range<const char16_t>& from, range<char>& to,
 631            char32_t maxcode = max_code_point, codecvt_mode mode = {})
 632   {
 633     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 634     maxcode = std::min(max_single_utf16_unit, maxcode);
 635     return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
 636   }
 637
 638   // ucs2 -> utf16
 639   codecvt_base::result
 640   ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
 641            char32_t maxcode = max_code_point, codecvt_mode mode = {})
 642   {
 643     if (!write_utf16_bom(to, mode))
 644       return codecvt_base::partial;
 645     while (from.size() && to.size())
 646       {
 647         char16_t c = from[0];
 648         if (is_high_surrogate(c))
 649           return codecvt_base::error;
 650         if (c > maxcode)
 651           return codecvt_base::error;
 652         to = adjust_byte_order(c, mode);
 653         ++from;
 654       }
 655     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
 656   }
 657
 658   // utf16 -> ucs2
 659   codecvt_base::result
 660   ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
 661           char32_t maxcode = max_code_point, codecvt_mode mode = {})
 662   {
 663     read_utf16_bom(from, mode);
 664     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 665     maxcode = std::min(max_single_utf16_unit, maxcode);
 666     while (from.size() && to.size())
 667       {
 668         const char32_t c = read_utf16_code_point(from, maxcode, mode);
 669         if (c == incomplete_mb_character)
 670           return codecvt_base::error; // UCS-2 only supports single units.
 671         if (c > maxcode)
 672           return codecvt_base::error;
 673         to = c;
 674       }
 675     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
 676   }
 677
 678   const char16_t*
 679   ucs2_span(range<const char16_t, false>& from, size_t max,
 680             char32_t maxcode, codecvt_mode mode)
 681   {
 682     read_utf16_bom(from, mode);
 683     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 684     maxcode = std::min(max_single_utf16_unit, maxcode);
 685     char32_t c = 0;
 686     while (max-- && c <= maxcode)
 687       c = read_utf16_code_point(from, maxcode, mode);
 688     return reinterpret_cast<const char16_t*>(from.next);
 689   }
 690
 691   const char*
 692   ucs2_span(const char* begin, const char* end, size_t max,
 693             char32_t maxcode, codecvt_mode mode)
 694   {
 695     range<const char> from{ begin, end };
 696     read_utf8_bom(from, mode);
 697     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
 698     maxcode = std::min(max_single_utf16_unit, maxcode);
 699     char32_t c = 0;
 700     while (max-- && c <= maxcode)
 701       c = read_utf8_code_point(from, maxcode);
 702     return from.next;
 703   }
 704
 705   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
 706   const char*
 707   ucs4_span(const char* begin, const char* end, size_t max,
 708             char32_t maxcode = max_code_point, codecvt_mode mode = {})
 709   {
 710     range<const char> from{ begin, end };
 711     read_utf8_bom(from, mode);
 712     char32_t c = 0;
 713     while (max-- && c <= maxcode)
 714       c = read_utf8_code_point(from, maxcode);
 715     return from.next;
 716   }
 717
 718   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
 719   const char16_t*
 720   ucs4_span(range<const char16_t, false>& from, size_t max,
 721             char32_t maxcode = max_code_point, codecvt_mode mode = {})
 722   {
 723     read_utf16_bom(from, mode);
 724     char32_t c = 0;
 725     while (max-- && c <= maxcode)
 726       c = read_utf16_code_point(from, maxcode, mode);
 727     return reinterpret_cast<const char16_t*>(from.next);
 728   }
 729 }
 730
 731 // Define members of codecvt<char16_t, char, mbstate_t> specialization.
 732 // Converts from UTF-8 to UTF-16.
 733
 734 locale::id codecvt<char16_t, char, mbstate_t>::id;
 735
 736 codecvt<char16_t, char, mbstate_t>::~codecvt() { }
 737
 738 codecvt_base::result
 739 codecvt<char16_t, char, mbstate_t>::
 740 do_out(state_type&,
 741        const intern_type* __from,
 742        const intern_type* __from_end, const intern_type*& __from_next,
 743        extern_type* __to, extern_type* __to_end,
 744        extern_type*& __to_next) const
 745 {
 746   range<const char16_t> from{ __from, __from_end };
 747   range<char> to{ __to, __to_end };
 748   auto res = utf16_out(from, to);
 749   __from_next = from.next;
 750   __to_next = to.next;
 751   return res;
 752 }
 753
 754 codecvt_base::result
 755 codecvt<char16_t, char, mbstate_t>::
 756 do_unshift(state_type&, extern_type* __to, extern_type*,
 757            extern_type*& __to_next) const
 758 {
 759   __to_next = __to;
 760   return noconv; // we don't use mbstate_t for the unicode facets
 761 }
 762
 763 codecvt_base::result
 764 codecvt<char16_t, char, mbstate_t>::
 765 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 766       const extern_type*& __from_next,
 767       intern_type* __to, intern_type* __to_end,
 768       intern_type*& __to_next) const
 769 {
 770   range<const char> from{ __from, __from_end };
 771   range<char16_t> to{ __to, __to_end };
 772 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 773   codecvt_mode mode = {};
 774 #else
 775   codecvt_mode mode = little_endian;
 776 #endif
 777   auto res = utf16_in(from, to, max_code_point, mode);
 778   __from_next = from.next;
 779   __to_next = to.next;
 780   return res;
 781 }
 782
 783 int
 784 codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
 785 { return 0; } // UTF-8 is not a fixed-width encoding
 786
 787 bool
 788 codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
 789 { return false; }
 790
 791 int
 792 codecvt<char16_t, char, mbstate_t>::
 793 do_length(state_type&, const extern_type* __from,
 794           const extern_type* __end, size_t __max) const
 795 {
 796   __end = utf16_span(__from, __end, __max);
 797   return __end - __from;
 798 }
 799
 800 int
 801 codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
 802 {
 803   // A single character (one or two UTF-16 code units) requires
 804   // up to four UTF-8 code units.
 805   return 4;
 806 }
 807
 808 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
 809 // Converts from UTF-8 to UTF-32 (aka UCS-4).
 810
 811 locale::id codecvt<char32_t, char, mbstate_t>::id;
 812
 813 codecvt<char32_t, char, mbstate_t>::~codecvt() { }
 814
 815 codecvt_base::result
 816 codecvt<char32_t, char, mbstate_t>::
 817 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
 818        const intern_type*& __from_next,
 819        extern_type* __to, extern_type* __to_end,
 820        extern_type*& __to_next) const
 821 {
 822   range<const char32_t> from{ __from, __from_end };
 823   range<char> to{ __to, __to_end };
 824   auto res = ucs4_out(from, to);
 825   __from_next = from.next;
 826   __to_next = to.next;
 827   return res;
 828 }
 829
 830 codecvt_base::result
 831 codecvt<char32_t, char, mbstate_t>::
 832 do_unshift(state_type&, extern_type* __to, extern_type*,
 833            extern_type*& __to_next) const
 834 {
 835   __to_next = __to;
 836   return noconv;
 837 }
 838
 839 codecvt_base::result
 840 codecvt<char32_t, char, mbstate_t>::
 841 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 842       const extern_type*& __from_next,
 843       intern_type* __to, intern_type* __to_end,
 844       intern_type*& __to_next) const
 845 {
 846   range<const char> from{ __from, __from_end };
 847   range<char32_t> to{ __to, __to_end };
 848   auto res = ucs4_in(from, to);
 849   __from_next = from.next;
 850   __to_next = to.next;
 851   return res;
 852 }
 853
 854 int
 855 codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
 856 { return 0; } // UTF-8 is not a fixed-width encoding
 857
 858 bool
 859 codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
 860 { return false; }
 861
 862 int
 863 codecvt<char32_t, char, mbstate_t>::
 864 do_length(state_type&, const extern_type* __from,
 865           const extern_type* __end, size_t __max) const
 866 {
 867   __end = ucs4_span(__from, __end, __max);
 868   return __end - __from;
 869 }
 870
 871 int
 872 codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
 873 {
 874   // A single character (one UTF-32 code unit) requires
 875   // up to 4 UTF-8 code units.
 876   return 4;
 877 }
 878
 879 // Define members of codecvt_utf8<char16_t> base class implementation.
 880 // Converts from UTF-8 to UCS-2.
 881
 882 __codecvt_utf8_base<char16_t>::~__codecvt_utf8_base() { }
 883
 884 codecvt_base::result
 885 __codecvt_utf8_base<char16_t>::
 886 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
 887        const intern_type*& __from_next,
 888        extern_type* __to, extern_type* __to_end,
 889        extern_type*& __to_next) const
 890 {
 891   range<const char16_t> from{ __from, __from_end };
 892   range<char> to{ __to, __to_end };
 893   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
 894   __from_next = from.next;
 895   __to_next = to.next;
 896   return res;
 897 }
 898
 899 codecvt_base::result
 900 __codecvt_utf8_base<char16_t>::
 901 do_unshift(state_type&, extern_type* __to, extern_type*,
 902            extern_type*& __to_next) const
 903 {
 904   __to_next = __to;
 905   return noconv;
 906 }
 907
 908 codecvt_base::result
 909 __codecvt_utf8_base<char16_t>::
 910 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 911       const extern_type*& __from_next,
 912       intern_type* __to, intern_type* __to_end,
 913       intern_type*& __to_next) const
 914 {
 915   range<const char> from{ __from, __from_end };
 916   range<char16_t> to{ __to, __to_end };
 917   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
 918 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
 919   mode = codecvt_mode(mode | little_endian);
 920 #endif
 921   auto res = ucs2_in(from, to, _M_maxcode, mode);
 922   __from_next = from.next;
 923   __to_next = to.next;
 924   return res;
 925 }
 926
 927 int
 928 __codecvt_utf8_base<char16_t>::do_encoding() const throw()
 929 { return 0; } // UTF-8 is not a fixed-width encoding
 930
 931 bool
 932 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
 933 { return false; }
 934
 935 int
 936 __codecvt_utf8_base<char16_t>::
 937 do_length(state_type&, const extern_type* __from,
 938           const extern_type* __end, size_t __max) const
 939 {
 940   __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
 941   return __end - __from;
 942 }
 943
 944 int
 945 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
 946 {
 947   // A single UCS-2 character requires up to three UTF-8 code units.
 948   // (UCS-2 cannot represent characters that use four UTF-8 code units).
 949   int max = 3;
 950   if (_M_mode & consume_header)
 951     max += sizeof(utf8_bom);
 952   return max;
 953 }
 954
 955 // Define members of codecvt_utf8<char32_t> base class implementation.
 956 // Converts from UTF-8 to UTF-32 (aka UCS-4).
 957
 958 __codecvt_utf8_base<char32_t>::~__codecvt_utf8_base() { }
 959
 960 codecvt_base::result
 961 __codecvt_utf8_base<char32_t>::
 962 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
 963        const intern_type*& __from_next,
 964        extern_type* __to, extern_type* __to_end,
 965        extern_type*& __to_next) const
 966 {
 967   range<const char32_t> from{ __from, __from_end };
 968   range<char> to{ __to, __to_end };
 969   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
 970   __from_next = from.next;
 971   __to_next = to.next;
 972   return res;
 973 }
 974
 975 codecvt_base::result
 976 __codecvt_utf8_base<char32_t>::
 977 do_unshift(state_type&, extern_type* __to, extern_type*,
 978            extern_type*& __to_next) const
 979 {
 980   __to_next = __to;
 981   return noconv;
 982 }
 983
 984 codecvt_base::result
 985 __codecvt_utf8_base<char32_t>::
 986 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 987       const extern_type*& __from_next,
 988       intern_type* __to, intern_type* __to_end,
 989       intern_type*& __to_next) const
 990 {
 991   range<const char> from{ __from, __from_end };
 992   range<char32_t> to{ __to, __to_end };
 993   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
 994   __from_next = from.next;
 995   __to_next = to.next;
 996   return res;
 997 }
 998
 999 int
1000 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
1001 { return 0; } // UTF-8 is not a fixed-width encoding
1002
1003 bool
1004 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
1005 { return false; }
1006
1007 int
1008 __codecvt_utf8_base<char32_t>::
1009 do_length(state_type&, const extern_type* __from,
1010           const extern_type* __end, size_t __max) const
1011 {
1012   __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1013   return __end - __from;
1014 }
1015
1016 int
1017 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
1018 {
1019   // A single UCS-4 character requires up to four UTF-8 code units.
1020   int max = 4;
1021   if (_M_mode & consume_header)
1022     max += sizeof(utf8_bom);
1023   return max;
1024 }
1025
1026 #ifdef _GLIBCXX_USE_WCHAR_T
1027
// The wchar_t facets below reinterpret wchar_t buffers as char16_t or
// char32_t buffers, so check that the sizes really agree.
#if __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
#elif __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
#endif
1033
1034 // Define members of codecvt_utf8<wchar_t> base class implementation.
1035 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1036
1037 __codecvt_utf8_base<wchar_t>::~__codecvt_utf8_base() { }
1038
1039 codecvt_base::result
1040 __codecvt_utf8_base<wchar_t>::
1041 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1042        const intern_type*& __from_next,
1043        extern_type* __to, extern_type* __to_end,
1044        extern_type*& __to_next) const
1045 {
1046   range<char> to{ __to, __to_end };
1047 #if __SIZEOF_WCHAR_T__ == 2
1048   range<const char16_t> from{
1049     reinterpret_cast<const char16_t*>(__from),
1050     reinterpret_cast<const char16_t*>(__from_end)
1051   };
1052   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1053 #elif __SIZEOF_WCHAR_T__ == 4
1054   range<const char32_t> from{
1055     reinterpret_cast<const char32_t*>(__from),
1056     reinterpret_cast<const char32_t*>(__from_end)
1057   };
1058   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1059 #else
1060   return codecvt_base::error;
1061 #endif
1062   __from_next = reinterpret_cast<const wchar_t*>(from.next);
1063   __to_next = to.next;
1064   return res;
1065 }
1066
1067 codecvt_base::result
1068 __codecvt_utf8_base<wchar_t>::
1069 do_unshift(state_type&, extern_type* __to, extern_type*,
1070            extern_type*& __to_next) const
1071 {
1072   __to_next = __to;
1073   return noconv;
1074 }
1075
1076 codecvt_base::result
1077 __codecvt_utf8_base<wchar_t>::
1078 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1079       const extern_type*& __from_next,
1080       intern_type* __to, intern_type* __to_end,
1081       intern_type*& __to_next) const
1082 {
1083   range<const char> from{ __from, __from_end };
1084 #if __SIZEOF_WCHAR_T__ == 2
1085   range<char16_t> to{
1086     reinterpret_cast<char16_t*>(__to),
1087     reinterpret_cast<char16_t*>(__to_end)
1088   };
1089   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1090 #elif __SIZEOF_WCHAR_T__ == 4
1091   range<char32_t> to{
1092     reinterpret_cast<char32_t*>(__to),
1093     reinterpret_cast<char32_t*>(__to_end)
1094   };
1095   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1096 #else
1097   return codecvt_base::error;
1098 #endif
1099   __from_next = from.next;
1100   __to_next = reinterpret_cast<wchar_t*>(to.next);
1101   return res;
1102 }
1103
1104 int
1105 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
1106 { return 0; } // UTF-8 is not a fixed-width encoding
1107
1108 bool
1109 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
1110 { return false; }
1111
1112 int
1113 __codecvt_utf8_base<wchar_t>::
1114 do_length(state_type&, const extern_type* __from,
1115           const extern_type* __end, size_t __max) const
1116 {
1117 #if __SIZEOF_WCHAR_T__ == 2
1118   __end = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
1119 #elif __SIZEOF_WCHAR_T__ == 4
1120   __end = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
1121 #else
1122   __end = __from;
1123 #endif
1124   return __end - __from;
1125 }
1126
1127 int
1128 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
1129 {
1130 #if __SIZEOF_WCHAR_T__ == 2
1131   int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
1132 #else
1133   int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
1134 #endif
1135   if (_M_mode & consume_header)
1136     max += sizeof(utf8_bom);
1137   return max;
1138 }
1139 #endif
1140
1141 // Define members of codecvt_utf16<char16_t> base class implementation.
1142 // Converts from UTF-16 to UCS-2.
1143
1144 __codecvt_utf16_base<char16_t>::~__codecvt_utf16_base() { }
1145
1146 codecvt_base::result
1147 __codecvt_utf16_base<char16_t>::
1148 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1149        const intern_type*& __from_next,
1150        extern_type* __to, extern_type* __to_end,
1151        extern_type*& __to_next) const
1152 {
1153   range<const char16_t> from{ __from, __from_end };
1154   range<char16_t, false> to{ __to, __to_end };
1155   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1156   __from_next = from.next;
1157   __to_next = reinterpret_cast<char*>(to.next);
1158   return res;
1159 }
1160
1161 codecvt_base::result
1162 __codecvt_utf16_base<char16_t>::
1163 do_unshift(state_type&, extern_type* __to, extern_type*,
1164            extern_type*& __to_next) const
1165 {
1166   __to_next = __to;
1167   return noconv;
1168 }
1169
1170 codecvt_base::result
1171 __codecvt_utf16_base<char16_t>::
1172 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1173       const extern_type*& __from_next,
1174       intern_type* __to, intern_type* __to_end,
1175       intern_type*& __to_next) const
1176 {
1177   range<const char16_t, false> from{ __from, __from_end };
1178   range<char16_t> to{ __to, __to_end };
1179   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1180   __from_next = reinterpret_cast<const char*>(from.next);
1181   __to_next = to.next;
1182   if (res == codecvt_base::ok && __from_next != __from_end)
1183     res = codecvt_base::error;
1184   return res;
1185 }
1186
1187 int
1188 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
1189 { return 0; } // UTF-16 is not a fixed-width encoding
1190
1191 bool
1192 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
1193 { return false; }
1194
1195 int
1196 __codecvt_utf16_base<char16_t>::
1197 do_length(state_type&, const extern_type* __from,
1198           const extern_type* __end, size_t __max) const
1199 {
1200   range<const char16_t, false> from{ __from, __end };
1201   const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1202   return reinterpret_cast<const char*>(next) - __from;
1203 }
1204
1205 int
1206 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
1207 {
1208   // A single UCS-2 character requires one UTF-16 code unit (so two chars).
1209   // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
1210   int max = 2;
1211   if (_M_mode & consume_header)
1212     max += sizeof(utf16_bom);
1213   return max;
1214 }
1215
1216 // Define members of codecvt_utf16<char32_t> base class implementation.
1217 // Converts from UTF-16 to UTF-32 (aka UCS-4).
1218
1219 __codecvt_utf16_base<char32_t>::~__codecvt_utf16_base() { }
1220
1221 codecvt_base::result
1222 __codecvt_utf16_base<char32_t>::
1223 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1224        const intern_type*& __from_next,
1225        extern_type* __to, extern_type* __to_end,
1226        extern_type*& __to_next) const
1227 {
1228   range<const char32_t> from{ __from, __from_end };
1229   range<char16_t, false> to{ __to, __to_end };
1230   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1231   __from_next = from.next;
1232   __to_next = reinterpret_cast<char*>(to.next);
1233   return res;
1234 }
1235
1236 codecvt_base::result
1237 __codecvt_utf16_base<char32_t>::
1238 do_unshift(state_type&, extern_type* __to, extern_type*,
1239            extern_type*& __to_next) const
1240 {
1241   __to_next = __to;
1242   return noconv;
1243 }
1244
1245 codecvt_base::result
1246 __codecvt_utf16_base<char32_t>::
1247 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1248       const extern_type*& __from_next,
1249       intern_type* __to, intern_type* __to_end,
1250       intern_type*& __to_next) const
1251 {
1252   range<const char16_t, false> from{ __from, __from_end };
1253   range<char32_t> to{ __to, __to_end };
1254   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1255   __from_next = reinterpret_cast<const char*>(from.next);
1256   __to_next = to.next;
1257   if (res == codecvt_base::ok && __from_next != __from_end)
1258     res = codecvt_base::error;
1259   return res;
1260 }
1261
1262 int
1263 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
1264 { return 0; } // UTF-16 is not a fixed-width encoding
1265
1266 bool
1267 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
1268 { return false; }
1269
1270 int
1271 __codecvt_utf16_base<char32_t>::
1272 do_length(state_type&, const extern_type* __from,
1273           const extern_type* __end, size_t __max) const
1274 {
1275   range<const char16_t, false> from{ __from, __end };
1276   const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1277   return reinterpret_cast<const char*>(next) - __from;
1278 }
1279
1280 int
1281 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
1282 {
1283   // A single UCS-4 character requires one or two UTF-16 code units
1284   // (so up to four chars).
1285   int max = 4;
1286   if (_M_mode & consume_header)
1287     max += sizeof(utf16_bom);
1288   return max;
1289 }
1290
1291 #ifdef _GLIBCXX_USE_WCHAR_T
1292 // Define members of codecvt_utf16<wchar_t> base class implementation.
// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
1294
1295 __codecvt_utf16_base<wchar_t>::~__codecvt_utf16_base() { }
1296
1297 codecvt_base::result
1298 __codecvt_utf16_base<wchar_t>::
1299 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1300        const intern_type*& __from_next,
1301        extern_type* __to, extern_type* __to_end,
1302        extern_type*& __to_next) const
1303 {
1304   range<char16_t, false> to{ __to, __to_end };
1305 #if __SIZEOF_WCHAR_T__ == 2
1306   range<const char16_t> from{
1307     reinterpret_cast<const char16_t*>(__from),
1308     reinterpret_cast<const char16_t*>(__from_end),
1309   };
1310   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
1311 #elif __SIZEOF_WCHAR_T__ == 4
1312   range<const char32_t> from{
1313     reinterpret_cast<const char32_t*>(__from),
1314     reinterpret_cast<const char32_t*>(__from_end),
1315   };
1316   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
1317 #else
1318   return codecvt_base::error;
1319 #endif
1320   __from_next = reinterpret_cast<const wchar_t*>(from.next);
1321   __to_next = reinterpret_cast<char*>(to.next);
1322   return res;
1323 }
1324
1325 codecvt_base::result
1326 __codecvt_utf16_base<wchar_t>::
1327 do_unshift(state_type&, extern_type* __to, extern_type*,
1328            extern_type*& __to_next) const
1329 {
1330   __to_next = __to;
1331   return noconv;
1332 }
1333
1334 codecvt_base::result
1335 __codecvt_utf16_base<wchar_t>::
1336 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1337       const extern_type*& __from_next,
1338       intern_type* __to, intern_type* __to_end,
1339       intern_type*& __to_next) const
1340 {
1341   range<const char16_t, false> from{ __from, __from_end };
1342 #if __SIZEOF_WCHAR_T__ == 2
1343   range<char16_t> to{
1344     reinterpret_cast<char16_t*>(__to),
1345     reinterpret_cast<char16_t*>(__to_end),
1346   };
1347   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
1348 #elif __SIZEOF_WCHAR_T__ == 4
1349   range<char32_t> to{
1350     reinterpret_cast<char32_t*>(__to),
1351     reinterpret_cast<char32_t*>(__to_end),
1352   };
1353   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
1354 #else
1355   return codecvt_base::error;
1356 #endif
1357   __from_next = reinterpret_cast<const char*>(from.next);
1358   __to_next = reinterpret_cast<wchar_t*>(to.next);
1359   if (res == codecvt_base::ok && __from_next != __from_end)
1360     res = codecvt_base::error;
1361   return res;
1362 }
1363
1364 int
1365 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
1366 { return 0; } // UTF-16 is not a fixed-width encoding
1367
1368 bool
1369 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
1370 { return false; }
1371
1372 int
1373 __codecvt_utf16_base<wchar_t>::
1374 do_length(state_type&, const extern_type* __from,
1375           const extern_type* __end, size_t __max) const
1376 {
1377   range<const char16_t, false> from{ __from, __end };
1378 #if __SIZEOF_WCHAR_T__ == 2
1379   const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
1380 #elif __SIZEOF_WCHAR_T__ == 4
1381   const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
1382 #endif
1383   return reinterpret_cast<const char*>(next) - __from;
1384 }
1385
1386 int
1387 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
1388 {
1389 #if __SIZEOF_WCHAR_T__ == 2
1390   int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
1391 #else
1392   int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
1393 #endif
1394   if (_M_mode & consume_header)
1395     max += sizeof(utf16_bom);
1396   return max;
1397 }
1398 #endif
1399
1400 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
1401 // Converts from UTF-8 to UTF-16.
1402
1403 __codecvt_utf8_utf16_base<char16_t>::~__codecvt_utf8_utf16_base() { }
1404
1405 codecvt_base::result
1406 __codecvt_utf8_utf16_base<char16_t>::
1407 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1408        const intern_type*& __from_next,
1409        extern_type* __to, extern_type* __to_end,
1410        extern_type*& __to_next) const
1411 {
1412   range<const char16_t> from{ __from, __from_end };
1413   range<char> to{ __to, __to_end };
1414   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1415   __from_next = from.next;
1416   __to_next = to.next;
1417   return res;
1418 }
1419
1420 codecvt_base::result
1421 __codecvt_utf8_utf16_base<char16_t>::
1422 do_unshift(state_type&, extern_type* __to, extern_type*,
1423            extern_type*& __to_next) const
1424 {
1425   __to_next = __to;
1426   return noconv;
1427 }
1428
1429 codecvt_base::result
1430 __codecvt_utf8_utf16_base<char16_t>::
1431 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1432       const extern_type*& __from_next,
1433       intern_type* __to, intern_type* __to_end,
1434       intern_type*& __to_next) const
1435 {
1436   range<const char> from{ __from, __from_end };
1437   range<char16_t> to{ __to, __to_end };
1438   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1439 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1440   mode = codecvt_mode(mode | little_endian);
1441 #endif
1442   auto res = utf16_in(from, to, _M_maxcode, mode);
1443   __from_next = from.next;
1444   __to_next = to.next;
1445   return res;
1446 }
1447
1448 int
1449 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
1450 { return 0; } // UTF-8 is not a fixed-width encoding
1451
1452 bool
1453 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
1454 { return false; }
1455
1456 int
1457 __codecvt_utf8_utf16_base<char16_t>::
1458 do_length(state_type&, const extern_type* __from,
1459           const extern_type* __end, size_t __max) const
1460 {
1461   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1462   return __end - __from;
1463 }
1464
1465 int
1466 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
1467 {
1468   // A single character can be 1 or 2 UTF-16 code units,
1469   // requiring up to 4 UTF-8 code units.
1470   int max = 4;
1471   if (_M_mode & consume_header)
1472     max += sizeof(utf8_bom);
1473   return max;
1474 }
1475
1476 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
1477 // Converts from UTF-8 to UTF-16.
1478
1479 __codecvt_utf8_utf16_base<char32_t>::~__codecvt_utf8_utf16_base() { }
1480
1481 codecvt_base::result
1482 __codecvt_utf8_utf16_base<char32_t>::
1483 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1484        const intern_type*& __from_next,
1485        extern_type* __to, extern_type* __to_end,
1486        extern_type*& __to_next) const
1487 {
1488   range<const char32_t> from{ __from, __from_end };
1489   range<char> to{ __to, __to_end };
1490   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1491   __from_next = from.next;
1492   __to_next = to.next;
1493   return res;
1494 }
1495
1496 codecvt_base::result
1497 __codecvt_utf8_utf16_base<char32_t>::
1498 do_unshift(state_type&, extern_type* __to, extern_type*,
1499            extern_type*& __to_next) const
1500 {
1501   __to_next = __to;
1502   return noconv;
1503 }
1504
1505 codecvt_base::result
1506 __codecvt_utf8_utf16_base<char32_t>::
1507 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1508       const extern_type*& __from_next,
1509       intern_type* __to, intern_type* __to_end,
1510       intern_type*& __to_next) const
1511 {
1512   range<const char> from{ __from, __from_end };
1513   range<char32_t> to{ __to, __to_end };
1514   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1515 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1516   mode = codecvt_mode(mode | little_endian);
1517 #endif
1518   auto res = utf16_in(from, to, _M_maxcode, mode);
1519   __from_next = from.next;
1520   __to_next = to.next;
1521   return res;
1522 }
1523
1524 int
1525 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
1526 { return 0; } // UTF-8 is not a fixed-width encoding
1527
1528 bool
1529 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
1530 { return false; }
1531
1532 int
1533 __codecvt_utf8_utf16_base<char32_t>::
1534 do_length(state_type&, const extern_type* __from,
1535           const extern_type* __end, size_t __max) const
1536 {
1537   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1538   return __end - __from;
1539 }
1540
1541 int
1542 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
1543 {
1544   // A single character can be 1 or 2 UTF-16 code units,
1545   // requiring up to 4 UTF-8 code units.
1546   int max = 4;
1547   if (_M_mode & consume_header)
1548     max += sizeof(utf8_bom);
1549   return max;
1550 }
1551
1552 #ifdef _GLIBCXX_USE_WCHAR_T
1553 // Define members of codecvt_utf8_utf16<wchar_t> base class implementation.
1554 // Converts from UTF-8 to UTF-16.
1555
1556 __codecvt_utf8_utf16_base<wchar_t>::~__codecvt_utf8_utf16_base() { }
1557
1558 codecvt_base::result
1559 __codecvt_utf8_utf16_base<wchar_t>::
1560 do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
1561        const intern_type*& __from_next,
1562        extern_type* __to, extern_type* __to_end,
1563        extern_type*& __to_next) const
1564 {
1565   range<const wchar_t> from{ __from, __from_end };
1566   range<char> to{ __to, __to_end };
1567   auto res = utf16_out(from, to, _M_maxcode, _M_mode);
1568   __from_next = from.next;
1569   __to_next = to.next;
1570   return res;
1571 }
1572
1573 codecvt_base::result
1574 __codecvt_utf8_utf16_base<wchar_t>::
1575 do_unshift(state_type&, extern_type* __to, extern_type*,
1576            extern_type*& __to_next) const
1577 {
1578   __to_next = __to;
1579   return noconv;
1580 }
1581
1582 codecvt_base::result
1583 __codecvt_utf8_utf16_base<wchar_t>::
1584 do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
1585       const extern_type*& __from_next,
1586       intern_type* __to, intern_type* __to_end,
1587       intern_type*& __to_next) const
1588 {
1589   range<const char> from{ __from, __from_end };
1590   range<wchar_t> to{ __to, __to_end };
1591   codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
1592 #if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
1593   mode = codecvt_mode(mode | little_endian);
1594 #endif
1595   auto res = utf16_in(from, to, _M_maxcode, mode);
1596   __from_next = from.next;
1597   __to_next = to.next;
1598   return res;
1599 }
1600
1601 int
1602 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
1603 { return 0; } // UTF-8 is not a fixed-width encoding
1604
1605 bool
1606 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
1607 { return false; }
1608
1609 int
1610 __codecvt_utf8_utf16_base<wchar_t>::
1611 do_length(state_type&, const extern_type* __from,
1612           const extern_type* __end, size_t __max) const
1613 {
1614   __end = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
1615   return __end - __from;
1616 }
1617
1618 int
1619 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
1620 {
1621   // A single character can be 1 or 2 UTF-16 code units,
1622   // requiring up to 4 UTF-8 code units.
1623   int max = 4;
1624   if (_M_mode & consume_header)
1625     max += sizeof(utf8_bom);
1626   return max;
1627 }
1628 #endif
1629
1630 inline template class __codecvt_abstract_base<char16_t, char, mbstate_t>;
1631 inline template class __codecvt_abstract_base<char32_t, char, mbstate_t>;
1632 template class codecvt_byname<char16_t, char, mbstate_t>;
1633 template class codecvt_byname<char32_t, char, mbstate_t>;
1634
1635 _GLIBCXX_END_NAMESPACE_VERSION
1636 }
1637 #endif // _GLIBCXX_USE_C99_STDINT_TR1