sysdeps/s390/s390-64/utf8-utf32-z9.c

   1 /* Conversion between UTF-8 and UTF-32 BE/internal.
   2
   3    This module uses the Z9-109 variants of the Convert Unicode
   4    instructions.
   5    Copyright (C) 1997-2015 Free Software Foundation, Inc.
   6
   7    Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
   8    Based on the work by Ulrich Drepper  <drepper@cygnus.com>, 1997.
   9
  10    Thanks to Daniel Appich who covered the relevant performance work
  11    in his diploma thesis.
  12
  13    This is free software; you can redistribute it and/or
  14    modify it under the terms of the GNU Lesser General Public
  15    License as published by the Free Software Foundation; either
  16    version 2.1 of the License, or (at your option) any later version.
  17
  18    This is distributed in the hope that it will be useful,
  19    but WITHOUT ANY WARRANTY; without even the implied warranty of
  20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21    Lesser General Public License for more details.
  22
  23    You should have received a copy of the GNU Lesser General Public
  24    License along with the GNU C Library; if not, see
  25    <http://www.gnu.org/licenses/>.  */
  26
  27 #include <dlfcn.h>
  28 #include <stdint.h>
  29 #include <unistd.h>
  30 #include <dl-procinfo.h>
  31 #include <gconv.h>
  32
  33 /* UTF-32 big endian byte order mark (U+FEFF); written by PREPARE_LOOP.  */
  34 #define BOM                     0x0000feffu
  35 /* This file defines gconv_init and gconv_end itself (see below).  */
  36 #define DEFINE_INIT             0
  37 #define DEFINE_FINI             0
  38 /* Byte counts for the UTF-8 to UTF-32 direction: FROM is UTF-8, TO is
  39    UTF-32.  The software implementation for UTF-8 still supports multibyte
  40    characters up to 6 bytes whereas the hardware variant does not.  */
  41 #define MIN_NEEDED_FROM         1
  42 #define MAX_NEEDED_FROM         6
  43 #define MIN_NEEDED_TO           4
  44 #define FROM_LOOP               from_utf8_loop
  45 #define TO_LOOP                 to_utf8_loop
  46 #define FROM_DIRECTION          (dir == from_utf8)
  47 #define ONE_DIRECTION           0
  48 #define PREPARE_LOOP /* Fetch this step's settings; emit the BOM once.  */ \
  49   enum direction dir = ((struct utf8_data *) step->__data)->dir;        \
  50   int emit_bom = ((struct utf8_data *) step->__data)->emit_bom;         \
  51   /* BOM only on invocation 0 of a conversion not marked internal.  */  \
  52   if (emit_bom && !data->__internal_use                                 \
  53       && data->__invocation_counter == 0)                               \
  54     {                                                                   \
  55       /* Emit the Byte Order Mark; bail out if 4 bytes do not fit.  */  \
  56       if (__glibc_unlikely (outbuf + 4 > outend))                             \
  57         return __GCONV_FULL_OUTPUT;                                     \
  58                                                                         \
  59       put32u (outbuf, BOM);                                             \
  60       outbuf += 4;                                                      \
  61     }
  62
  63 /* Direction of the transformation, chosen once in gconv_init.  */
  64 enum direction
  65 {
  66   illegal_dir,  /* Charset pair not handled by this module.  */
  67   to_utf8,      /* UTF-32BE or INTERNAL to UTF-8.  */
  68   from_utf8     /* UTF-8 to UTF-32, UTF-32BE or INTERNAL.  */
  69 };
  70 /* Per-step private data, allocated by gconv_init (step->__data).  */
  71 struct utf8_data
  72 {
  73   enum direction dir;   /* Which way this step converts.  */
  74   int emit_bom;         /* Nonzero if the target name is "UTF-32//".  */
  75 };
  76 /* Initialize STEP for a UTF-8 <-> UTF-32/UTF-32BE/INTERNAL conversion.
  77    Returns __GCONV_OK, __GCONV_NOCONV (pair not handled) or __GCONV_NOMEM.  */
  78 extern int gconv_init (struct __gconv_step *step);
  79 int
  80 gconv_init (struct __gconv_step *step)
  81 {
  82   /* Determine which direction.  */
  83   struct utf8_data *new_data;
  84   enum direction dir = illegal_dir;
  85   int emit_bom;
  86   int result;
  87   /* A byte order mark is only requested for the generic "UTF-32//" target.  */
  88   emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0);
  89
  90   if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0
  91       && (__strcasecmp (step->__to_name, "UTF-32//") == 0
  92           || __strcasecmp (step->__to_name, "UTF-32BE//") == 0
  93           || __strcasecmp (step->__to_name, "INTERNAL") == 0))
  94     {
  95       dir = from_utf8;
  96     }
  97   else if (__strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0
  98            && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0
  99                || __strcasecmp (step->__from_name, "INTERNAL") == 0))
 100     {
 101       dir = to_utf8;
 102     }
 103
 104   result = __GCONV_NOCONV;
 105   if (dir != illegal_dir)
 106     {
 107       new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data));
 108
 109       result = __GCONV_NOMEM;
 110       if (new_data != NULL)
 111         {
 112           new_data->dir = dir;
 113           new_data->emit_bom = emit_bom;
 114           step->__data = new_data;
 115           /* UTF-8 needs 1 to MAX_NEEDED_FROM bytes, UTF-32 always 4.  */
 116           if (dir == from_utf8)
 117             {
 118               step->__min_needed_from = MIN_NEEDED_FROM;
 119               step->__max_needed_from = MAX_NEEDED_FROM;
 120               step->__min_needed_to = MIN_NEEDED_TO;
 121               step->__max_needed_to = MIN_NEEDED_TO;
 122             }
 123           else
 124             {
 125               step->__min_needed_from = MIN_NEEDED_TO;
 126               step->__max_needed_from = MIN_NEEDED_TO;
 127               step->__min_needed_to = MIN_NEEDED_FROM;
 128               step->__max_needed_to = MAX_NEEDED_FROM;
 129             }
 130
 131           step->__stateful = 0;
 132
 133           result = __GCONV_OK;
 134         }
 135     }
 136
 137   return result;
 138 }
 139
 140 /* Free the per-step data that gconv_init stored in DATA->__data.  */
 141 extern void gconv_end (struct __gconv_step *data);
 142 void
 143 gconv_end (struct __gconv_step *data)
 144 {
 145   free (data->__data);
 146 }
 147
 148 /* The macro for the hardware loop, used for both directions.  It reads and
 149    updates inptr/outptr, may set result, and must expand inside a loop.  */
 150 #define HARDWARE_CONVERT(INSTRUCTION)                                   \
 151   {                                                                     \
 152     register const unsigned char* pInput asm ("8") = inptr;             \
 153     register unsigned long long inlen asm ("9") = inend - inptr;        \
 154     register unsigned char* pOutput asm ("10") = outptr;                \
 155     register unsigned long long outlen asm("11") = outend - outptr;     \
 156     uint64_t cc = 0;                                                    \
 157     /* Pointers and lengths are pinned to registers 8-11; %0 is the     \
 158        output pointer and %1 the input pointer of INSTRUCTION.          \
 159        NOTE(review): "jo 0b" presumably retries on condition code 3     \
 160        (partial completion) - confirm.  */                              \
 161     asm volatile (".machine push       \n\t"                            \
 162                   ".machine \"z9-109\" \n\t"                            \
 163                   "0: " INSTRUCTION "  \n\t"                            \
 164                   ".machine pop        \n\t"                            \
 165                   "   jo     0b        \n\t"                            \
 166                   "   ipm    %2        \n"                              \
 167                   : "+a" (pOutput), "+a" (pInput), "+d" (cc),           \
 168                     "+d" (outlen), "+d" (inlen)                         \
 169                   :                                                     \
 170                   : "cc", "memory");                                    \
 171     /* Hand the advanced pointers back to the caller's variables.  */   \
 172     inptr = pInput;                                                     \
 173     outptr = pOutput;                                                   \
 174     cc >>= 28;                                                          \
 175     /* 1: output full; 2: illegal input.  Both leave the loop.  */      \
 176     if (cc == 1)                                                        \
 177       {                                                                 \
 178         result = __GCONV_FULL_OUTPUT;                                   \
 179         break;                                                          \
 180       }                                                                 \
 181     else if (cc == 2)                                                   \
 182       {                                                                 \
 183         result = __GCONV_ILLEGAL_INPUT;                                 \
 184         break;                                                          \
 185       }                                                                 \
 186   }
 184
 185 /* Conversion function from UTF-8 to UTF-32 internal/BE.  The macros below
 186    are presumably consumed by the iconv/loop.c include that follows.  */
 187 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
 188 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
 189 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
 190 #define LOOPFCT                 FROM_LOOP
 191 /* cu14 if ETF3EH is set in dl_hwcap; else software copied from gconv_simple.c.  */
 192 #define BODY                                                            \
 193   {                                                                     \
 194     if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)                            \
 195       {                                                                 \
 196         HARDWARE_CONVERT ("cu14 %0, %1, 1");                            \
 197         /* No error was flagged but input remains: tell a truncated     \
 198            tail from a malformed sequence.  */                          \
 199         if (inptr != inend)                                             \
 200           {                                                             \
 201             int i;                                                      \
 202             for (i = 1; inptr + i < inend; ++i)                         \
 203               if ((inptr[i] & 0xc0) != 0x80)                            \
 204                 break;                                                  \
 205             /* Only continuation bytes follow: more input is needed.  */ \
 206             if (__glibc_likely (inptr + i == inend))                          \
 207               {                                                         \
 208                 result = __GCONV_INCOMPLETE_INPUT;                      \
 209                 break;                                                  \
 210               }                                                         \
 211             STANDARD_FROM_LOOP_ERR_HANDLER (i);                         \
 212           }                                                             \
 213         continue;                                                       \
 214       }                                                                 \
 215     /* Software fallback: decode one character per iteration.  */       \
 216     /* Next input byte.  */                                             \
 217     uint32_t ch = *inptr;                                               \
 218                                                                         \
 219     if (__glibc_likely (ch < 0x80))                                           \
 220       {                                                                 \
 221         /* One byte sequence.  */                                       \
 222         ++inptr;                                                        \
 223       }                                                                 \
 224     else                                                                \
 225       {                                                                 \
 226         uint_fast32_t cnt;                                              \
 227         uint_fast32_t i;                                                \
 228         /* The lead byte fixes the length cnt and its payload bits.  */ \
 229         if (ch >= 0xc2 && ch < 0xe0)                                    \
 230           {                                                             \
 231             /* We expect two bytes.  The first byte cannot be 0xc0 or   \
 232                0xc1, otherwise the wide character could have been       \
 233                represented using a single byte.  */                     \
 234             cnt = 2;                                                    \
 235             ch &= 0x1f;                                                 \
 236           }                                                             \
 237         else if (__glibc_likely ((ch & 0xf0) == 0xe0))                        \
 238           {                                                             \
 239             /* We expect three bytes.  */                               \
 240             cnt = 3;                                                    \
 241             ch &= 0x0f;                                                 \
 242           }                                                             \
 243         else if (__glibc_likely ((ch & 0xf8) == 0xf0))                        \
 244           {                                                             \
 245             /* We expect four bytes.  */                                \
 246             cnt = 4;                                                    \
 247             ch &= 0x07;                                                 \
 248           }                                                             \
 249         else if (__glibc_likely ((ch & 0xfc) == 0xf8))                        \
 250           {                                                             \
 251             /* We expect five bytes.  */                                \
 252             cnt = 5;                                                    \
 253             ch &= 0x03;                                                 \
 254           }                                                             \
 255         else if (__glibc_likely ((ch & 0xfe) == 0xfc))                        \
 256           {                                                             \
 257             /* We expect six bytes.  */                                 \
 258             cnt = 6;                                                    \
 259             ch &= 0x01;                                                 \
 260           }                                                             \
 261         else                                                            \
 262           {                                                             \
 263             /* Search the end of this ill-formed UTF-8 character.  This \
 264                is the next byte with (x & 0xc0) != 0x80.  */            \
 265             i = 0;                                                      \
 266             do                                                          \
 267               ++i;                                                      \
 268             while (inptr + i < inend                                    \
 269                    && (*(inptr + i) & 0xc0) == 0x80                     \
 270                    && i < 5);                                           \
 271             /* Shared error exit; i is the number of bytes passed on.  */ \
 272           errout:                                                       \
 273             STANDARD_FROM_LOOP_ERR_HANDLER (i);                         \
 274           }                                                             \
 275                                                                         \
 276         if (__glibc_unlikely (inptr + cnt > inend))                           \
 277           {                                                             \
 278             /* We don't have enough input.  But before we report        \
 279                that check that all the bytes are correct.  */           \
 280             for (i = 1; inptr + i < inend; ++i)                         \
 281               if ((inptr[i] & 0xc0) != 0x80)                            \
 282                 break;                                                  \
 283                                                                         \
 284             if (__glibc_likely (inptr + i == inend))                          \
 285               {                                                         \
 286                 result = __GCONV_INCOMPLETE_INPUT;                      \
 287                 break;                                                  \
 288               }                                                         \
 289                                                                         \
 290             goto errout;                                                \
 291           }                                                             \
 292                                                                         \
 293         /* Read the possible remaining bytes.  */                       \
 294         for (i = 1; i < cnt; ++i)                                       \
 295           {                                                             \
 296             uint32_t byte = inptr[i];                                   \
 297                                                                         \
 298             if ((byte & 0xc0) != 0x80)                                  \
 299               /* This is an illegal encoding.  */                       \
 300               break;                                                    \
 301             /* Append the six payload bits of this trail byte.  */      \
 302             ch <<= 6;                                                   \
 303             ch |= byte & 0x3f;                                          \
 304           }                                                             \
 305                                                                         \
 306         /* If i < cnt, some trail byte was not >= 0x80, < 0xc0.         \
 307            If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
 308            have been represented with fewer than cnt bytes.  */         \
 309         if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0))         \
 310           {                                                             \
 311             /* This is an illegal encoding.  */                         \
 312             goto errout;                                                \
 313           }                                                             \
 314                                                                         \
 315         inptr += cnt;                                                   \
 316       }                                                                 \
 317                                                                         \
 318     /* Now adjust the pointers and store the result.  */                \
 319     *((uint32_t *) outptr) = ch;                                        \
 320     outptr += sizeof (uint32_t);                                        \
 321   }
 322 #define LOOP_NEED_FLAGS
 322 /* Save an incomplete trailing UTF-8 sequence in *state (see UNPACK_BYTES).  */
 323 #define STORE_REST                                                      \
 324   {                                                                           \
 325     /* We store the remaining bytes while converting them into the UCS4       \
 326        format.  We can assume that the first byte in the buffer is            \
 327        correct and that it requires a larger number of bytes than there       \
 328        are in the input buffer.  */                                           \
 329     wint_t ch = **inptrp;                                                     \
 330     size_t cnt, r;                                                            \
 331     /* Start __count with the number of bytes left in this buffer.  */        \
 332     state->__count = inend - *inptrp;                                         \
 333                                                                               \
 334     if (ch >= 0xc2 && ch < 0xe0)                                              \
 335       {                                                                       \
 336         /* We expect two bytes.  The first byte cannot be 0xc0 or             \
 337            0xc1, otherwise the wide character could have been                 \
 338            represented using a single byte.  */                               \
 339         cnt = 2;                                                              \
 340         ch &= 0x1f;                                                           \
 341       }                                                                       \
 342     else if (__glibc_likely ((ch & 0xf0) == 0xe0))                            \
 343       {                                                                       \
 344         /* We expect three bytes.  */                                         \
 345         cnt = 3;                                                              \
 346         ch &= 0x0f;                                                           \
 347       }                                                                       \
 348     else if (__glibc_likely ((ch & 0xf8) == 0xf0))                            \
 349       {                                                                       \
 350         /* We expect four bytes.  */                                          \
 351         cnt = 4;                                                              \
 352         ch &= 0x07;                                                           \
 353       }                                                                       \
 354     else if (__glibc_likely ((ch & 0xfc) == 0xf8))                            \
 355       {                                                                       \
 356         /* We expect five bytes.  */                                          \
 357         cnt = 5;                                                              \
 358         ch &= 0x03;                                                           \
 359       }                                                                       \
 360     else                                                                      \
 361       {                                                                       \
 362         /* We expect six bytes.  */                                           \
 363         cnt = 6;                                                              \
 364         ch &= 0x01;                                                           \
 365       }                                                                       \
 366                                                                               \
 367     /* The first byte is already consumed.  */                                \
 368     r = cnt - 1;                                                              \
 369     while (++(*inptrp) < inend)                                               \
 370       {                                                                       \
 371         ch <<= 6;                                                             \
 372         ch |= **inptrp & 0x3f;                                                \
 373         --r;                                                                  \
 374       }                                                                       \
 375                                                                               \
 376     /* Shift for the so far missing bytes.  */                                \
 377     ch <<= r * 6;                                                             \
 378                                                                               \
 379     /* Store the number of bytes expected for the entire sequence.  */        \
 380     state->__count |= cnt << 8;                                               \
 381                                                                               \
 382     /* Store the value.  */                                                   \
 383     state->__value.__wch = ch;                                                \
 384   }
 385 /* Rebuild in bytebuf the UTF-8 bytes of the character saved in *state.  */
 386 #define UNPACK_BYTES \
 387   {                                                                           \
 388     static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };  \
 389     wint_t wch = state->__value.__wch;                                        \
 390     size_t ntotal = state->__count >> 8;                                      \
 391     /* __count: low 8 bits = stored bytes, upper bits = sequence length.  */  \
 392     inlen = state->__count & 255;                                             \
 393     /* Lead-byte marker for ntotal bytes; payload is OR'ed in at the end.  */ \
 394     bytebuf[0] = inmask[ntotal - 2];                                          \
 395                                                                               \
 396     do                                                                        \
 397       {                                                                       \
 398         if (--ntotal < inlen)                                                 \
 399           bytebuf[ntotal] = 0x80 | (wch & 0x3f);                              \
 400         wch >>= 6;                                                            \
 401       }                                                                       \
 402     while (ntotal > 1);                                                       \
 403                                                                               \
 404     bytebuf[0] |= wch;                                                        \
 405   }
 406 /* Reset the byte counters kept in state->__count.  */
 407 #define CLEAR_STATE \
 408   state->__count = 0
 409
 410 #include <iconv/loop.c>
 411
 412 /* Conversion from UTF-32 internal/BE to UTF-8.  */
 413
 414 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
 415 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
 416 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
 417 #define LOOPFCT                 TO_LOOP
 418 /* cu41 if ETF3EH is set in dl_hwcap; the software routine mimics cu41.  */
 419 #define BODY                                                    \
 420   {                                                             \
 421     if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)                    \
 422       {                                                         \
 423         HARDWARE_CONVERT ("cu41 %0, %1");                       \
 424         /* No error flagged but input remains: incomplete.  */  \
 425         if (inptr != inend)                                     \
 426           {                                                     \
 427             result = __GCONV_INCOMPLETE_INPUT;                  \
 428             break;                                              \
 429           }                                                     \
 430         continue;                                               \
 431       }                                                         \
 432     /* Software fallback: encode one 32-bit value per pass.  */ \
 433     uint32_t wc = *((const uint32_t *) inptr);                  \
 434                                                                 \
 435     if (__glibc_likely (wc <= 0x7f))                                          \
 436       {                                                         \
 437         /* Single UTF-8 char.  */                               \
 438         *outptr = (uint8_t)wc;                                  \
 439         outptr++;                                               \
 440       }                                                         \
 441     else if (wc <= 0x7ff)                                       \
 442       {                                                         \
 443         /* Two UTF-8 chars.  */                                 \
 444         if (__glibc_unlikely (outptr + 2 > outend))                           \
 445           {                                                     \
 446             /* Overflow in the output buffer.  */               \
 447             result = __GCONV_FULL_OUTPUT;                       \
 448             break;                                              \
 449           }                                                     \
 450                                                                 \
 451         outptr[0] = 0xc0;                                       \
 452         outptr[0] |= wc >> 6;                                   \
 453                                                                 \
 454         outptr[1] = 0x80;                                       \
 455         outptr[1] |= wc & 0x3f;                                 \
 456                                                                 \
 457         outptr += 2;                                            \
 458       }                                                         \
 459     else if (wc <= 0xffff)                                      \
 460       {                                                         \
 461         /* Three UTF-8 chars.  */                               \
 462         if (__glibc_unlikely (outptr + 3 > outend))                           \
 463           {                                                     \
 464             /* Overflow in the output buffer.  */               \
 465             result = __GCONV_FULL_OUTPUT;                       \
 466             break;                                              \
 467           }                                                     \
 468         outptr[0] = 0xe0;                                       \
 469         outptr[0] |= wc >> 12;                                  \
 470                                                                 \
 471         outptr[1] = 0x80;                                       \
 472         outptr[1] |= (wc >> 6) & 0x3f;                          \
 473                                                                 \
 474         outptr[2] = 0x80;                                       \
 475         outptr[2] |= wc & 0x3f;                                 \
 476                                                                 \
 477         outptr += 3;                                            \
 478       }                                                         \
 479       else if (wc <= 0x10ffff)                                  \
 480         {                                                       \
 481           /* Four UTF-8 chars.  */                              \
 482           if (__glibc_unlikely (outptr + 4 > outend))                         \
 483             {                                                   \
 484               /* Overflow in the output buffer.  */             \
 485               result = __GCONV_FULL_OUTPUT;                     \
 486               break;                                            \
 487             }                                                   \
 488           outptr[0] = 0xf0;                                     \
 489           outptr[0] |= wc >> 18;                                \
 490                                                                 \
 491           outptr[1] = 0x80;                                     \
 492           outptr[1] |= (wc >> 12) & 0x3f;                       \
 493                                                                 \
 494           outptr[2] = 0x80;                                     \
 495           outptr[2] |= (wc >> 6) & 0x3f;                        \
 496                                                                 \
 497           outptr[3] = 0x80;                                     \
 498           outptr[3] |= wc & 0x3f;                               \
 499                                                                 \
 500           outptr += 4;                                          \
 501         }                                                       \
 502       else /* wc > 0x10ffff: handed to the error handler.  */   \
 503         {                                                       \
 504           STANDARD_TO_LOOP_ERR_HANDLER (4);                     \
 505         }                                                       \
 506     inptr += 4; /* Step past the 4-byte input value.  */        \
 507   }
 508 #define LOOP_NEED_FLAGS
 509 #include <iconv/loop.c>
 510
 511 #include <iconv/skeleton.c>