sysdeps/s390/s390-64/utf8-utf32-z9.c

   1 /* Conversion between UTF-8 and UTF-32 BE/internal.
   2
   3    This module uses the Z9-109 variants of the Convert Unicode
   4    instructions.
   5    Copyright (C) 1997-2014 Free Software Foundation, Inc.
   6
   7    Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
   8    Based on the work by Ulrich Drepper  <drepper@cygnus.com>, 1997.
   9
  10    Thanks to Daniel Appich who covered the relevant performance work
  11    in his diploma thesis.
  12
  13    This is free software; you can redistribute it and/or
  14    modify it under the terms of the GNU Lesser General Public
  15    License as published by the Free Software Foundation; either
  16    version 2.1 of the License, or (at your option) any later version.
  17
  18    This is distributed in the hope that it will be useful,
  19    but WITHOUT ANY WARRANTY; without even the implied warranty of
  20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21    Lesser General Public License for more details.
  22
  23    You should have received a copy of the GNU Lesser General Public
  24    License along with the GNU C Library; if not, see
  25    <http://www.gnu.org/licenses/>.  */
  26
  27 #include <dlfcn.h>
  28 #include <stdint.h>
  29 #include <unistd.h>
  30 #include <dl-procinfo.h>
  31 #include <gconv.h>
  32
   33 /* UTF-32 big endian byte order mark, written by PREPARE_LOOP.  */
   34 #define BOM                     0x0000feffu
   35 /* gconv_init/gconv_end are hand-written below (presumably why 0).  */
   36 #define DEFINE_INIT             0
   37 #define DEFINE_FINI             0
   38 /* These definitions apply to the UTF-8 to UTF-32 direction, so FROM
   39    is the UTF-8 side.  The software implementation for UTF-8 still
   40    supports characters of up to 6 bytes; the hardware variant does not.  */
   41 #define MIN_NEEDED_FROM         1
   42 #define MAX_NEEDED_FROM         6
   43 #define MIN_NEEDED_TO           4
   44 #define FROM_LOOP               from_utf8_loop
   45 #define TO_LOOP                 to_utf8_loop
   46 #define FROM_DIRECTION          (dir == from_utf8)
   47 #define PREPARE_LOOP  /* Fetch step state; maybe emit the BOM.  */      \
   48   enum direction dir = ((struct utf8_data *) step->__data)->dir;        \
   49   int emit_bom = ((struct utf8_data *) step->__data)->emit_bom;         \
   50   /* BOM only when not used internally and the counter is still 0.  */  \
   51   if (emit_bom && !data->__internal_use                                 \
   52       && data->__invocation_counter == 0)                               \
   53     {                                                                   \
   54       /* Emit the Byte Order Mark if 4 output bytes are free.  */       \
   55       if (__builtin_expect (outbuf + 4 > outend, 0))                    \
   56         return __GCONV_FULL_OUTPUT;                                     \
   57                                                                         \
   58       put32u (outbuf, BOM);                                             \
   59       outbuf += 4;                                                      \
   60     }
  61
   62 /* Direction of the transformation, chosen once in gconv_init.  */
   63 enum direction
   64 {
   65   illegal_dir,  /* No supported charset pair matched.  */
   66   to_utf8,      /* UTF-32BE or INTERNAL to UTF-8.  */
   67   from_utf8     /* UTF-8 to UTF-32, UTF-32BE or INTERNAL.  */
   68 };
   69 /* Per-step state; allocated in gconv_init, freed in gconv_end.  */
   70 struct utf8_data
   71 {
   72   enum direction dir;   /* Which of the two loops this step runs.  */
   73   int emit_bom;         /* Nonzero if the target name was "UTF-32//".  */
   74 };
  75
  76
  77 extern int gconv_init (struct __gconv_step *step);
  78 int
  79 gconv_init (struct __gconv_step *step)
  80 {
  81   /* Determine which direction.  */
  82   struct utf8_data *new_data;
  83   enum direction dir = illegal_dir;
  84   int emit_bom;
  85   int result;
  86
  87   emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0);
  88
  89   if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0
  90       && (__strcasecmp (step->__to_name, "UTF-32//") == 0
  91           || __strcasecmp (step->__to_name, "UTF-32BE//") == 0
  92           || __strcasecmp (step->__to_name, "INTERNAL") == 0))
  93     {
  94       dir = from_utf8;
  95     }
  96   else if (__strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0
  97            && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0
  98                || __strcasecmp (step->__from_name, "INTERNAL") == 0))
  99     {
 100       dir = to_utf8;
 101     }
 102
 103   result = __GCONV_NOCONV;
 104   if (dir != illegal_dir)
 105     {
 106       new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data));
 107
 108       result = __GCONV_NOMEM;
 109       if (new_data != NULL)
 110         {
 111           new_data->dir = dir;
 112           new_data->emit_bom = emit_bom;
 113           step->__data = new_data;
 114
 115           if (dir == from_utf8)
 116             {
 117               step->__min_needed_from = MIN_NEEDED_FROM;
 118               step->__max_needed_from = MIN_NEEDED_FROM;
 119               step->__min_needed_to = MIN_NEEDED_TO;
 120               step->__max_needed_to = MIN_NEEDED_TO;
 121             }
 122           else
 123             {
 124               step->__min_needed_from = MIN_NEEDED_TO;
 125               step->__max_needed_from = MIN_NEEDED_TO;
 126               step->__min_needed_to = MIN_NEEDED_FROM;
 127               step->__max_needed_to = MIN_NEEDED_FROM;
 128             }
 129
 130           step->__stateful = 0;
 131
 132           result = __GCONV_OK;
 133         }
 134     }
 135
 136   return result;
 137 }
 138
 139
 140 extern void gconv_end (struct __gconv_step *data);
 141 void
 142 gconv_end (struct __gconv_step *data)
 143 {
 144   free (data->__data);
 145 }
  146 /* Hardware loop shared by both directions.  INSTRUCTION is the asm
  147    text of the convert instruction: %0 is the output pointer (r10,
  148    length in r11) and %1 the input pointer (r8, length in r9).  */
  149 #define HARDWARE_CONVERT(INSTRUCTION)                                   \
  150   {                                                                     \
  151     register const unsigned char* pInput asm ("8") = inptr;             \
  152     register unsigned long long inlen asm ("9") = inend - inptr;        \
  153     register unsigned char* pOutput asm ("10") = outptr;                \
  154     register unsigned long long outlen asm("11") = outend - outptr;     \
  155     uint64_t cc = 0;                                                    \
  156     /* Repeat INSTRUCTION while jo is taken, then read CC via ipm.  */  \
  157     asm volatile (".machine push       \n\t"                            \
  158                   ".machine \"z9-109\" \n\t"                            \
  159                   "0: " INSTRUCTION "  \n\t"                            \
  160                   ".machine pop        \n\t"                            \
  161                   "   jo     0b        \n\t"                            \
  162                   "   ipm    %2        \n"                              \
  163                   : "+a" (pOutput), "+a" (pInput), "+d" (cc),           \
  164                     "+d" (outlen), "+d" (inlen)                         \
  165                   :                                                     \
  166                   : "cc", "memory");                                    \
  167     /* Hand the advanced pointers back to the loop variables.  */       \
  168     inptr = pInput;                                                     \
  169     outptr = pOutput;                                                   \
  170     cc >>= 28;                                                          \
  171     /* CC 1: output full; CC 2: bad input.  Both leave the loop.  */    \
  172     if (cc == 1)                                                        \
  173       {                                                                 \
  174         result = __GCONV_FULL_OUTPUT;                                   \
  175         break;                                                          \
  176       }                                                                 \
  177     else if (cc == 2)                                                   \
  178       {                                                                 \
  179         result = __GCONV_ILLEGAL_INPUT;                                 \
  180         break;                                                          \
  181       }                                                                 \
  182   }
 183
  184 /* Conversion function from UTF-8 to UTF-32 internal/BE.  These macros
  185    are presumably read by the <iconv/loop.c> inclusion further down.  */
  186 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
  187 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
  188 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
  189 #define LOOPFCT                 FROM_LOOP
  190 /* Loop body; the software routine is copied from gconv_simple.c.  */
  191 #define BODY                                                            \
  192   {                                                                     \
  193     if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)                            \
  194       {                                                                 \
  195         HARDWARE_CONVERT ("cu14 %0, %1, 1");                            \
  196         /* Leftover input is an incomplete or an invalid character.  */ \
  197         if (inptr != inend)                                             \
  198           {                                                             \
  199             int i;                                                      \
  200             for (i = 1; inptr + i < inend; ++i)                         \
  201               if ((inptr[i] & 0xc0) != 0x80)                            \
  202                 break;                                                  \
  203                                                                         \
  204             if (__builtin_expect (inptr + i == inend, 1))               \
  205               {                                                         \
  206                 result = __GCONV_INCOMPLETE_INPUT;                      \
  207                 break;                                                  \
  208               }                                                         \
  209             STANDARD_FROM_LOOP_ERR_HANDLER (i);                         \
  210           }                                                             \
  211         continue;                                                       \
  212       }                                                                 \
  213     /* Software path, used when HWCAP_S390_ETF3EH is not set.  */       \
  214     /* Next input byte.  */                                             \
  215     uint32_t ch = *inptr;                                               \
  216                                                                         \
  217     if (__builtin_expect (ch < 0x80, 1))                                \
  218       {                                                                 \
  219         /* One byte sequence.  */                                       \
  220         ++inptr;                                                        \
  221       }                                                                 \
  222     else                                                                \
  223       {                                                                 \
  224         uint_fast32_t cnt;                                              \
  225         uint_fast32_t i;                                                \
  226                                                                         \
  227         if (ch >= 0xc2 && ch < 0xe0)                                    \
  228           {                                                             \
  229             /* We expect two bytes.  The first byte cannot be 0xc0 or   \
  230                0xc1, otherwise the wide character could have been       \
  231                represented using a single byte.  */                     \
  232             cnt = 2;                                                    \
  233             ch &= 0x1f;                                                 \
  234           }                                                             \
  235         else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1))             \
  236           {                                                             \
  237             /* We expect three bytes.  */                               \
  238             cnt = 3;                                                    \
  239             ch &= 0x0f;                                                 \
  240           }                                                             \
  241         else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1))             \
  242           {                                                             \
  243             /* We expect four bytes.  */                                \
  244             cnt = 4;                                                    \
  245             ch &= 0x07;                                                 \
  246           }                                                             \
  247         else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1))             \
  248           {                                                             \
  249             /* We expect five bytes.  */                                \
  250             cnt = 5;                                                    \
  251             ch &= 0x03;                                                 \
  252           }                                                             \
  253         else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1))             \
  254           {                                                             \
  255             /* We expect six bytes.  */                                 \
  256             cnt = 6;                                                    \
  257             ch &= 0x01;                                                 \
  258           }                                                             \
  259         else                                                            \
  260           {                                                             \
  261             /* Search the end of this ill-formed UTF-8 character.  This \
  262                is the next byte with (x & 0xc0) != 0x80.  */            \
  263             i = 0;                                                      \
  264             do                                                          \
  265               ++i;                                                      \
  266             while (inptr + i < inend                                    \
  267                    && (*(inptr + i) & 0xc0) == 0x80                     \
  268                    && i < 5);                                           \
  269                                                                         \
  270           errout:                                                       \
  271             STANDARD_FROM_LOOP_ERR_HANDLER (i);                         \
  272           }                                                             \
  273                                                                         \
  274         if (__builtin_expect (inptr + cnt > inend, 0))                  \
  275           {                                                             \
  276             /* We don't have enough input.  But before we report        \
  277                that check that all the bytes are correct.  */           \
  278             for (i = 1; inptr + i < inend; ++i)                         \
  279               if ((inptr[i] & 0xc0) != 0x80)                            \
  280                 break;                                                  \
  281                                                                         \
  282             if (__builtin_expect (inptr + i == inend, 1))               \
  283               {                                                         \
  284                 result = __GCONV_INCOMPLETE_INPUT;                      \
  285                 break;                                                  \
  286               }                                                         \
  287                                                                         \
  288             goto errout;                                                \
  289           }                                                             \
  290                                                                         \
  291         /* Read the possible remaining bytes.  */                       \
  292         for (i = 1; i < cnt; ++i)                                       \
  293           {                                                             \
  294             uint32_t byte = inptr[i];                                   \
  295                                                                         \
  296             if ((byte & 0xc0) != 0x80)                                  \
  297               /* This is an illegal encoding.  */                       \
  298               break;                                                    \
  299                                                                         \
  300             ch <<= 6;                                                   \
  301             ch |= byte & 0x3f;                                          \
  302           }                                                             \
  303                                                                         \
  304         /* If i < cnt, some trail byte was not >= 0x80, < 0xc0.         \
  305            If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
  306            have been represented with fewer than cnt bytes.  */         \
  307         if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0))         \
  308           {                                                             \
  309             /* This is an illegal encoding.  */                         \
  310             goto errout;                                                \
  311           }                                                             \
  312                                                                         \
  313         inptr += cnt;                                                   \
  314       }                                                                 \
  315                                                                         \
  316     /* Now adjust the pointers and store the result.  */                \
  317     *((uint32_t *) outptr) = ch;                                        \
  318     outptr += sizeof (uint32_t);                                        \
  319   }
  320 #define LOOP_NEED_FLAGS
  321 /* Save an incomplete trailing UTF-8 character in state.  */
  322 #define STORE_REST                                                      \
  323   {                                                                           \
  324     /* We store the remaining bytes while converting them into the UCS4       \
  325        format.  We can assume that the first byte in the buffer is            \
  326        correct and that it requires a larger number of bytes than there       \
  327        are in the input buffer.  */                                           \
  328     wint_t ch = **inptrp;                                                     \
  329     size_t cnt, r;                                                            \
  330     /* The low byte of __count holds the number of bytes saved.  */           \
  331     state->__count = inend - *inptrp;                                         \
  332                                                                               \
  333     if (ch >= 0xc2 && ch < 0xe0)                                              \
  334       {                                                                       \
  335         /* We expect two bytes.  The first byte cannot be 0xc0 or             \
  336            0xc1, otherwise the wide character could have been                 \
  337            represented using a single byte.  */                               \
  338         cnt = 2;                                                              \
  339         ch &= 0x1f;                                                           \
  340       }                                                                       \
  341     else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1))                       \
  342       {                                                                       \
  343         /* We expect three bytes.  */                                         \
  344         cnt = 3;                                                              \
  345         ch &= 0x0f;                                                           \
  346       }                                                                       \
  347     else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1))                       \
  348       {                                                                       \
  349         /* We expect four bytes.  */                                          \
  350         cnt = 4;                                                              \
  351         ch &= 0x07;                                                           \
  352       }                                                                       \
  353     else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1))                       \
  354       {                                                                       \
  355         /* We expect five bytes.  */                                          \
  356         cnt = 5;                                                              \
  357         ch &= 0x03;                                                           \
  358       }                                                                       \
  359     else                                                                      \
  360       {                                                                       \
  361         /* We expect six bytes.  */                                           \
  362         cnt = 6;                                                              \
  363         ch &= 0x01;                                                           \
  364       }                                                                       \
  365                                                                               \
  366     /* The first byte is already consumed.  */                                \
  367     r = cnt - 1;                                                              \
  368     while (++(*inptrp) < inend)                                               \
  369       {                                                                       \
  370         ch <<= 6;                                                             \
  371         ch |= **inptrp & 0x3f;                                                \
  372         --r;                                                                  \
  373       }                                                                       \
  374                                                                               \
  375     /* Shift for the so far missing bytes.  */                                \
  376     ch <<= r * 6;                                                             \
  377                                                                               \
  378     /* Store the number of bytes expected for the entire sequence.  */        \
  379     state->__count |= cnt << 8;                                               \
  380                                                                               \
  381     /* Store the value.  */                                                   \
  382     state->__value.__wch = ch;                                                \
  383   }
  384 /* Rebuild in bytebuf the UTF-8 bytes of the partial character in state.  */
  385 #define UNPACK_BYTES \
  386   {                                                                           \
  387     static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };  \
  388     wint_t wch = state->__value.__wch;                                        \
  389     size_t ntotal = state->__count >> 8;                                      \
  390     /* Low byte of __count: bytes stored; the rest: sequence length.  */      \
  391     inlen = state->__count & 255;                                             \
  392     /* Lead byte marker for an ntotal-byte sequence.  */                      \
  393     bytebuf[0] = inmask[ntotal - 2];                                          \
  394     /* Walk back 6 bits at a time; only positions below inlen are filled.  */ \
  395     do                                                                        \
  396       {                                                                       \
  397         if (--ntotal < inlen)                                                 \
  398           bytebuf[ntotal] = 0x80 | (wch & 0x3f);                              \
  399         wch >>= 6;                                                            \
  400       }                                                                       \
  401     while (ntotal > 1);                                                       \
  402     /* The remaining bits of wch belong to the lead byte.  */                 \
  403     bytebuf[0] |= wch;                                                        \
  404   }
  405 /* Drop any partial character recorded in state->__count.  */
  406 #define CLEAR_STATE \
  407   state->__count = 0
 408
 409 #include <iconv/loop.c>
 410
  411 /* Conversion from UTF-32 internal/BE to UTF-8: the input and output
  412    sizes are those of the first loop with the roles swapped.  */
  413 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
  414 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
  415 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
  416 #define LOOPFCT                 TO_LOOP
 417 /* The software routine mimics the S/390 cu41 instruction.  */
 418 #define BODY                                                    \
 419   {                                                             \
 420     if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)                    \
 421       {                                                         \
 422         HARDWARE_CONVERT ("cu41 %0, %1");                       \
 423                                                                 \
 424         if (inptr != inend)                                     \
 425           {                                                     \
 426             result = __GCONV_INCOMPLETE_INPUT;                  \
 427             break;                                              \
 428           }                                                     \
 429         continue;                                               \
 430       }                                                         \
 431                                                                 \
 432     uint32_t wc = *((const uint32_t *) inptr);                  \
 433                                                                 \
 434     if (__builtin_expect (wc <= 0x7f, 1))                       \
 435       {                                                         \
 436         /* Single UTF-8 char.  */                               \
 437         *outptr = (uint8_t)wc;                                  \
 438         outptr++;                                               \
 439       }                                                         \
 440     else if (wc <= 0x7ff)                                       \
 441       {                                                         \
 442         /* Two UTF-8 chars.  */                                 \
 443         if (__builtin_expect (outptr + 2 > outend, 0))          \
 444           {                                                     \
 445             /* Overflow in the output buffer.  */               \
 446             result = __GCONV_FULL_OUTPUT;                       \
 447             break;                                              \
 448           }                                                     \
 449                                                                 \
 450         outptr[0] = 0xc0;                                       \
 451         outptr[0] |= wc >> 6;                                   \
 452                                                                 \
 453         outptr[1] = 0x80;                                       \
 454         outptr[1] |= wc & 0x3f;                                 \
 455                                                                 \
 456         outptr += 2;                                            \
 457       }                                                         \
 458     else if (wc <= 0xffff)                                      \
 459       {                                                         \
 460         /* Three UTF-8 chars.  */                               \
 461         if (__builtin_expect (outptr + 3 > outend, 0))          \
 462           {                                                     \
 463             /* Overflow in the output buffer.  */               \
 464             result = __GCONV_FULL_OUTPUT;                       \
 465             break;                                              \
 466           }                                                     \
 467         outptr[0] = 0xe0;                                       \
 468         outptr[0] |= wc >> 12;                                  \
 469                                                                 \
 470         outptr[1] = 0x80;                                       \
 471         outptr[1] |= (wc >> 6) & 0x3f;                          \
 472                                                                 \
 473         outptr[2] = 0x80;                                       \
 474         outptr[2] |= wc & 0x3f;                                 \
 475                                                                 \
 476         outptr += 3;                                            \
 477       }                                                         \
 478       else if (wc <= 0x10ffff)                                  \
 479         {                                                       \
 480           /* Four UTF-8 chars.  */                              \
 481           if (__builtin_expect (outptr + 4 > outend, 0))        \
 482             {                                                   \
 483               /* Overflow in the output buffer.  */             \
 484               result = __GCONV_FULL_OUTPUT;                     \
 485               break;                                            \
 486             }                                                   \
 487           outptr[0] = 0xf0;                                     \
 488           outptr[0] |= wc >> 18;                                \
 489                                                                 \
 490           outptr[1] = 0x80;                                     \
 491           outptr[1] |= (wc >> 12) & 0x3f;                       \
 492                                                                 \
 493           outptr[2] = 0x80;                                     \
 494           outptr[2] |= (wc >> 6) & 0x3f;                        \
 495                                                                 \
 496           outptr[3] = 0x80;                                     \
 497           outptr[3] |= wc & 0x3f;                               \
 498                                                                 \
 499           outptr += 4;                                          \
 500         }                                                       \
 501       else                                                      \
 502         {                                                       \
 503           STANDARD_TO_LOOP_ERR_HANDLER (4);                     \
 504         }                                                       \
 505     inptr += 4;                                                 \
 506   }
 507 #define LOOP_NEED_FLAGS
 508 #include <iconv/loop.c>
 509
 510 #include <iconv/skeleton.c>