sysdeps/s390/s390-64/utf8-utf16-z9.c

/* Conversion between UTF-8 and UTF-16 (big endian).
   2
   3    This module uses the Z9-109 variants of the Convert Unicode
   4    instructions.
   5    Copyright (C) 1997-2009 Free Software Foundation, Inc.
   6
   7    Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
   8    Based on the work by Ulrich Drepper  <drepper@cygnus.com>, 1997.
   9
  10    Thanks to Daniel Appich who covered the relevant performance work
  11    in his diploma thesis.
  12
  13    This is free software; you can redistribute it and/or
  14    modify it under the terms of the GNU Lesser General Public
  15    License as published by the Free Software Foundation; either
  16    version 2.1 of the License, or (at your option) any later version.
  17
  18    This is distributed in the hope that it will be useful,
  19    but WITHOUT ANY WARRANTY; without even the implied warranty of
  20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21    Lesser General Public License for more details.
  22
  23    You should have received a copy of the GNU Lesser General Public
  24    License along with the GNU C Library; if not, see
  25    <http://www.gnu.org/licenses/>.  */
  26
  27 #include <dlfcn.h>
  28 #include <stdint.h>
  29 #include <unistd.h>
  30 #include <dl-procinfo.h>
  31 #include <gconv.h>
  32
  33 /* UTF-16 big endian byte order mark.  */
  34 #define BOM_UTF16       0xfeff
  35
  36 #define DEFINE_INIT             0
  37 #define DEFINE_FINI             0
  38 #define MIN_NEEDED_FROM         1
  39 #define MAX_NEEDED_FROM         4
  40 #define MIN_NEEDED_TO           2
  41 #define MAX_NEEDED_TO           4
  42 #define FROM_LOOP               from_utf8_loop
  43 #define TO_LOOP                 to_utf8_loop
  44 #define FROM_DIRECTION          (dir == from_utf8)
  45 #define PREPARE_LOOP                                                    \
  46   enum direction dir = ((struct utf8_data *) step->__data)->dir;        \
  47   int emit_bom = ((struct utf8_data *) step->__data)->emit_bom;         \
  48                                                                         \
  49   if (emit_bom && !data->__internal_use                                 \
  50       && data->__invocation_counter == 0)                               \
  51     {                                                                   \
  52       /* Emit the UTF-16 Byte Order Mark.  */                           \
  53       if (__builtin_expect (outbuf + 2 > outend, 0))                    \
  54         return __GCONV_FULL_OUTPUT;                                     \
  55                                                                         \
  56       put16u (outbuf, BOM_UTF16);                                       \
  57       outbuf += 2;                                                      \
  58     }
  59
  60 /* Direction of the transformation.  */
  61 enum direction
  62 {
  63   illegal_dir,
  64   to_utf8,
  65   from_utf8
  66 };
  67
  68 struct utf8_data
  69 {
  70   enum direction dir;
  71   int emit_bom;
  72 };
  73
  74
  75 extern int gconv_init (struct __gconv_step *step);
  76 int
  77 gconv_init (struct __gconv_step *step)
  78 {
  79   /* Determine which direction.  */
  80   struct utf8_data *new_data;
  81   enum direction dir = illegal_dir;
  82   int emit_bom;
  83   int result;
  84
  85   emit_bom = (__strcasecmp (step->__to_name, "UTF-16//") == 0);
  86
  87   if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0
  88       && (__strcasecmp (step->__to_name, "UTF-16//") == 0
  89           || __strcasecmp (step->__to_name, "UTF-16BE//") == 0))
  90     {
  91       dir = from_utf8;
  92     }
  93   else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0
  94            && __strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0)
  95     {
  96       dir = to_utf8;
  97     }
  98
  99   result = __GCONV_NOCONV;
 100   if (dir != illegal_dir)
 101     {
 102       new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data));
 103
 104       result = __GCONV_NOMEM;
 105       if (new_data != NULL)
 106         {
 107           new_data->dir = dir;
 108           new_data->emit_bom = emit_bom;
 109           step->__data = new_data;
 110
 111           if (dir == from_utf8)
 112             {
 113               step->__min_needed_from = MIN_NEEDED_FROM;
 114               step->__max_needed_from = MIN_NEEDED_FROM;
 115               step->__min_needed_to = MIN_NEEDED_TO;
 116               step->__max_needed_to = MIN_NEEDED_TO;
 117             }
 118           else
 119             {
 120               step->__min_needed_from = MIN_NEEDED_TO;
 121               step->__max_needed_from = MIN_NEEDED_TO;
 122               step->__min_needed_to = MIN_NEEDED_FROM;
 123               step->__max_needed_to = MIN_NEEDED_FROM;
 124             }
 125
 126           step->__stateful = 0;
 127
 128           result = __GCONV_OK;
 129         }
 130     }
 131
 132   return result;
 133 }
 134
 135
 136 extern void gconv_end (struct __gconv_step *data);
 137 void
 138 gconv_end (struct __gconv_step *data)
 139 {
 140   free (data->__data);
 141 }
 142
 143 /* The macro for the hardware loop.  This is used for both
 144    directions.  */
 145 #define HARDWARE_CONVERT(INSTRUCTION)                                   \
 146   {                                                                     \
 147     register const unsigned char* pInput asm ("8") = inptr;             \
 148     register unsigned long long inlen asm ("9") = inend - inptr;        \
 149     register unsigned char* pOutput asm ("10") = outptr;                \
 150     register unsigned long long outlen asm("11") = outend - outptr;     \
 151     uint64_t cc = 0;                                                    \
 152                                                                         \
 153     asm volatile (".machine push       \n\t"                            \
 154                   ".machine \"z9-109\" \n\t"                            \
 155                   "0: " INSTRUCTION "  \n\t"                            \
 156                   ".machine pop        \n\t"                            \
 157                   "   jo     0b        \n\t"                            \
 158                   "   ipm    %2        \n"                              \
 159                   : "+a" (pOutput), "+a" (pInput), "+d" (cc),           \
 160                     "+d" (outlen), "+d" (inlen)                         \
 161                   :                                                     \
 162                   : "cc", "memory");                                    \
 163                                                                         \
 164     inptr = pInput;                                                     \
 165     outptr = pOutput;                                                   \
 166     cc >>= 28;                                                          \
 167                                                                         \
 168     if (cc == 1)                                                        \
 169       {                                                                 \
 170         result = __GCONV_FULL_OUTPUT;                                   \
 171         break;                                                          \
 172       }                                                                 \
 173     else if (cc == 2)                                                   \
 174       {                                                                 \
 175         result = __GCONV_ILLEGAL_INPUT;                                 \
 176         break;                                                          \
 177       }                                                                 \
 178   }
 179
 180 /* Conversion function from UTF-8 to UTF-16.  */
 181
 182 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
 183 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
 184 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
 185 #define LOOPFCT                 FROM_LOOP
 186 /* The software implementation is based on the code in gconv_simple.c.  */
 187 #define BODY                                                            \
 188   {                                                                     \
 189     if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)                            \
 190       {                                                                 \
 191         HARDWARE_CONVERT ("cu12 %0, %1, 1");                            \
 192                                                                         \
 193         if (inptr != inend)                                             \
 194           {                                                             \
 195             int i;                                                      \
 196             for (i = 1; inptr + i < inend; ++i)                         \
 197               if ((inptr[i] & 0xc0) != 0x80)                            \
 198                 break;                                                  \
 199                                                                 \
 200             if (__builtin_expect (inptr + i == inend, 1))               \
 201               {                                                         \
 202                 result = __GCONV_INCOMPLETE_INPUT;                      \
 203                 break;                                                  \
 204               }                                                         \
 205             STANDARD_FROM_LOOP_ERR_HANDLER (i);                         \
 206           }                                                             \
 207         continue;                                                       \
 208     }                                                                   \
 209                                                                         \
 210     /* Next input byte.  */                                             \
 211     uint16_t ch = *inptr;                                               \
 212                                                                         \
 213     if (__builtin_expect (ch < 0x80, 1))                                \
 214       {                                                                 \
 215         /* One byte sequence.  */                                       \
 216         ++inptr;                                                        \
 217       }                                                                 \
 218     else                                                                \
 219       {                                                                 \
 220         uint_fast32_t cnt;                                              \
 221         uint_fast32_t i;                                                \
 222                                                                         \
 223         if (ch >= 0xc2 && ch < 0xe0)                                    \
 224           {                                                             \
 225             /* We expect two bytes.  The first byte cannot be 0xc0      \
 226                or 0xc1, otherwise the wide character could have been    \
 227                represented using a single byte.  */                     \
 228             cnt = 2;                                                    \
 229             ch &= 0x1f;                                                 \
 230           }                                                             \
 231         else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1))             \
 232           {                                                             \
 233             /* We expect three bytes.  */                               \
 234             cnt = 3;                                                    \
 235             ch &= 0x0f;                                                 \
 236           }                                                             \
 237         else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1))             \
 238           {                                                             \
 239             /* We expect four bytes.  */                                \
 240             cnt = 4;                                                    \
 241             ch &= 0x07;                                                 \
 242           }                                                             \
 243         else                                                            \
 244           {                                                             \
 245             /* Search the end of this ill-formed UTF-8 character.  This \
 246                is the next byte with (x & 0xc0) != 0x80.  */            \
 247             i = 0;                                                      \
 248             do                                                          \
 249               ++i;                                                      \
 250             while (inptr + i < inend                                    \
 251                    && (*(inptr + i) & 0xc0) == 0x80                     \
 252                    && i < 5);                                           \
 253                                                                         \
 254           errout:                                                       \
 255             STANDARD_FROM_LOOP_ERR_HANDLER (i);                         \
 256           }                                                             \
 257                                                                         \
 258         if (__builtin_expect (inptr + cnt > inend, 0))                  \
 259           {                                                             \
 260             /* We don't have enough input.  But before we report        \
 261                that check that all the bytes are correct.  */           \
 262             for (i = 1; inptr + i < inend; ++i)                         \
 263               if ((inptr[i] & 0xc0) != 0x80)                            \
 264                 break;                                                  \
 265                                                                         \
 266             if (__builtin_expect (inptr + i == inend, 1))               \
 267               {                                                         \
 268                 result = __GCONV_INCOMPLETE_INPUT;                      \
 269                 break;                                                  \
 270               }                                                         \
 271                                                                         \
 272             goto errout;                                                \
 273           }                                                             \
 274                                                                         \
 275         if (cnt == 4)                                                   \
 276           {                                                             \
 277             /* For 4 byte UTF-8 chars two UTF-16 chars (high and        \
 278                low) are needed.  */                                     \
 279             uint16_t zabcd, high, low;                                  \
 280                                                                         \
 281             if (__builtin_expect (outptr + 4 > outend, 0))              \
 282               {                                                         \
 283                 /* Overflow in the output buffer.  */                   \
 284                 result = __GCONV_FULL_OUTPUT;                           \
 285                 break;                                                  \
 286               }                                                         \
 287                                                                         \
 288             /* See Principles of Operations cu12.  */                   \
 289             zabcd = (((inptr[0] & 0x7) << 2) |                          \
 290                      ((inptr[1] & 0x30) >> 4)) - 1;                     \
 291                                                                         \
 292             /* z-bit must be zero after subtracting 1.  */              \
 293             if (zabcd & 0x10)                                           \
 294               STANDARD_FROM_LOOP_ERR_HANDLER (4)                        \
 295                                                                         \
 296             high = (uint16_t)(0xd8 << 8);       /* high surrogate id */ \
 297             high |= zabcd << 6;                         /* abcd bits */ \
 298             high |= (inptr[1] & 0xf) << 2;              /* efgh bits */ \
 299             high |= (inptr[2] & 0x30) >> 4;               /* ij bits */ \
 300                                                                         \
 301             low = (uint16_t)(0xdc << 8);         /* low surrogate id */ \
 302             low |= ((uint16_t)inptr[2] & 0xc) << 6;       /* kl bits */ \
 303             low |= (inptr[2] & 0x3) << 6;                 /* mn bits */ \
 304             low |= inptr[3] & 0x3f;                   /* opqrst bits */ \
 305                                                                         \
 306             put16 (outptr, high);                                       \
 307             outptr += 2;                                                \
 308             put16 (outptr, low);                                        \
 309             outptr += 2;                                                \
 310             inptr += 4;                                                 \
 311             continue;                                                   \
 312           }                                                             \
 313         else                                                            \
 314           {                                                             \
 315             /* Read the possible remaining bytes.  */                   \
 316             for (i = 1; i < cnt; ++i)                                   \
 317               {                                                         \
 318                 uint16_t byte = inptr[i];                               \
 319                                                                         \
 320                 if ((byte & 0xc0) != 0x80)                              \
 321                   /* This is an illegal encoding.  */                   \
 322                   break;                                                \
 323                                                                         \
 324                 ch <<= 6;                                               \
 325                 ch |= byte & 0x3f;                                      \
 326               }                                                         \
 327             inptr += cnt;                                               \
 328                                                                         \
 329           }                                                             \
 330       }                                                                 \
 331     /* Now adjust the pointers and store the result.  */                \
 332     *((uint16_t *) outptr) = ch;                                        \
 333     outptr += sizeof (uint16_t);                                        \
 334   }
 335
 336 #define LOOP_NEED_FLAGS
 337 #include <iconv/loop.c>
 338
 339 /* Conversion from UTF-16 to UTF-8.  */
 340
 341 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
 342 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
 343 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
 344 #define LOOPFCT                 TO_LOOP
 345 /* The software routine is based on the functionality of the S/390
 346    hardware instruction (cu21) as described in the Principles of
 347    Operation.  */
 348 #define BODY                                                            \
 349   {                                                                     \
 350     /* The hardware instruction currently fails to report an error for  \
 351        isolated low surrogates so we have to disable the instruction    \
 352        until this gets resolved.  */                                    \
 353     if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */                  \
 354       {                                                                 \
 355         HARDWARE_CONVERT ("cu21 %0, %1, 1");                            \
 356         if (inptr != inend)                                             \
 357           {                                                             \
 358             /* Check if the third byte is                               \
 359                a valid start of a UTF-16 surrogate.  */                 \
 360             if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc)        \
 361               STANDARD_TO_LOOP_ERR_HANDLER (3);                         \
 362                                                                         \
 363             result = __GCONV_INCOMPLETE_INPUT;                          \
 364             break;                                                      \
 365           }                                                             \
 366         continue;                                                       \
 367       }                                                                 \
 368                                                                         \
 369     uint16_t c = get16 (inptr);                                         \
 370                                                                         \
 371     if (__builtin_expect (c <= 0x007f, 1))                              \
 372       {                                                                 \
 373         /* Single byte UTF-8 char.  */                                  \
 374         *outptr = c & 0xff;                                             \
 375         outptr++;                                                       \
 376       }                                                                 \
 377     else if (c >= 0x0080 && c <= 0x07ff)                                \
 378       {                                                                 \
 379         /* Two byte UTF-8 char.  */                                     \
 380                                                                         \
 381         if (__builtin_expect (outptr + 2 > outend, 0))                  \
 382           {                                                             \
 383             /* Overflow in the output buffer.  */                       \
 384             result = __GCONV_FULL_OUTPUT;                               \
 385             break;                                                      \
 386           }                                                             \
 387                                                                         \
 388         outptr[0] = 0xc0;                                               \
 389         outptr[0] |= c >> 6;                                            \
 390                                                                         \
 391         outptr[1] = 0x80;                                               \
 392         outptr[1] |= c & 0x3f;                                          \
 393                                                                         \
 394         outptr += 2;                                                    \
 395       }                                                                 \
 396     else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff)                \
 397       {                                                                 \
 398         /* Three byte UTF-8 char.  */                                   \
 399                                                                         \
 400         if (__builtin_expect (outptr + 3 > outend, 0))                  \
 401           {                                                             \
 402             /* Overflow in the output buffer.  */                       \
 403             result = __GCONV_FULL_OUTPUT;                               \
 404             break;                                                      \
 405           }                                                             \
 406         outptr[0] = 0xe0;                                               \
 407         outptr[0] |= c >> 12;                                           \
 408                                                                         \
 409         outptr[1] = 0x80;                                               \
 410         outptr[1] |= (c >> 6) & 0x3f;                                   \
 411                                                                         \
 412         outptr[2] = 0x80;                                               \
 413         outptr[2] |= c & 0x3f;                                          \
 414                                                                         \
 415         outptr += 3;                                                    \
 416       }                                                                 \
 417     else if (c >= 0xd800 && c <= 0xdbff)                                \
 418       {                                                                 \
 419         /* Four byte UTF-8 char.  */                                    \
 420         uint16_t low, uvwxy;                                            \
 421                                                                         \
 422         if (__builtin_expect (outptr + 4 > outend, 0))                  \
 423           {                                                             \
 424             /* Overflow in the output buffer.  */                       \
 425             result = __GCONV_FULL_OUTPUT;                               \
 426             break;                                                      \
 427           }                                                             \
 428         inptr += 2;                                                     \
 429         if (__builtin_expect (inptr + 2 > inend, 0))                    \
 430           {                                                             \
 431             result = __GCONV_INCOMPLETE_INPUT;                          \
 432             break;                                                      \
 433           }                                                             \
 434                                                                         \
 435         low = get16 (inptr);                                            \
 436                                                                         \
 437         if ((low & 0xfc00) != 0xdc00)                                   \
 438           {                                                             \
 439             inptr -= 2;                                                 \
 440             STANDARD_TO_LOOP_ERR_HANDLER (2);                           \
 441           }                                                             \
 442         uvwxy = ((c >> 6) & 0xf) + 1;                                   \
 443         outptr[0] = 0xf0;                                               \
 444         outptr[0] |= uvwxy >> 2;                                        \
 445                                                                         \
 446         outptr[1] = 0x80;                                               \
 447         outptr[1] |= (uvwxy << 4) & 0x30;                               \
 448         outptr[1] |= (c >> 2) & 0x0f;                                   \
 449                                                                         \
 450         outptr[2] = 0x80;                                               \
 451         outptr[2] |= (c & 0x03) << 4;                                   \
 452         outptr[2] |= (low >> 6) & 0x0f;                                 \
 453                                                                         \
 454         outptr[3] = 0x80;                                               \
 455         outptr[3] |= low & 0x3f;                                        \
 456                                                                         \
 457         outptr += 4;                                                    \
 458       }                                                                 \
 459     else                                                                \
 460       {                                                                 \
 461         STANDARD_TO_LOOP_ERR_HANDLER (2);                               \
 462       }                                                                 \
 463     inptr += 2;                                                         \
 464   }
 465 #define LOOP_NEED_FLAGS
 466 #include <iconv/loop.c>
 467
 468 #include <iconv/skeleton.c>