sysdeps/s390/s390-64/utf8-utf16-z9.c

   1 /* Conversion between UTF-8 and UTF-16 (big endian).
   2
   3    This module uses the Z9-109 variants of the Convert Unicode
   4    instructions.
   5    Copyright (C) 1997-2015 Free Software Foundation, Inc.
   6
   7    Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
   8    Based on the work by Ulrich Drepper  <drepper@cygnus.com>, 1997.
   9
  10    Thanks to Daniel Appich who covered the relevant performance work
  11    in his diploma thesis.
  12
  13    This is free software; you can redistribute it and/or
  14    modify it under the terms of the GNU Lesser General Public
  15    License as published by the Free Software Foundation; either
  16    version 2.1 of the License, or (at your option) any later version.
  17
  18    This is distributed in the hope that it will be useful,
  19    but WITHOUT ANY WARRANTY; without even the implied warranty of
  20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21    Lesser General Public License for more details.
  22
  23    You should have received a copy of the GNU Lesser General Public
  24    License along with the GNU C Library; if not, see
  25    <http://www.gnu.org/licenses/>.  */
  26
  27 #include <dlfcn.h>
  28 #include <stdint.h>
  29 #include <unistd.h>
  30 #include <dl-procinfo.h>
  31 #include <gconv.h>
  32
  33 /* UTF-16 big endian byte order mark.  */
  34 #define BOM_UTF16       0xfeff
  35
  36 #define DEFINE_INIT             0
  37 #define DEFINE_FINI             0
  38 #define MIN_NEEDED_FROM         1
  39 #define MAX_NEEDED_FROM         4
  40 #define MIN_NEEDED_TO           2
  41 #define MAX_NEEDED_TO           4
  42 #define FROM_LOOP               from_utf8_loop
  43 #define TO_LOOP                 to_utf8_loop
  44 #define FROM_DIRECTION          (dir == from_utf8)
  45 #define ONE_DIRECTION           0
  46 #define PREPARE_LOOP                                                    \
  47   enum direction dir = ((struct utf8_data *) step->__data)->dir;        \
  48   int emit_bom = ((struct utf8_data *) step->__data)->emit_bom;         \
  49                                                                         \
  50   if (emit_bom && !data->__internal_use                                 \
  51       && data->__invocation_counter == 0)                               \
  52     {                                                                   \
  53       /* Emit the UTF-16 Byte Order Mark.  */                           \
  54       if (__glibc_unlikely (outbuf + 2 > outend))                             \
  55         return __GCONV_FULL_OUTPUT;                                     \
  56                                                                         \
  57       put16u (outbuf, BOM_UTF16);                                       \
  58       outbuf += 2;                                                      \
  59     }
  60
  61 /* Direction of the transformation.  */
  62 enum direction
  63 {
  64   illegal_dir,
  65   to_utf8,
  66   from_utf8
  67 };
  68
  69 struct utf8_data
  70 {
  71   enum direction dir;
  72   int emit_bom;
  73 };
  74
  75
  76 extern int gconv_init (struct __gconv_step *step);
  77 int
  78 gconv_init (struct __gconv_step *step)
  79 {
  80   /* Determine which direction.  */
  81   struct utf8_data *new_data;
  82   enum direction dir = illegal_dir;
  83   int emit_bom;
  84   int result;
  85
  86   emit_bom = (__strcasecmp (step->__to_name, "UTF-16//") == 0);
  87
  88   if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0
  89       && (__strcasecmp (step->__to_name, "UTF-16//") == 0
  90           || __strcasecmp (step->__to_name, "UTF-16BE//") == 0))
  91     {
  92       dir = from_utf8;
  93     }
  94   else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0
  95            && __strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0)
  96     {
  97       dir = to_utf8;
  98     }
  99
 100   result = __GCONV_NOCONV;
 101   if (dir != illegal_dir)
 102     {
 103       new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data));
 104
 105       result = __GCONV_NOMEM;
 106       if (new_data != NULL)
 107         {
 108           new_data->dir = dir;
 109           new_data->emit_bom = emit_bom;
 110           step->__data = new_data;
 111
 112           if (dir == from_utf8)
 113             {
 114               step->__min_needed_from = MIN_NEEDED_FROM;
 115               step->__max_needed_from = MIN_NEEDED_FROM;
 116               step->__min_needed_to = MIN_NEEDED_TO;
 117               step->__max_needed_to = MIN_NEEDED_TO;
 118             }
 119           else
 120             {
 121               step->__min_needed_from = MIN_NEEDED_TO;
 122               step->__max_needed_from = MIN_NEEDED_TO;
 123               step->__min_needed_to = MIN_NEEDED_FROM;
 124               step->__max_needed_to = MIN_NEEDED_FROM;
 125             }
 126
 127           step->__stateful = 0;
 128
 129           result = __GCONV_OK;
 130         }
 131     }
 132
 133   return result;
 134 }
 135
 136
 137 extern void gconv_end (struct __gconv_step *data);
 138 void
 139 gconv_end (struct __gconv_step *data)
 140 {
 141   free (data->__data);
 142 }
 143
 144 /* The macro for the hardware loop.  This is used for both
 145    directions.  */
 146 #define HARDWARE_CONVERT(INSTRUCTION)                                   \
 147   {                                                                     \
 148     register const unsigned char* pInput asm ("8") = inptr;             \
 149     register unsigned long long inlen asm ("9") = inend - inptr;        \
 150     register unsigned char* pOutput asm ("10") = outptr;                \
 151     register unsigned long long outlen asm("11") = outend - outptr;     \
 152     uint64_t cc = 0;                                                    \
 153                                                                         \
 154     asm volatile (".machine push       \n\t"                            \
 155                   ".machine \"z9-109\" \n\t"                            \
 156                   "0: " INSTRUCTION "  \n\t"                            \
 157                   ".machine pop        \n\t"                            \
 158                   "   jo     0b        \n\t"                            \
 159                   "   ipm    %2        \n"                              \
 160                   : "+a" (pOutput), "+a" (pInput), "+d" (cc),           \
 161                     "+d" (outlen), "+d" (inlen)                         \
 162                   :                                                     \
 163                   : "cc", "memory");                                    \
 164                                                                         \
 165     inptr = pInput;                                                     \
 166     outptr = pOutput;                                                   \
 167     cc >>= 28;                                                          \
 168                                                                         \
 169     if (cc == 1)                                                        \
 170       {                                                                 \
 171         result = __GCONV_FULL_OUTPUT;                                   \
 172         break;                                                          \
 173       }                                                                 \
 174     else if (cc == 2)                                                   \
 175       {                                                                 \
 176         result = __GCONV_ILLEGAL_INPUT;                                 \
 177         break;                                                          \
 178       }                                                                 \
 179   }
 180
 181 /* Conversion function from UTF-8 to UTF-16.  */
 182
 183 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
 184 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
 185 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
 186 #define LOOPFCT                 FROM_LOOP
 187 /* The software implementation is based on the code in gconv_simple.c.  */
 188 #define BODY                                                            \
 189   {                                                                     \
 190     if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)                            \
 191       {                                                                 \
 192         HARDWARE_CONVERT ("cu12 %0, %1, 1");                            \
 193                                                                         \
 194         if (inptr != inend)                                             \
 195           {                                                             \
 196             int i;                                                      \
 197             for (i = 1; inptr + i < inend; ++i)                         \
 198               if ((inptr[i] & 0xc0) != 0x80)                            \
 199                 break;                                                  \
 200                                                                 \
 201             if (__glibc_likely (inptr + i == inend))                          \
 202               {                                                         \
 203                 result = __GCONV_INCOMPLETE_INPUT;                      \
 204                 break;                                                  \
 205               }                                                         \
 206             STANDARD_FROM_LOOP_ERR_HANDLER (i);                         \
 207           }                                                             \
 208         continue;                                                       \
 209     }                                                                   \
 210                                                                         \
 211     /* Next input byte.  */                                             \
 212     uint16_t ch = *inptr;                                               \
 213                                                                         \
 214     if (__glibc_likely (ch < 0x80))                                           \
 215       {                                                                 \
 216         /* One byte sequence.  */                                       \
 217         ++inptr;                                                        \
 218       }                                                                 \
 219     else                                                                \
 220       {                                                                 \
 221         uint_fast32_t cnt;                                              \
 222         uint_fast32_t i;                                                \
 223                                                                         \
 224         if (ch >= 0xc2 && ch < 0xe0)                                    \
 225           {                                                             \
 226             /* We expect two bytes.  The first byte cannot be 0xc0      \
 227                or 0xc1, otherwise the wide character could have been    \
 228                represented using a single byte.  */                     \
 229             cnt = 2;                                                    \
 230             ch &= 0x1f;                                                 \
 231           }                                                             \
 232         else if (__glibc_likely ((ch & 0xf0) == 0xe0))                        \
 233           {                                                             \
 234             /* We expect three bytes.  */                               \
 235             cnt = 3;                                                    \
 236             ch &= 0x0f;                                                 \
 237           }                                                             \
 238         else if (__glibc_likely ((ch & 0xf8) == 0xf0))                        \
 239           {                                                             \
 240             /* We expect four bytes.  */                                \
 241             cnt = 4;                                                    \
 242             ch &= 0x07;                                                 \
 243           }                                                             \
 244         else                                                            \
 245           {                                                             \
 246             /* Search the end of this ill-formed UTF-8 character.  This \
 247                is the next byte with (x & 0xc0) != 0x80.  */            \
 248             i = 0;                                                      \
 249             do                                                          \
 250               ++i;                                                      \
 251             while (inptr + i < inend                                    \
 252                    && (*(inptr + i) & 0xc0) == 0x80                     \
 253                    && i < 5);                                           \
 254                                                                         \
 255           errout:                                                       \
 256             STANDARD_FROM_LOOP_ERR_HANDLER (i);                         \
 257           }                                                             \
 258                                                                         \
 259         if (__glibc_unlikely (inptr + cnt > inend))                           \
 260           {                                                             \
 261             /* We don't have enough input.  But before we report        \
 262                that check that all the bytes are correct.  */           \
 263             for (i = 1; inptr + i < inend; ++i)                         \
 264               if ((inptr[i] & 0xc0) != 0x80)                            \
 265                 break;                                                  \
 266                                                                         \
 267             if (__glibc_likely (inptr + i == inend))                          \
 268               {                                                         \
 269                 result = __GCONV_INCOMPLETE_INPUT;                      \
 270                 break;                                                  \
 271               }                                                         \
 272                                                                         \
 273             goto errout;                                                \
 274           }                                                             \
 275                                                                         \
 276         if (cnt == 4)                                                   \
 277           {                                                             \
 278             /* For 4 byte UTF-8 chars two UTF-16 chars (high and        \
 279                low) are needed.  */                                     \
 280             uint16_t zabcd, high, low;                                  \
 281                                                                         \
 282             if (__glibc_unlikely (outptr + 4 > outend))                       \
 283               {                                                         \
 284                 /* Overflow in the output buffer.  */                   \
 285                 result = __GCONV_FULL_OUTPUT;                           \
 286                 break;                                                  \
 287               }                                                         \
 288                                                                         \
 289             /* See Principles of Operations cu12.  */                   \
 290             zabcd = (((inptr[0] & 0x7) << 2) |                          \
 291                      ((inptr[1] & 0x30) >> 4)) - 1;                     \
 292                                                                         \
 293             /* z-bit must be zero after subtracting 1.  */              \
 294             if (zabcd & 0x10)                                           \
 295               STANDARD_FROM_LOOP_ERR_HANDLER (4)                        \
 296                                                                         \
 297             high = (uint16_t)(0xd8 << 8);       /* high surrogate id */ \
 298             high |= zabcd << 6;                         /* abcd bits */ \
 299             high |= (inptr[1] & 0xf) << 2;              /* efgh bits */ \
 300             high |= (inptr[2] & 0x30) >> 4;               /* ij bits */ \
 301                                                                         \
 302             low = (uint16_t)(0xdc << 8);         /* low surrogate id */ \
 303             low |= ((uint16_t)inptr[2] & 0xc) << 6;       /* kl bits */ \
 304             low |= (inptr[2] & 0x3) << 6;                 /* mn bits */ \
 305             low |= inptr[3] & 0x3f;                   /* opqrst bits */ \
 306                                                                         \
 307             put16 (outptr, high);                                       \
 308             outptr += 2;                                                \
 309             put16 (outptr, low);                                        \
 310             outptr += 2;                                                \
 311             inptr += 4;                                                 \
 312             continue;                                                   \
 313           }                                                             \
 314         else                                                            \
 315           {                                                             \
 316             /* Read the possible remaining bytes.  */                   \
 317             for (i = 1; i < cnt; ++i)                                   \
 318               {                                                         \
 319                 uint16_t byte = inptr[i];                               \
 320                                                                         \
 321                 if ((byte & 0xc0) != 0x80)                              \
 322                   /* This is an illegal encoding.  */                   \
 323                   break;                                                \
 324                                                                         \
 325                 ch <<= 6;                                               \
 326                 ch |= byte & 0x3f;                                      \
 327               }                                                         \
 328             inptr += cnt;                                               \
 329                                                                         \
 330           }                                                             \
 331       }                                                                 \
 332     /* Now adjust the pointers and store the result.  */                \
 333     *((uint16_t *) outptr) = ch;                                        \
 334     outptr += sizeof (uint16_t);                                        \
 335   }
 336
 337 #define LOOP_NEED_FLAGS
 338 #include <iconv/loop.c>
 339
 340 /* Conversion from UTF-16 to UTF-8.  */
 341
 342 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
 343 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
 344 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
 345 #define LOOPFCT                 TO_LOOP
 346 /* The software routine is based on the functionality of the S/390
 347    hardware instruction (cu21) as described in the Principles of
 348    Operation.  */
 349 #define BODY                                                            \
 350   {                                                                     \
 351     /* The hardware instruction currently fails to report an error for  \
 352        isolated low surrogates so we have to disable the instruction    \
 353        until this gets resolved.  */                                    \
 354     if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */                  \
 355       {                                                                 \
 356         HARDWARE_CONVERT ("cu21 %0, %1, 1");                            \
 357         if (inptr != inend)                                             \
 358           {                                                             \
 359             /* Check if the third byte is                               \
 360                a valid start of a UTF-16 surrogate.  */                 \
 361             if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc)        \
 362               STANDARD_TO_LOOP_ERR_HANDLER (3);                         \
 363                                                                         \
 364             result = __GCONV_INCOMPLETE_INPUT;                          \
 365             break;                                                      \
 366           }                                                             \
 367         continue;                                                       \
 368       }                                                                 \
 369                                                                         \
 370     uint16_t c = get16 (inptr);                                         \
 371                                                                         \
 372     if (__glibc_likely (c <= 0x007f))                                         \
 373       {                                                                 \
 374         /* Single byte UTF-8 char.  */                                  \
 375         *outptr = c & 0xff;                                             \
 376         outptr++;                                                       \
 377       }                                                                 \
 378     else if (c >= 0x0080 && c <= 0x07ff)                                \
 379       {                                                                 \
 380         /* Two byte UTF-8 char.  */                                     \
 381                                                                         \
 382         if (__glibc_unlikely (outptr + 2 > outend))                           \
 383           {                                                             \
 384             /* Overflow in the output buffer.  */                       \
 385             result = __GCONV_FULL_OUTPUT;                               \
 386             break;                                                      \
 387           }                                                             \
 388                                                                         \
 389         outptr[0] = 0xc0;                                               \
 390         outptr[0] |= c >> 6;                                            \
 391                                                                         \
 392         outptr[1] = 0x80;                                               \
 393         outptr[1] |= c & 0x3f;                                          \
 394                                                                         \
 395         outptr += 2;                                                    \
 396       }                                                                 \
 397     else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff)                \
 398       {                                                                 \
 399         /* Three byte UTF-8 char.  */                                   \
 400                                                                         \
 401         if (__glibc_unlikely (outptr + 3 > outend))                           \
 402           {                                                             \
 403             /* Overflow in the output buffer.  */                       \
 404             result = __GCONV_FULL_OUTPUT;                               \
 405             break;                                                      \
 406           }                                                             \
 407         outptr[0] = 0xe0;                                               \
 408         outptr[0] |= c >> 12;                                           \
 409                                                                         \
 410         outptr[1] = 0x80;                                               \
 411         outptr[1] |= (c >> 6) & 0x3f;                                   \
 412                                                                         \
 413         outptr[2] = 0x80;                                               \
 414         outptr[2] |= c & 0x3f;                                          \
 415                                                                         \
 416         outptr += 3;                                                    \
 417       }                                                                 \
 418     else if (c >= 0xd800 && c <= 0xdbff)                                \
 419       {                                                                 \
 420         /* Four byte UTF-8 char.  */                                    \
 421         uint16_t low, uvwxy;                                            \
 422                                                                         \
 423         if (__glibc_unlikely (outptr + 4 > outend))                           \
 424           {                                                             \
 425             /* Overflow in the output buffer.  */                       \
 426             result = __GCONV_FULL_OUTPUT;                               \
 427             break;                                                      \
 428           }                                                             \
 429         inptr += 2;                                                     \
 430         if (__glibc_unlikely (inptr + 2 > inend))                             \
 431           {                                                             \
 432             result = __GCONV_INCOMPLETE_INPUT;                          \
 433             break;                                                      \
 434           }                                                             \
 435                                                                         \
 436         low = get16 (inptr);                                            \
 437                                                                         \
 438         if ((low & 0xfc00) != 0xdc00)                                   \
 439           {                                                             \
 440             inptr -= 2;                                                 \
 441             STANDARD_TO_LOOP_ERR_HANDLER (2);                           \
 442           }                                                             \
 443         uvwxy = ((c >> 6) & 0xf) + 1;                                   \
 444         outptr[0] = 0xf0;                                               \
 445         outptr[0] |= uvwxy >> 2;                                        \
 446                                                                         \
 447         outptr[1] = 0x80;                                               \
 448         outptr[1] |= (uvwxy << 4) & 0x30;                               \
 449         outptr[1] |= (c >> 2) & 0x0f;                                   \
 450                                                                         \
 451         outptr[2] = 0x80;                                               \
 452         outptr[2] |= (c & 0x03) << 4;                                   \
 453         outptr[2] |= (low >> 6) & 0x0f;                                 \
 454                                                                         \
 455         outptr[3] = 0x80;                                               \
 456         outptr[3] |= low & 0x3f;                                        \
 457                                                                         \
 458         outptr += 4;                                                    \
 459       }                                                                 \
 460     else                                                                \
 461       {                                                                 \
 462         STANDARD_TO_LOOP_ERR_HANDLER (2);                               \
 463       }                                                                 \
 464     inptr += 2;                                                         \
 465   }
 466 #define LOOP_NEED_FLAGS
 467 #include <iconv/loop.c>
 468
 469 #include <iconv/skeleton.c>