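/* Provenance (gitweb page header, not part of the original source file):
   commit subject: Update copyright notices with scripts/update-copyrights
   path: [glibc.git] / sysdeps / s390 / s390-64 / utf8-utf32-z9.c
   blob 51208370377891751d4a2065f5676028f984e77a  */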
/* Conversion between UTF-8 and UTF-32 BE/internal.

   This module uses the Z9-109 variants of the Convert Unicode
   instructions.
   Copyright (C) 1997-2014 Free Software Foundation, Inc.

   Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
   Based on the work by Ulrich Drepper  <drepper@cygnus.com>, 1997.

   Thanks to Daniel Appich who covered the relevant performance work
   in his diploma thesis.

   This is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   This is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
27 #include <dlfcn.h>
28 #include <stdint.h>
29 #include <unistd.h>
30 #include <dl-procinfo.h>
31 #include <gconv.h>
/* UTF-32 big endian byte order mark.  */
#define BOM			0x0000feffu

/* NOTE(review): presumably these tell iconv/skeleton.c not to generate
   its own init/fini functions, because this file defines gconv_init and
   gconv_end itself -- confirm against skeleton.c.  */
#define DEFINE_INIT		0
#define DEFINE_FINI		0

/* These definitions apply to the UTF-8 to UTF-32 direction.  The
   software implementation for UTF-8 still supports multibyte
   characters up to 6 bytes whereas the hardware variant does not.  */
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		6
#define MIN_NEEDED_TO		4
#define FROM_LOOP		from_utf8_loop
#define TO_LOOP			to_utf8_loop
#define FROM_DIRECTION		(dir == from_utf8)

/* Per-call setup.  Fetches the direction and the BOM flag that
   gconv_init stored in step->__data.  When a BOM is requested, the call
   is not for internal use and this is the first invocation, the four
   byte BOM is written to the output buffer; if fewer than four bytes
   are available the enclosing function returns __GCONV_FULL_OUTPUT.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf8_data *) step->__data)->dir; \
  int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \
  \
  if (emit_bom && !data->__internal_use \
      && data->__invocation_counter == 0) \
    { \
      /* Emit the Byte Order Mark.  */ \
      if (__builtin_expect (outbuf + 4 > outend, 0)) \
	return __GCONV_FULL_OUTPUT; \
      \
      put32u (outbuf, BOM); \
      outbuf += 4; \
    }
/* Direction of the transformation.  illegal_dir is the value used
   before a supported charset pair has been recognised.  */
enum direction
{
  illegal_dir,
  to_utf8,
  from_utf8
};
70 struct utf8_data
72 enum direction dir;
73 int emit_bom;
77 extern int gconv_init (struct __gconv_step *step);
78 int
79 gconv_init (struct __gconv_step *step)
81 /* Determine which direction. */
82 struct utf8_data *new_data;
83 enum direction dir = illegal_dir;
84 int emit_bom;
85 int result;
87 emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0);
89 if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0
90 && (__strcasecmp (step->__to_name, "UTF-32//") == 0
91 || __strcasecmp (step->__to_name, "UTF-32BE//") == 0
92 || __strcasecmp (step->__to_name, "INTERNAL") == 0))
94 dir = from_utf8;
96 else if (__strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0
97 && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0
98 || __strcasecmp (step->__from_name, "INTERNAL") == 0))
100 dir = to_utf8;
103 result = __GCONV_NOCONV;
104 if (dir != illegal_dir)
106 new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data));
108 result = __GCONV_NOMEM;
109 if (new_data != NULL)
111 new_data->dir = dir;
112 new_data->emit_bom = emit_bom;
113 step->__data = new_data;
115 if (dir == from_utf8)
117 step->__min_needed_from = MIN_NEEDED_FROM;
118 step->__max_needed_from = MIN_NEEDED_FROM;
119 step->__min_needed_to = MIN_NEEDED_TO;
120 step->__max_needed_to = MIN_NEEDED_TO;
122 else
124 step->__min_needed_from = MIN_NEEDED_TO;
125 step->__max_needed_from = MIN_NEEDED_TO;
126 step->__min_needed_to = MIN_NEEDED_FROM;
127 step->__max_needed_to = MIN_NEEDED_FROM;
130 step->__stateful = 0;
132 result = __GCONV_OK;
136 return result;
140 extern void gconv_end (struct __gconv_step *data);
141 void
142 gconv_end (struct __gconv_step *data)
144 free (data->__data);
/* The macro for the hardware loop.  This is used for both
   directions.

   INSTRUCTION is the assembler text of the convert instruction; it
   refers to the output pointer as %0 and to the input pointer as %1.
   The pointers and remaining lengths are pinned to registers 8/9
   (input) and 10/11 (output).  After the instruction has finished,
   inptr and outptr are updated from those registers and the condition
   code is examined: 1 leaves the enclosing loop with
   __GCONV_FULL_OUTPUT, 2 leaves it with __GCONV_ILLEGAL_INPUT.
   NOTE(review): "jo 0b" re-issues the instruction, presumably while the
   CPU reports partial completion (condition code 3) -- confirm against
   the z/Architecture Principles of Operation.  */
#define HARDWARE_CONVERT(INSTRUCTION) \
  { \
    register const unsigned char* pInput asm ("8") = inptr; \
    register unsigned long long inlen asm ("9") = inend - inptr; \
    register unsigned char* pOutput asm ("10") = outptr; \
    register unsigned long long outlen asm("11") = outend - outptr; \
    uint64_t cc = 0; \
    \
    asm volatile (".machine push       \n\t" \
		  ".machine \"z9-109\" \n\t" \
		  "0: " INSTRUCTION "  \n\t" \
		  ".machine pop        \n\t" \
		  "   jo     0b        \n\t" \
		  "   ipm    %2        \n" \
		  : "+a" (pOutput), "+a" (pInput), "+d" (cc), \
		    "+d" (outlen), "+d" (inlen) \
		  : /* no input operands */ \
		  : "cc", "memory"); \
    \
    inptr = pInput; \
    outptr = pOutput; \
    /* Move the condition code stored by ipm down to the low bits.  */ \
    cc >>= 28; \
    \
    if (cc == 1) \
      { \
	result = __GCONV_FULL_OUTPUT; \
	break; \
      } \
    else if (cc == 2) \
      { \
	result = __GCONV_ILLEGAL_INPUT; \
	break; \
      } \
  }
/* Conversion function from UTF-8 to UTF-32 internal/BE.  */

#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
#define LOOPFCT			FROM_LOOP
/* The software routine is copied from gconv_simple.c.

   With the ETF3EH facility the whole buffer is handed to cu14.  If
   input is left over afterwards it is either the start of a sequence
   whose continuation bytes have not arrived yet (incomplete input) or
   it is passed to the standard error handler.

   Without the facility one character is decoded per iteration: the
   lead byte selects the sequence length (1 to 6 bytes), the trail bytes
   are validated and merged, over-long encodings are rejected, and the
   resulting value is stored as one 32-bit word.  */
#define BODY \
  { \
    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
      { \
	HARDWARE_CONVERT ("cu14 %0, %1, 1"); \
	\
	if (inptr != inend) \
	  { \
	    int i; \
	    for (i = 1; inptr + i < inend; ++i) \
	      if ((inptr[i] & 0xc0) != 0x80) \
		break; \
	    \
	    if (__builtin_expect (inptr + i == inend, 1)) \
	      { \
		result = __GCONV_INCOMPLETE_INPUT; \
		break; \
	      } \
	    STANDARD_FROM_LOOP_ERR_HANDLER (i); \
	  } \
	continue; \
      } \
    \
    /* Next input byte.  */ \
    uint32_t ch = *inptr; \
    \
    if (__builtin_expect (ch < 0x80, 1)) \
      { \
	/* One byte sequence.  */ \
	++inptr; \
      } \
    else \
      { \
	uint_fast32_t cnt; \
	uint_fast32_t i; \
	\
	if (ch >= 0xc2 && ch < 0xe0) \
	  { \
	    /* We expect two bytes.  The first byte cannot be 0xc0 or \
	       0xc1, otherwise the wide character could have been \
	       represented using a single byte.  */ \
	    cnt = 2; \
	    ch &= 0x1f; \
	  } \
	else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \
	  { \
	    /* We expect three bytes.  */ \
	    cnt = 3; \
	    ch &= 0x0f; \
	  } \
	else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \
	  { \
	    /* We expect four bytes.  */ \
	    cnt = 4; \
	    ch &= 0x07; \
	  } \
	else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \
	  { \
	    /* We expect five bytes.  */ \
	    cnt = 5; \
	    ch &= 0x03; \
	  } \
	else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1)) \
	  { \
	    /* We expect six bytes.  */ \
	    cnt = 6; \
	    ch &= 0x01; \
	  } \
	else \
	  { \
	    /* Search the end of this ill-formed UTF-8 character.  This \
	       is the next byte with (x & 0xc0) != 0x80.  */ \
	    i = 0; \
	    do \
	      ++i; \
	    while (inptr + i < inend \
		   && (*(inptr + i) & 0xc0) == 0x80 \
		   && i < 5); \
	    \
	  errout: \
	    STANDARD_FROM_LOOP_ERR_HANDLER (i); \
	  } \
	\
	if (__builtin_expect (inptr + cnt > inend, 0)) \
	  { \
	    /* We don't have enough input.  But before we report \
	       that check that all the bytes are correct.  */ \
	    for (i = 1; inptr + i < inend; ++i) \
	      if ((inptr[i] & 0xc0) != 0x80) \
		break; \
	    \
	    if (__builtin_expect (inptr + i == inend, 1)) \
	      { \
		result = __GCONV_INCOMPLETE_INPUT; \
		break; \
	      } \
	    \
	    goto errout; \
	  } \
	\
	/* Read the possible remaining bytes.  */ \
	for (i = 1; i < cnt; ++i) \
	  { \
	    uint32_t byte = inptr[i]; \
	    \
	    if ((byte & 0xc0) != 0x80) \
	      /* This is an illegal encoding.  */ \
	      break; \
	    \
	    ch <<= 6; \
	    ch |= byte & 0x3f; \
	  } \
	\
	/* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
	   If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
	   have been represented with fewer than cnt bytes.  */ \
	if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)) \
	  { \
	    /* This is an illegal encoding.  */ \
	    goto errout; \
	  } \
	\
	inptr += cnt; \
      } \
    \
    /* Now adjust the pointers and store the result.  */ \
    *((uint32_t *) outptr) = ch; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
/* Save an incomplete UTF-8 sequence in the conversion state.

   The low byte of state->__count receives the number of bytes that
   were available, bits 8 and up the total length implied by the lead
   byte.  state->__value.__wch receives the payload bits read so far,
   shifted left as if the missing trail bytes were all zero.  *inptrp is
   advanced to inend.  */
#define STORE_REST \
  { \
    /* We store the remaining bytes while converting them into the UCS4 \
       format.  We can assume that the first byte in the buffer is \
       correct and that it requires a larger number of bytes than there \
       are in the input buffer.  */ \
    wint_t ch = **inptrp; \
    size_t cnt, r; \
    \
    state->__count = inend - *inptrp; \
    \
    if (ch >= 0xc2 && ch < 0xe0) \
      { \
	/* We expect two bytes.  The first byte cannot be 0xc0 or \
	   0xc1, otherwise the wide character could have been \
	   represented using a single byte.  */ \
	cnt = 2; \
	ch &= 0x1f; \
      } \
    else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \
      { \
	/* We expect three bytes.  */ \
	cnt = 3; \
	ch &= 0x0f; \
      } \
    else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \
      { \
	/* We expect four bytes.  */ \
	cnt = 4; \
	ch &= 0x07; \
      } \
    else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \
      { \
	/* We expect five bytes.  */ \
	cnt = 5; \
	ch &= 0x03; \
      } \
    else \
      { \
	/* We expect six bytes.  Every other lead byte ends up here, \
	   which relies on the assumption stated above.  */ \
	cnt = 6; \
	ch &= 0x01; \
      } \
    \
    /* The first byte is already consumed.  */ \
    r = cnt - 1; \
    while (++(*inptrp) < inend) \
      { \
	ch <<= 6; \
	ch |= **inptrp & 0x3f; \
	--r; \
      } \
    \
    /* Shift for the so far missing bytes.  */ \
    ch <<= r * 6; \
    \
    /* Store the number of bytes expected for the entire sequence.  */ \
    state->__count |= cnt << 8; \
    \
    /* Store the value.  */ \
    state->__value.__wch = ch; \
  }
/* Rebuild the bytes of a partially read UTF-8 sequence from the
   conversion state.

   state->__count holds the number of bytes already seen in its low
   byte and the total sequence length above it; state->__value.__wch
   holds the payload bits.  The bytes seen so far are written to
   bytebuf[0 .. inlen-1] and inlen is set to their count.  */
#define UNPACK_BYTES \
  { \
    /* Lead byte markers for sequences of 2, 3, 4, 5 and 6 bytes.  */ \
    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
    wint_t wch = state->__value.__wch; \
    size_t ntotal = state->__count >> 8; \
    \
    inlen = state->__count & 255; \
    \
    bytebuf[0] = inmask[ntotal - 2]; \
    \
    /* Walk the trail byte positions from last to first.  Positions \
       that were never read are skipped, but their six payload bits \
       are still shifted out.  */ \
    do \
      { \
	if (--ntotal < inlen) \
	  bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
	wch >>= 6; \
      } \
    while (ntotal > 1); \
    \
    /* What is left are the payload bits of the lead byte.  */ \
    bytebuf[0] |= wch; \
  }
/* Forget any partially read character kept in the state.  */
#define CLEAR_STATE \
  state->__count = 0
409 #include <iconv/loop.c>
/* Conversion from UTF-32 internal/BE to UTF-8.  */

#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
#define LOOPFCT			TO_LOOP
/* The software routine mimics the S/390 cu41 instruction.

   With the ETF3EH facility the whole buffer is handed to cu41; input
   left over afterwards is reported as incomplete.

   Without the facility one 32-bit value is encoded per iteration as 1
   to 4 bytes.  Values above 0x10ffff and UTF-16 surrogate code points
   (0xd800..0xdfff) are passed to the standard error handler; the
   surrogates used to be encoded as three bytes, which is not well-formed
   UTF-8.
   NOTE(review): the hardware path is unchanged; whether cu41 rejects
   the complete surrogate range should be confirmed against the
   z/Architecture Principles of Operation.  */
#define BODY \
  { \
    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
      { \
	HARDWARE_CONVERT ("cu41 %0, %1"); \
	\
	if (inptr != inend) \
	  { \
	    result = __GCONV_INCOMPLETE_INPUT; \
	    break; \
	  } \
	continue; \
      } \
    \
    uint32_t wc = *((const uint32_t *) inptr); \
    \
    if (__builtin_expect (wc <= 0x7f, 1)) \
      { \
	/* Single UTF-8 char.  */ \
	*outptr = (uint8_t)wc; \
	outptr++; \
      } \
    else if (wc <= 0x7ff) \
      { \
	/* Two UTF-8 chars.  */ \
	if (__builtin_expect (outptr + 2 > outend, 0)) \
	  { \
	    /* Overflow in the output buffer.  */ \
	    result = __GCONV_FULL_OUTPUT; \
	    break; \
	  } \
	\
	outptr[0] = 0xc0; \
	outptr[0] |= wc >> 6; \
	\
	outptr[1] = 0x80; \
	outptr[1] |= wc & 0x3f; \
	\
	outptr += 2; \
      } \
    else if (wc <= 0xffff) \
      { \
	if (__builtin_expect (wc >= 0xd800 && wc <= 0xdfff, 0)) \
	  { \
	    /* Surrogates are reserved for UTF-16 and are not valid \
	       characters; do not encode them.  */ \
	    STANDARD_TO_LOOP_ERR_HANDLER (4); \
	  } \
	\
	/* Three UTF-8 chars.  */ \
	if (__builtin_expect (outptr + 3 > outend, 0)) \
	  { \
	    /* Overflow in the output buffer.  */ \
	    result = __GCONV_FULL_OUTPUT; \
	    break; \
	  } \
	outptr[0] = 0xe0; \
	outptr[0] |= wc >> 12; \
	\
	outptr[1] = 0x80; \
	outptr[1] |= (wc >> 6) & 0x3f; \
	\
	outptr[2] = 0x80; \
	outptr[2] |= wc & 0x3f; \
	\
	outptr += 3; \
      } \
    else if (wc <= 0x10ffff) \
      { \
	/* Four UTF-8 chars.  */ \
	if (__builtin_expect (outptr + 4 > outend, 0)) \
	  { \
	    /* Overflow in the output buffer.  */ \
	    result = __GCONV_FULL_OUTPUT; \
	    break; \
	  } \
	outptr[0] = 0xf0; \
	outptr[0] |= wc >> 18; \
	\
	outptr[1] = 0x80; \
	outptr[1] |= (wc >> 12) & 0x3f; \
	\
	outptr[2] = 0x80; \
	outptr[2] |= (wc >> 6) & 0x3f; \
	\
	outptr[3] = 0x80; \
	outptr[3] |= wc & 0x3f; \
	\
	outptr += 4; \
      } \
    else \
      { \
	STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
508 #include <iconv/loop.c>
510 #include <iconv/skeleton.c>