/* Snapshot of sysdeps/s390/s390-64/utf8-utf32-z9.c from glibc.git
   (blob 9a744482858b9f39741b5506e2f95aa03c1e1c26), as shown for the
   commit "Update copyright dates with scripts/update-copyrights."  */
/* Conversion between UTF-8 and UTF-32 BE/internal.

   This module uses the Z9-109 variants of the Convert Unicode
   instructions.
   Copyright (C) 1997-2015 Free Software Foundation, Inc.

   Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
   Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997.

   Thanks to Daniel Appich who covered the relevant performance work
   in his diploma thesis.

   This is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   This is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
27 #include <dlfcn.h>
28 #include <stdint.h>
29 #include <unistd.h>
30 #include <dl-procinfo.h>
31 #include <gconv.h>
/* UTF-32 big endian byte order mark.  PREPARE_LOOP writes this value
   with put32u when a byte order mark has to be emitted.  */
#define BOM 0x0000feffu
/* This file supplies its own gconv_init and gconv_end, so both switches
   are set to 0 (presumably telling the iconv skeleton not to generate
   default versions -- confirm against iconv/skeleton.c).  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0

/* These definitions apply to the UTF-8 to UTF-32 direction.  The
   software implementation for UTF-8 still supports multibyte
   characters up to 6 bytes whereas the hardware variant does not.

   "FROM" is the UTF-8 side (1 to 6 bytes per character), "TO" is the
   UTF-32 side (always 4 bytes per character).  */
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 6
#define MIN_NEEDED_TO 4

/* Names of the two conversion loops built from the BODY macros.  */
#define FROM_LOOP from_utf8_loop
#define TO_LOOP to_utf8_loop

/* The direction is chosen at run time from the `dir' variable that
   PREPARE_LOOP declares; the module handles both directions.  */
#define FROM_DIRECTION (dir == from_utf8)
#define ONE_DIRECTION 0
/* Statements expanded before the conversion loops run.

   Declares two locals read from the per-step private data
   (struct utf8_data stored in step->__data):
     dir       - conversion direction, consumed by FROM_DIRECTION;
     emit_bom  - non-zero when the output has to start with a BOM.

   On the very first invocation (__invocation_counter == 0) of a
   conversion that is not for internal use, the big-endian BOM is
   written to the output buffer and outbuf is advanced by 4.  If fewer
   than 4 bytes are free the enclosing function returns
   __GCONV_FULL_OUTPUT immediately.

   Expects `step', `data', `outbuf' and `outend' to be in scope.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf8_data *) step->__data)->dir; \
  int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \
  \
  if (emit_bom && !data->__internal_use \
      && data->__invocation_counter == 0) \
    { \
      /* Emit the Byte Order Mark.  */ \
      if (__glibc_unlikely (outbuf + 4 > outend)) \
        return __GCONV_FULL_OUTPUT; \
      \
      put32u (outbuf, BOM); \
      outbuf += 4; \
    }
/* Direction of the transformation.  illegal_dir is the initial value
   used by gconv_init until a supported charset pair is recognised.  */
enum direction
{
  illegal_dir,
  to_utf8,
  from_utf8
};

/* Per-step private data.  Allocated by gconv_init, stored in
   step->__data, read by PREPARE_LOOP and released by gconv_end.  */
struct utf8_data
{
  enum direction dir;   /* Which way this step converts.  */
  int emit_bom;         /* Non-zero: start the output with a BOM.  */
};
78 extern int gconv_init (struct __gconv_step *step);
79 int
80 gconv_init (struct __gconv_step *step)
82 /* Determine which direction. */
83 struct utf8_data *new_data;
84 enum direction dir = illegal_dir;
85 int emit_bom;
86 int result;
88 emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0);
90 if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0
91 && (__strcasecmp (step->__to_name, "UTF-32//") == 0
92 || __strcasecmp (step->__to_name, "UTF-32BE//") == 0
93 || __strcasecmp (step->__to_name, "INTERNAL") == 0))
95 dir = from_utf8;
97 else if (__strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0
98 && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0
99 || __strcasecmp (step->__from_name, "INTERNAL") == 0))
101 dir = to_utf8;
104 result = __GCONV_NOCONV;
105 if (dir != illegal_dir)
107 new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data));
109 result = __GCONV_NOMEM;
110 if (new_data != NULL)
112 new_data->dir = dir;
113 new_data->emit_bom = emit_bom;
114 step->__data = new_data;
116 if (dir == from_utf8)
118 step->__min_needed_from = MIN_NEEDED_FROM;
119 step->__max_needed_from = MIN_NEEDED_FROM;
120 step->__min_needed_to = MIN_NEEDED_TO;
121 step->__max_needed_to = MIN_NEEDED_TO;
123 else
125 step->__min_needed_from = MIN_NEEDED_TO;
126 step->__max_needed_from = MIN_NEEDED_TO;
127 step->__min_needed_to = MIN_NEEDED_FROM;
128 step->__max_needed_to = MIN_NEEDED_FROM;
131 step->__stateful = 0;
133 result = __GCONV_OK;
137 return result;
141 extern void gconv_end (struct __gconv_step *data);
142 void
143 gconv_end (struct __gconv_step *data)
145 free (data->__data);
/* The macro for the hardware loop.  This is used for both
   directions.

   INSTRUCTION is the assembler text of one Convert Unicode
   instruction.  In it %0 names the output pointer and %1 the input
   pointer.  The pointers and their remaining byte counts are pinned to
   fixed registers: input pointer/length in r8/r9, output
   pointer/length in r10/r11.

   The instruction is re-executed ("jo 0b") for as long as it ends with
   condition code 3.  Afterwards the condition code is fetched with
   "ipm" and shifted down so that `cc' holds it as a small integer, and
   inptr/outptr are updated from the registers.

     cc == 1: result = __GCONV_FULL_OUTPUT and leave the enclosing loop;
     cc == 2: result = __GCONV_ILLEGAL_INPUT and leave the enclosing
              loop;
     otherwise execution continues after the macro.

   The macro expands to a block containing `break', so it must be used
   inside a loop.  Expects `inptr', `inend', `outptr', `outend' and
   `result' to be in scope.  */
#define HARDWARE_CONVERT(INSTRUCTION) \
  { \
    register const unsigned char* pInput asm ("8") = inptr; \
    register unsigned long long inlen asm ("9") = inend - inptr; \
    register unsigned char* pOutput asm ("10") = outptr; \
    register unsigned long long outlen asm("11") = outend - outptr; \
    uint64_t cc = 0; \
    \
    asm volatile (".machine push \n\t" \
                  ".machine \"z9-109\" \n\t" \
                  "0: " INSTRUCTION " \n\t" \
                  ".machine pop \n\t" \
                  " jo 0b \n\t" \
                  " ipm %2 \n" \
                  : "+a" (pOutput), "+a" (pInput), "+d" (cc), \
                    "+d" (outlen), "+d" (inlen) \
                  : /* No input-only operands.  */ \
                  : "cc", "memory"); \
    \
    inptr = pInput; \
    outptr = pOutput; \
    cc >>= 28; \
    \
    if (cc == 1) \
      { \
        result = __GCONV_FULL_OUTPUT; \
        break; \
      } \
    else if (cc == 2) \
      { \
        result = __GCONV_ILLEGAL_INPUT; \
        break; \
      } \
  }
/* Conversion function from UTF-8 to UTF-32 internal/BE.  */

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* The software routine is copied from gconv_simple.c.

   One expansion of BODY handles one step of the conversion loop.

   Hardware path (HWCAP_S390_ETF3EH set in dl_hwcap): the cu14
   instruction converts as much as it can.  If input is left over
   afterwards, the bytes after the first one are scanned: when all of
   them are continuation bytes (10xxxxxx) the input merely ends inside
   a character and __GCONV_INCOMPLETE_INPUT is reported, otherwise the
   standard error handler is invoked for the offending bytes.

   Software path: decodes a single character.  Lead bytes 0xc2..0xfd
   select a length of 2 to 6 bytes; 0x80..0xc1, 0xfe and 0xff are
   ill-formed.  Wrong continuation bytes and over-long encodings are
   rejected through the `errout' label.  A sequence cut off by the end
   of the input yields __GCONV_INCOMPLETE_INPUT provided the bytes
   seen so far are well-formed.  The decoded value is stored as one
   32-bit word in host byte order.

   Because of the `errout' label BODY can be expanded only once per
   function.  Expects `inptr', `inend', `outptr', `outend' and `result'
   to be in scope and must be expanded inside a loop.  */
#define BODY \
  { \
    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
      { \
        HARDWARE_CONVERT ("cu14 %0, %1, 1"); \
        \
        if (inptr != inend) \
          { \
            int i; \
            for (i = 1; inptr + i < inend; ++i) \
              if ((inptr[i] & 0xc0) != 0x80) \
                break; \
            \
            if (__glibc_likely (inptr + i == inend)) \
              { \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            STANDARD_FROM_LOOP_ERR_HANDLER (i); \
          } \
        continue; \
      } \
    \
    /* Next input byte.  */ \
    uint32_t ch = *inptr; \
    \
    if (__glibc_likely (ch < 0x80)) \
      { \
        /* One byte sequence.  */ \
        ++inptr; \
      } \
    else \
      { \
        uint_fast32_t cnt; \
        uint_fast32_t i; \
        \
        if (ch >= 0xc2 && ch < 0xe0) \
          { \
            /* We expect two bytes.  The first byte cannot be 0xc0 or \
               0xc1, otherwise the wide character could have been \
               represented using a single byte.  */ \
            cnt = 2; \
            ch &= 0x1f; \
          } \
        else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
          { \
            /* We expect three bytes.  */ \
            cnt = 3; \
            ch &= 0x0f; \
          } \
        else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
          { \
            /* We expect four bytes.  */ \
            cnt = 4; \
            ch &= 0x07; \
          } \
        else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
          { \
            /* We expect five bytes.  */ \
            cnt = 5; \
            ch &= 0x03; \
          } \
        else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
          { \
            /* We expect six bytes.  */ \
            cnt = 6; \
            ch &= 0x01; \
          } \
        else \
          { \
            /* Search the end of this ill-formed UTF-8 character.  This \
               is the next byte with (x & 0xc0) != 0x80.  */ \
            i = 0; \
            do \
              ++i; \
            while (inptr + i < inend \
                   && (*(inptr + i) & 0xc0) == 0x80 \
                   && i < 5); \
            \
          errout: \
            STANDARD_FROM_LOOP_ERR_HANDLER (i); \
          } \
        \
        if (__glibc_unlikely (inptr + cnt > inend)) \
          { \
            /* We don't have enough input.  But before we report \
               that check that all the bytes are correct.  */ \
            for (i = 1; inptr + i < inend; ++i) \
              if ((inptr[i] & 0xc0) != 0x80) \
                break; \
            \
            if (__glibc_likely (inptr + i == inend)) \
              { \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            \
            goto errout; \
          } \
        \
        /* Read the possible remaining bytes.  */ \
        for (i = 1; i < cnt; ++i) \
          { \
            uint32_t byte = inptr[i]; \
            \
            if ((byte & 0xc0) != 0x80) \
              /* This is an illegal encoding.  */ \
              break; \
            \
            ch <<= 6; \
            ch |= byte & 0x3f; \
          } \
        \
        /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
           If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
           have been represented with fewer than cnt bytes.  */ \
        if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)) \
          { \
            /* This is an illegal encoding.  */ \
            goto errout; \
          } \
        \
        inptr += cnt; \
      } \
    \
    /* Now adjust the pointers and store the result.  */ \
    *((uint32_t *) outptr) = ch; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
/* Save an incomplete multibyte character in the conversion state.

   Consumes every byte from *inptrp up to inend and records in `state':
     __count       low 8 bits: number of bytes saved;
                   bits 8 and up: total length of the sequence as
                   implied by the lead byte;
     __value.__wch the payload bits collected so far, already shifted
                   left as if the missing continuation bytes had
                   contributed zero bits.

   UNPACK_BYTES performs the reverse operation.  Expects `inptrp',
   `inend' and `state' to be in scope.  */
#define STORE_REST \
  { \
    /* We store the remaining bytes while converting them into the UCS4 \
       format.  We can assume that the first byte in the buffer is \
       correct and that it requires a larger number of bytes than there \
       are in the input buffer.  */ \
    wint_t ch = **inptrp; \
    size_t cnt, r; \
    \
    state->__count = inend - *inptrp; \
    \
    if (ch >= 0xc2 && ch < 0xe0) \
      { \
        /* We expect two bytes.  The first byte cannot be 0xc0 or \
           0xc1, otherwise the wide character could have been \
           represented using a single byte.  */ \
        cnt = 2; \
        ch &= 0x1f; \
      } \
    else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
      { \
        /* We expect three bytes.  */ \
        cnt = 3; \
        ch &= 0x0f; \
      } \
    else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
      { \
        /* We expect four bytes.  */ \
        cnt = 4; \
        ch &= 0x07; \
      } \
    else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
      { \
        /* We expect five bytes.  */ \
        cnt = 5; \
        ch &= 0x03; \
      } \
    else \
      { \
        /* We expect six bytes.  */ \
        cnt = 6; \
        ch &= 0x01; \
      } \
    \
    /* The first byte is already consumed.  */ \
    r = cnt - 1; \
    while (++(*inptrp) < inend) \
      { \
        ch <<= 6; \
        ch |= **inptrp & 0x3f; \
        --r; \
      } \
    \
    /* Shift for the so far missing bytes.  */ \
    ch <<= r * 6; \
    \
    /* Store the number of bytes expected for the entire sequence.  */ \
    state->__count |= cnt << 8; \
    \
    /* Store the value.  */ \
    state->__value.__wch = ch; \
  }
/* Rebuild the bytes of a partially read character from the state
   written by STORE_REST.

   Sets `inlen' to the number of bytes that had been saved (low 8 bits
   of state->__count) and regenerates exactly those bytes in
   bytebuf[0 .. inlen-1]: the lead byte is the length marker taken from
   inmask[] combined with the top payload bits, each continuation byte
   is 0x80 plus six payload bits.

   Expects `state', `inlen' and `bytebuf' to be in scope.  The total
   length stored in the state must be between 2 and 6, since it is used
   (minus 2) as an index into the five-element inmask[].  */
#define UNPACK_BYTES \
  { \
    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
    wint_t wch = state->__value.__wch; \
    size_t ntotal = state->__count >> 8; \
    \
    inlen = state->__count & 255; \
    \
    bytebuf[0] = inmask[ntotal - 2]; \
    \
    do \
      { \
        /* Only positions that were really read get a byte; the \
           payload is shifted in every round regardless.  */ \
        if (--ntotal < inlen) \
          bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
        wch >>= 6; \
      } \
    while (ntotal > 1); \
    \
    bytebuf[0] |= wch; \
  }
/* Forget a partially read character: a zero __count means that no
   bytes are saved in the state.  */
#define CLEAR_STATE \
  state->__count = 0
410 #include <iconv/loop.c>
/* Conversion from UTF-32 internal/BE to UTF-8.  */

#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT TO_LOOP
/* The software routine mimics the S/390 cu41 instruction.

   One expansion of BODY handles one step of the conversion loop.

   Hardware path (HWCAP_S390_ETF3EH set in dl_hwcap): the cu41
   instruction converts as much as it can; input left over afterwards
   is reported as __GCONV_INCOMPLETE_INPUT.
   NOTE(review): validity checking on this path is whatever cu41
   itself performs; confirm that it rejects the complete surrogate
   range like the software path below.

   Software path: reads one 32-bit value in host byte order and writes
   1 to 4 UTF-8 bytes.  Multi-byte results first check the free output
   space and report __GCONV_FULL_OUTPUT when it is too small; the
   one-byte case writes unchecked (presumably the surrounding loop
   guarantees MIN_NEEDED_OUTPUT bytes -- confirm in iconv/loop.c).
   Surrogate code points (0xd800..0xdfff) and values above 0x10ffff
   are not valid Unicode scalar values and go to the standard error
   handler; the surrogate range used to be encoded as an ordinary
   three-byte sequence.

   Expects `inptr', `inend', `outptr', `outend' and `result' to be in
   scope and must be expanded inside a loop.  */
#define BODY \
  { \
    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
      { \
        HARDWARE_CONVERT ("cu41 %0, %1"); \
        \
        if (inptr != inend) \
          { \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
        continue; \
      } \
    \
    uint32_t wc = *((const uint32_t *) inptr); \
    \
    if (__glibc_likely (wc <= 0x7f)) \
      { \
        /* Single UTF-8 char.  */ \
        *outptr = (uint8_t)wc; \
        outptr++; \
      } \
    else if (wc <= 0x7ff) \
      { \
        /* Two UTF-8 chars.  */ \
        if (__glibc_unlikely (outptr + 2 > outend)) \
          { \
            /* Overflow in the output buffer.  */ \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        \
        outptr[0] = 0xc0; \
        outptr[0] |= wc >> 6; \
        \
        outptr[1] = 0x80; \
        outptr[1] |= wc & 0x3f; \
        \
        outptr += 2; \
      } \
    else if (wc <= 0xffff) \
      { \
        /* Three UTF-8 chars.  */ \
        if (__glibc_unlikely (wc >= 0xd800 && wc <= 0xdfff)) \
          { \
            /* Surrogates must not be encoded in UTF-8.  */ \
            STANDARD_TO_LOOP_ERR_HANDLER (4); \
          } \
        \
        if (__glibc_unlikely (outptr + 3 > outend)) \
          { \
            /* Overflow in the output buffer.  */ \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        outptr[0] = 0xe0; \
        outptr[0] |= wc >> 12; \
        \
        outptr[1] = 0x80; \
        outptr[1] |= (wc >> 6) & 0x3f; \
        \
        outptr[2] = 0x80; \
        outptr[2] |= wc & 0x3f; \
        \
        outptr += 3; \
      } \
    else if (wc <= 0x10ffff) \
      { \
        /* Four UTF-8 chars.  */ \
        if (__glibc_unlikely (outptr + 4 > outend)) \
          { \
            /* Overflow in the output buffer.  */ \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        outptr[0] = 0xf0; \
        outptr[0] |= wc >> 18; \
        \
        outptr[1] = 0x80; \
        outptr[1] |= (wc >> 12) & 0x3f; \
        \
        outptr[2] = 0x80; \
        outptr[2] |= (wc >> 6) & 0x3f; \
        \
        outptr[3] = 0x80; \
        outptr[3] |= wc & 0x3f; \
        \
        outptr += 4; \
      } \
    else \
      { \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
509 #include <iconv/loop.c>
511 #include <iconv/skeleton.c>