1 /* Conversion between UTF-8 and UTF-32 BE/internal.
3 This module uses the Z9-109 variants of the Convert Unicode instructions.
5 Copyright (C) 1997-2015 Free Software Foundation, Inc.
7 Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
8 Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997.
10 Thanks to Daniel Appich who covered the relevant performance work
11 in his diploma thesis.
13 This is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Lesser General Public
15 License as published by the Free Software Foundation; either
16 version 2.1 of the License, or (at your option) any later version.
18 This is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Lesser General Public License for more details.
23 You should have received a copy of the GNU Lesser General Public
24 License along with the GNU C Library; if not, see
25 <http://www.gnu.org/licenses/>. */
30 #include <dl-procinfo.h>
33 /* UTF-32 big endian byte order mark. */
34 #define BOM 0x0000feffu
38 /* These definitions apply to the UTF-8 to UTF-32 direction. The
39 software implementation for UTF-8 still supports multibyte
40 characters up to 6 bytes whereas the hardware variant does not. */
41 #define MIN_NEEDED_FROM 1
42 #define MAX_NEEDED_FROM 6
43 #define MIN_NEEDED_TO 4
44 #define FROM_LOOP from_utf8_loop
45 #define TO_LOOP to_utf8_loop
46 #define FROM_DIRECTION (dir == from_utf8)
47 #define ONE_DIRECTION 0
/* Per-call setup.  Reads the conversion direction and the emit_bom flag
   from the struct utf8_data stored in step->__data.  When emit_bom is
   set, the conversion is not for internal use and this is the first
   invocation (__invocation_counter == 0), the UTF-32 byte order mark is
   written with put32u; if fewer than 4 bytes of output space remain the
   macro returns __GCONV_FULL_OUTPUT instead.  */
48 #define PREPARE_LOOP \
49 enum direction dir = ((struct utf8_data *) step->__data)->dir; \
50 int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \
52 if (emit_bom && !data->__internal_use \
53 && data->__invocation_counter == 0) \
55 /* Emit the Byte Order Mark. */ \
56 if (__glibc_unlikely (outbuf + 4 > outend)) \
57 return __GCONV_FULL_OUTPUT; \
59 put32u (outbuf, BOM); \
63 /* Direction of the transformation. */
/* Set up a conversion step.  The direction is chosen by comparing
   step->__from_name and step->__to_name (case-insensitively) against
   the UTF-8, UTF-32, UTF-32BE and INTERNAL charset names.  For a
   recognised pair a struct utf8_data is allocated, its emit_bom field
   is set, it is stored in step->__data, and the step's min/max byte
   requirements and __stateful flag are filled in.  RESULT starts as
   __GCONV_NOCONV and is changed to __GCONV_NOMEM just before the
   allocation is checked.  */
78 extern int gconv_init (struct __gconv_step
*step
);
80 gconv_init (struct __gconv_step
*step
)
82 /* Determine which direction. */
83 struct utf8_data
*new_data
;
84 enum direction dir
= illegal_dir
;
/* emit_bom is set only when the target is named "UTF-32//"; the
   explicit "UTF-32BE//" and "INTERNAL" targets leave it clear.  */
88 emit_bom
= (__strcasecmp (step
->__to_name
, "UTF-32//") == 0);
/* UTF-8 source with a UTF-32, UTF-32BE or INTERNAL target.  */
90 if (__strcasecmp (step
->__from_name
, "ISO-10646/UTF8/") == 0
91 && (__strcasecmp (step
->__to_name
, "UTF-32//") == 0
92 || __strcasecmp (step
->__to_name
, "UTF-32BE//") == 0
93 || __strcasecmp (step
->__to_name
, "INTERNAL") == 0))
/* UTF-8 target with a UTF-32BE or INTERNAL source.  Note that
   "UTF-32//" is not accepted as a source name here.  */
97 else if (__strcasecmp (step
->__to_name
, "ISO-10646/UTF8/") == 0
98 && (__strcasecmp (step
->__from_name
, "UTF-32BE//") == 0
99 || __strcasecmp (step
->__from_name
, "INTERNAL") == 0))
/* Default result when no supported direction was recognised.  */
104 result
= __GCONV_NOCONV
;
105 if (dir
!= illegal_dir
)
107 new_data
= (struct utf8_data
*) malloc (sizeof (struct utf8_data
));
/* Assume allocation failure until NEW_DATA has been checked.  */
109 result
= __GCONV_NOMEM
;
110 if (new_data
!= NULL
)
113 new_data
->emit_bom
= emit_bom
;
114 step
->__data
= new_data
;
/* UTF-8 is the source: input units of MIN_NEEDED_FROM (1) byte,
   output units of MIN_NEEDED_TO (4) bytes.  */
116 if (dir
== from_utf8
)
118 step
->__min_needed_from
= MIN_NEEDED_FROM
;
/* NOTE(review): the maximum is set from MIN_NEEDED_FROM (1) although
   MAX_NEEDED_FROM is defined as 6 -- confirm this is intended.  */
119 step
->__max_needed_from
= MIN_NEEDED_FROM
;
120 step
->__min_needed_to
= MIN_NEEDED_TO
;
121 step
->__max_needed_to
= MIN_NEEDED_TO
;
/* Remaining direction (the enclosing else is not visible in this
   excerpt): 4-byte input units, UTF-8 output.  */
125 step
->__min_needed_from
= MIN_NEEDED_TO
;
126 step
->__max_needed_from
= MIN_NEEDED_TO
;
127 step
->__min_needed_to
= MIN_NEEDED_FROM
;
/* NOTE(review): the maximum output size is set from MIN_NEEDED_FROM (1)
   although a UTF-8 sequence needs more than one byte and
   MAX_NEEDED_FROM is defined as 6 -- confirm this is intended.  */
128 step
->__max_needed_to
= MIN_NEEDED_FROM
;
/* The step is marked as not stateful.  */
131 step
->__stateful
= 0;
141 extern void gconv_end (struct __gconv_step
*data
);
143 gconv_end (struct __gconv_step
*data
)
148 /* The macro for the hardware loop. This is used for both directions. */
150 #define HARDWARE_CONVERT(INSTRUCTION) \
152 register const unsigned char* pInput asm ("8") = inptr; \
153 register unsigned long long inlen asm ("9") = inend - inptr; \
154 register unsigned char* pOutput asm ("10") = outptr; \
155 register unsigned long long outlen asm("11") = outend - outptr; \
158 asm volatile (".machine push \n\t" \
159 ".machine \"z9-109\" \n\t" \
160 "0: " INSTRUCTION " \n\t" \
161 ".machine pop \n\t" \
164 : "+a" (pOutput), "+a" (pInput), "+d" (cc), \
165 "+d" (outlen), "+d" (inlen) \
175 result = __GCONV_FULL_OUTPUT; \
180 result = __GCONV_ILLEGAL_INPUT; \
185 /* Conversion function from UTF-8 to UTF-32 internal/BE. */
187 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
188 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
189 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
190 #define LOOPFCT FROM_LOOP
191 /* The software routine is copied from gconv_simple.c. */
/* Hardware path: taken only when GLRO (dl_hwcap) has the \
   HWCAP_S390_ETF3EH bit set; it runs the cu14 instruction through \
   HARDWARE_CONVERT. */ \
194 if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
196 HARDWARE_CONVERT ("cu14 %0, %1, 1"); \
/* Input is left over after the hardware conversion.  Scan the bytes \
   after the first one for continuation bytes (10xxxxxx): if the scan \
   reaches the end of the buffer the input is reported as incomplete, \
   otherwise the standard error handler is invoked for I bytes. */ \
198 if (inptr != inend) \
201 for (i = 1; inptr + i < inend; ++i) \
202 if ((inptr[i] & 0xc0) != 0x80) \
205 if (__glibc_likely (inptr + i == inend)) \
207 result = __GCONV_INCOMPLETE_INPUT; \
210 STANDARD_FROM_LOOP_ERR_HANDLER (i); \
215 /* Next input byte. */ \
216 uint32_t ch = *inptr; \
218 if (__glibc_likely (ch < 0x80)) \
220 /* One byte sequence. */ \
228 if (ch >= 0xc2 && ch < 0xe0) \
230 /* We expect two bytes. The first byte cannot be 0xc0 or \
231 0xc1, otherwise the wide character could have been \
232 represented using a single byte. */ \
236 else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
238 /* We expect three bytes. */ \
242 else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
244 /* We expect four bytes. */ \
248 else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
250 /* We expect five bytes. */ \
254 else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
256 /* We expect six bytes. */ \
262 /* Search the end of this ill-formed UTF-8 character. This \
263 is the next byte with (x & 0xc0) != 0x80. */ \
267 while (inptr + i < inend \
268 && (*(inptr + i) & 0xc0) == 0x80 \
272 STANDARD_FROM_LOOP_ERR_HANDLER (i); \
275 if (__glibc_unlikely (inptr + cnt > inend)) \
277 /* We don't have enough input. But before we report \
278 that check that all the bytes are correct. */ \
279 for (i = 1; inptr + i < inend; ++i) \
280 if ((inptr[i] & 0xc0) != 0x80) \
283 if (__glibc_likely (inptr + i == inend)) \
285 result = __GCONV_INCOMPLETE_INPUT; \
292 /* Read the possible remaining bytes. */ \
293 for (i = 1; i < cnt; ++i) \
295 uint32_t byte = inptr[i]; \
297 if ((byte & 0xc0) != 0x80) \
298 /* This is an illegal encoding. */ \
305 /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
306 If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
307 have been represented with fewer than cnt bytes. */ \
308 if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)) \
310 /* This is an illegal encoding. */ \
317 /* Now adjust the pointers and store the result. */ \
/* NOTE(review): the value is stored through a uint32_t cast in host \
   byte order; this presumably relies on the host being big endian \
   and on OUTPTR being usable for a 4-byte store -- confirm. */ \
318 *((uint32_t *) outptr) = ch; \
319 outptr += sizeof (uint32_t); \
321 #define LOOP_NEED_FLAGS
325 /* We store the remaining bytes while converting them into the UCS4 \
326 format. We can assume that the first byte in the buffer is \
327 correct and that it requires a larger number of bytes than there \
328 are in the input buffer. */ \
329 wint_t ch = **inptrp; \
/* __count layout: the low byte holds the number of input bytes that \
   were available; the expected total length is OR-ed in above bit 8 \
   further down. */ \
332 state->__count = inend - *inptrp; \
334 if (ch >= 0xc2 && ch < 0xe0) \
336 /* We expect two bytes. The first byte cannot be 0xc0 or \
337 0xc1, otherwise the wide character could have been \
338 represented using a single byte. */ \
342 else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
344 /* We expect three bytes. */ \
348 else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
350 /* We expect four bytes. */ \
354 else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
356 /* We expect five bytes. */ \
362 /* We expect six bytes. */ \
367 /* The first byte is already consumed. */ \
369 while (++(*inptrp) < inend) \
372 ch |= **inptrp & 0x3f; \
376 /* Shift for the so far missing bytes. */ \
379 /* Store the number of bytes expected for the entire sequence. */ \
380 state->__count |= cnt << 8; \
382 /* Store the value. */ \
383 state->__value.__wch = ch; \
/* UNPACK_BYTES rebuilds the partially read sequence in BYTEBUF from
   the saved state: the total length comes from __count >> 8, the
   number of bytes already seen from __count & 255, and the lead byte
   prefix from INMASK indexed by total length minus two.  */
386 #define UNPACK_BYTES \
388 static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
389 wint_t wch = state->__value.__wch; \
390 size_t ntotal = state->__count >> 8; \
392 inlen = state->__count & 255; \
394 bytebuf[0] = inmask[ntotal - 2]; \
398 if (--ntotal < inlen) \
399 bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
402 while (ntotal > 1); \
407 #define CLEAR_STATE \
410 #include <iconv/loop.c>
412 /* Conversion from UTF-32 internal/BE to UTF-8. */
414 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
415 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
416 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
417 #define LOOPFCT TO_LOOP
418 /* The software routine mimics the S/390 cu41 instruction. */
/* Hardware path: taken only when GLRO (dl_hwcap) has the \
   HWCAP_S390_ETF3EH bit set; it runs the cu41 instruction through \
   HARDWARE_CONVERT. */ \
421 if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
423 HARDWARE_CONVERT ("cu41 %0, %1"); \
/* Input left over after the hardware conversion is reported as \
   incomplete. */ \
425 if (inptr != inend) \
427 result = __GCONV_INCOMPLETE_INPUT; \
/* Software path: read one 32-bit value and pick the UTF-8 length \
   from its range. */ \
433 uint32_t wc = *((const uint32_t *) inptr); \
435 if (__glibc_likely (wc <= 0x7f)) \
437 /* Single UTF-8 char. */ \
438 *outptr = (uint8_t)wc; \
441 else if (wc <= 0x7ff) \
443 /* Two UTF-8 chars. */ \
444 if (__glibc_unlikely (outptr + 2 > outend)) \
446 /* Overflow in the output buffer. */ \
447 result = __GCONV_FULL_OUTPUT; \
/* The payload bits are OR-ed into the output bytes.  NOTE(review): \
   the assignments of the lead/continuation prefixes are presumably \
   on lines not visible in this excerpt -- confirm. */ \
452 outptr[0] |= wc >> 6; \
455 outptr[1] |= wc & 0x3f; \
/* NOTE(review): the visible range checks let the UTF-16 surrogate \
   values 0xd800..0xdfff into this three-byte branch; confirm \
   whether they are rejected elsewhere, since well-formed UTF-8 \
   does not encode them. */ \
459 else if (wc <= 0xffff) \
461 /* Three UTF-8 chars. */ \
462 if (__glibc_unlikely (outptr + 3 > outend)) \
464 /* Overflow in the output buffer. */ \
465 result = __GCONV_FULL_OUTPUT; \
469 outptr[0] |= wc >> 12; \
472 outptr[1] |= (wc >> 6) & 0x3f; \
475 outptr[2] |= wc & 0x3f; \
479 else if (wc <= 0x10ffff) \
481 /* Four UTF-8 chars. */ \
482 if (__glibc_unlikely (outptr + 4 > outend)) \
484 /* Overflow in the output buffer. */ \
485 result = __GCONV_FULL_OUTPUT; \
489 outptr[0] |= wc >> 18; \
492 outptr[1] |= (wc >> 12) & 0x3f; \
495 outptr[2] |= (wc >> 6) & 0x3f; \
498 outptr[3] |= wc & 0x3f; \
/* Presumably the branch for values above 0x10ffff: the standard \
   error handler is invoked for the 4 input bytes. */ \
504 STANDARD_TO_LOOP_ERR_HANDLER (4); \
508 #define LOOP_NEED_FLAGS
509 #include <iconv/loop.c>
511 #include <iconv/skeleton.c>