1 /* Conversion between UTF-8 and UTF-16.
3 This module uses the Z9-109 variants of the Convert Unicode instructions.
5 Copyright (C) 1997-2009 Free Software Foundation, Inc.
7 Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
8 Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997.
10 Thanks to Daniel Appich who covered the relevant performance work
11 in his diploma thesis.
13 This is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Lesser General Public
15 License as published by the Free Software Foundation; either
16 version 2.1 of the License, or (at your option) any later version.
18 This is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Lesser General Public License for more details.
23 You should have received a copy of the GNU Lesser General Public
24 License along with the GNU C Library; if not, see
25 <http://www.gnu.org/licenses/>. */
30 #include <dl-procinfo.h>
33 /* UTF-16 big endian byte order mark. */
34 #define BOM_UTF16 0xfeff
/* Byte counts per character on each side of the conversion.  The "from"
   side is UTF-8 (1 to 4 bytes), the "to" side is UTF-16 (2 or 4 bytes).  */
38 #define MIN_NEEDED_FROM 1
39 #define MAX_NEEDED_FROM 4
40 #define MIN_NEEDED_TO 2
41 #define MAX_NEEDED_TO 4
/* Function names used as LOOPFCT before each inclusion of iconv/loop.c.  */
42 #define FROM_LOOP from_utf8_loop
43 #define TO_LOOP to_utf8_loop
/* True when the step converts from UTF-8.  DIR is the local variable
   declared by PREPARE_LOOP.  */
44 #define FROM_DIRECTION (dir == from_utf8)
/* Loop prologue.  Loads the direction and the BOM flag from the
   struct utf8_data stored in step->__data.  When emit_bom is set,
   data->__internal_use is clear and data->__invocation_counter is 0,
   it writes BOM_UTF16 to outbuf, or returns __GCONV_FULL_OUTPUT if
   fewer than two bytes of output space remain.  */
45 #define PREPARE_LOOP \
46 enum direction dir = ((struct utf8_data *) step->__data)->dir; \
47 int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \
49 if (emit_bom && !data->__internal_use \
50 && data->__invocation_counter == 0) \
52 /* Emit the UTF-16 Byte Order Mark. */ \
53 if (__builtin_expect (outbuf + 2 > outend, 0)) \
54 return __GCONV_FULL_OUTPUT; \
56 put16u (outbuf, BOM_UTF16); \
60 /* Direction of the transformation. */
75 extern int gconv_init (struct __gconv_step
*step
);
77 gconv_init (struct __gconv_step
*step
)
79 /* Determine which direction. */
80 struct utf8_data
*new_data
;
81 enum direction dir
= illegal_dir
;
85 emit_bom
= (__strcasecmp (step
->__to_name
, "UTF-16//") == 0);
87 if (__strcasecmp (step
->__from_name
, "ISO-10646/UTF8/") == 0
88 && (__strcasecmp (step
->__to_name
, "UTF-16//") == 0
89 || __strcasecmp (step
->__to_name
, "UTF-16BE//") == 0))
93 else if (__strcasecmp (step
->__from_name
, "UTF-16BE//") == 0
94 && __strcasecmp (step
->__to_name
, "ISO-10646/UTF8/") == 0)
99 result
= __GCONV_NOCONV
;
100 if (dir
!= illegal_dir
)
102 new_data
= (struct utf8_data
*) malloc (sizeof (struct utf8_data
));
104 result
= __GCONV_NOMEM
;
105 if (new_data
!= NULL
)
108 new_data
->emit_bom
= emit_bom
;
109 step
->__data
= new_data
;
111 if (dir
== from_utf8
)
113 step
->__min_needed_from
= MIN_NEEDED_FROM
;
114 step
->__max_needed_from
= MIN_NEEDED_FROM
;
115 step
->__min_needed_to
= MIN_NEEDED_TO
;
116 step
->__max_needed_to
= MIN_NEEDED_TO
;
120 step
->__min_needed_from
= MIN_NEEDED_TO
;
121 step
->__max_needed_from
= MIN_NEEDED_TO
;
122 step
->__min_needed_to
= MIN_NEEDED_FROM
;
123 step
->__max_needed_to
= MIN_NEEDED_FROM
;
126 step
->__stateful
= 0;
136 extern void gconv_end (struct __gconv_step
*data
);
138 gconv_end (struct __gconv_step
*data
)
143 /* The macro for the hardware loop. This is used for both directions. */
145 #define HARDWARE_CONVERT(INSTRUCTION) \
/* INSTRUCTION is the assembler text of the conversion instruction; it is \
   pasted into the asm statement at local label 0.  */ \
/* Operands are pinned to fixed registers: input address and remaining \
   input length in 8 and 9, output address and remaining output space \
   in 10 and 11.  */ \
147 register const unsigned char* pInput asm ("8") = inptr; \
148 register unsigned long long inlen asm ("9") = inend - inptr; \
149 register unsigned char* pOutput asm ("10") = outptr; \
150 register unsigned long long outlen asm("11") = outend - outptr; \
/* Switch the assembler to z9-109 for the instruction only; the previous \
   machine setting is restored by ".machine pop".  */ \
153 asm volatile (".machine push \n\t" \
154 ".machine \"z9-109\" \n\t" \
155 "0: " INSTRUCTION " \n\t" \
156 ".machine pop \n\t" \
/* All five operands are read-write ("+"): both pointers, both lengths \
   and cc are updated by the asm.  */ \
159 : "+a" (pOutput), "+a" (pInput), "+d" (cc), \
160 "+d" (outlen), "+d" (inlen) \
/* NOTE(review): the lines that choose between the two results below are \
   not visible here; presumably they dispatch on cc -- confirm.  */ \
170 result = __GCONV_FULL_OUTPUT; \
175 result = __GCONV_ILLEGAL_INPUT; \
180 /* Conversion function from UTF-8 to UTF-16. */
182 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
183 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
184 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
185 #define LOOPFCT FROM_LOOP
186 /* The software implementation is based on the code in gconv_simple.c. */
/* Hardware path: taken when the HWCAP_S390_ETF3EH bit is set in \
   dl_hwcap.  */ \
189 if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
191 HARDWARE_CONVERT ("cu12 %0, %1, 1"); \
/* Input is left over after the instruction.  Scan the bytes following \
   the first one: if all of them are continuation bytes (10xxxxxx) the \
   tail is reported as incomplete, otherwise the error handler is \
   invoked with the number of bytes scanned.  */ \
193 if (inptr != inend) \
196 for (i = 1; inptr + i < inend; ++i) \
197 if ((inptr[i] & 0xc0) != 0x80) \
200 if (__builtin_expect (inptr + i == inend, 1)) \
202 result = __GCONV_INCOMPLETE_INPUT; \
205 STANDARD_FROM_LOOP_ERR_HANDLER (i); \
/* Software path: classify the sequence by its lead byte.  */ \
210 /* Next input byte. */ \
211 uint16_t ch = *inptr; \
213 if (__builtin_expect (ch < 0x80, 1)) \
215 /* One byte sequence. */ \
223 if (ch >= 0xc2 && ch < 0xe0) \
225 /* We expect two bytes. The first byte cannot be 0xc0 \
226 or 0xc1, otherwise the wide character could have been \
227 represented using a single byte. */ \
231 else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \
233 /* We expect three bytes. */ \
237 else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \
239 /* We expect four bytes. */ \
245 /* Search the end of this ill-formed UTF-8 character. This \
246 is the next byte with (x & 0xc0) != 0x80. */ \
250 while (inptr + i < inend \
251 && (*(inptr + i) & 0xc0) == 0x80 \
255 STANDARD_FROM_LOOP_ERR_HANDLER (i); \
/* cnt is the expected length of the sequence; it is assigned on lines \
   not visible here.  */ \
258 if (__builtin_expect (inptr + cnt > inend, 0)) \
260 /* We don't have enough input. But before we report \
261 that check that all the bytes are correct. */ \
262 for (i = 1; inptr + i < inend; ++i) \
263 if ((inptr[i] & 0xc0) != 0x80) \
266 if (__builtin_expect (inptr + i == inend, 1)) \
268 result = __GCONV_INCOMPLETE_INPUT; \
277 /* For 4 byte UTF-8 chars two UTF-16 chars (high and \
278 low) are needed. */ \
279 uint16_t zabcd, high, low; \
281 if (__builtin_expect (outptr + 4 > outend, 0)) \
283 /* Overflow in the output buffer. */ \
284 result = __GCONV_FULL_OUTPUT; \
/* zabcd is the 5-bit field taken from the low three bits of byte 0 and \
   bits 5..4 of byte 1, minus one.  Being unsigned, a field of 0 wraps \
   and leaves bit 4 (the z-bit) set.  */ \
288 /* See Principles of Operations cu12. */ \
289 zabcd = (((inptr[0] & 0x7) << 2) | \
290 ((inptr[1] & 0x30) >> 4)) - 1; \
292 /* z-bit must be zero after subtracting 1. */ \
294 STANDARD_FROM_LOOP_ERR_HANDLER (4) \
/* High surrogate: 0xd800 | abcd << 6 | efgh << 2 | ij.  */ \
296 high = (uint16_t)(0xd8 << 8); /* high surrogate id */ \
297 high |= zabcd << 6; /* abcd bits */ \
298 high |= (inptr[1] & 0xf) << 2; /* efgh bits */ \
299 high |= (inptr[2] & 0x30) >> 4; /* ij bits */ \
/* Low surrogate: 0xdc00 | klmn << 6 | opqrst.  */ \
301 low = (uint16_t)(0xdc << 8); /* low surrogate id */ \
302 low |= ((uint16_t)inptr[2] & 0xc) << 6; /* kl bits */ \
303 low |= (inptr[2] & 0x3) << 6; /* mn bits */ \
304 low |= inptr[3] & 0x3f; /* opqrst bits */ \
306 put16 (outptr, high); \
308 put16 (outptr, low); \
315 /* Read the possible remaining bytes. */ \
316 for (i = 1; i < cnt; ++i) \
318 uint16_t byte = inptr[i]; \
320 if ((byte & 0xc0) != 0x80) \
321 /* This is an illegal encoding. */ \
/* NOTE(review): this stores through a uint16_t pointer cast, while the \
   surrogate path above uses put16.  It assumes outptr is suitably \
   aligned and that host byte order is the wanted one -- confirm.  */ \
331 /* Now adjust the pointers and store the result. */ \
332 *((uint16_t *) outptr) = ch; \
333 outptr += sizeof (uint16_t); \
336 #define LOOP_NEED_FLAGS
337 #include <iconv/loop.c>
339 /* Conversion from UTF-16 to UTF-8. */
341 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
342 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
343 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
344 #define LOOPFCT TO_LOOP
345 /* The software routine is based on the functionality of the S/390
346 hardware instruction (cu21) as described in the Principles of Operation. */
350 /* The hardware instruction currently fails to report an error for \
351 isolated low surrogates so we have to disable the instruction \
352 until this gets resolved. */ \
/* The condition is hard-wired to 0, so the hardware path guarded by \
   it is currently dead code.  */ \
353 if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */ \
355 HARDWARE_CONVERT ("cu21 %0, %1, 1"); \
356 if (inptr != inend) \
358 /* Check if the third byte is \
359 a valid start of a UTF-16 surrogate. */ \
/* NOTE(review): when inend - inptr == 3 the valid indices are 0..2, so \
   inptr[3] is the byte at inend, one past the remaining input.  The \
   third byte is inptr[2]; this looks like an off-by-one -- confirm \
   before re-enabling the hardware path.  */ \
360 if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc) \
361 STANDARD_TO_LOOP_ERR_HANDLER (3); \
363 result = __GCONV_INCOMPLETE_INPUT; \
/* Software path: read one UTF-16 unit and pick the UTF-8 length from \
   its value range.  */ \
369 uint16_t c = get16 (inptr); \
371 if (__builtin_expect (c <= 0x007f, 1)) \
373 /* Single byte UTF-8 char. */ \
374 *outptr = c & 0xff; \
377 else if (c >= 0x0080 && c <= 0x07ff) \
379 /* Two byte UTF-8 char. */ \
381 if (__builtin_expect (outptr + 2 > outend, 0)) \
383 /* Overflow in the output buffer. */ \
384 result = __GCONV_FULL_OUTPUT; \
/* The |= stores below add payload bits to output bytes; the lines that \
   first assign those bytes are not visible here.  */ \
389 outptr[0] |= c >> 6; \
392 outptr[1] |= c & 0x3f; \
/* Everything from 0x0800 upwards except the surrogate range \
   0xd800..0xdfff.  */ \
396 else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff) \
398 /* Three byte UTF-8 char. */ \
400 if (__builtin_expect (outptr + 3 > outend, 0)) \
402 /* Overflow in the output buffer. */ \
403 result = __GCONV_FULL_OUTPUT; \
407 outptr[0] |= c >> 12; \
410 outptr[1] |= (c >> 6) & 0x3f; \
413 outptr[2] |= c & 0x3f; \
/* High surrogate: it must be followed by a low surrogate.  */ \
417 else if (c >= 0xd800 && c <= 0xdbff) \
419 /* Four byte UTF-8 char. */ \
420 uint16_t low, uvwxy; \
422 if (__builtin_expect (outptr + 4 > outend, 0)) \
424 /* Overflow in the output buffer. */ \
425 result = __GCONV_FULL_OUTPUT; \
/* NOTE(review): low is read from inptr just like c was, so inptr must \
   be advanced past the high surrogate on a line not visible here -- \
   confirm, and check that it is restored on the early exits.  */ \
429 if (__builtin_expect (inptr + 2 > inend, 0)) \
431 result = __GCONV_INCOMPLETE_INPUT; \
435 low = get16 (inptr); \
/* A second unit outside 0xdc00..0xdfff is not a low surrogate.  */ \
437 if ((low & 0xfc00) != 0xdc00) \
440 STANDARD_TO_LOOP_ERR_HANDLER (2); \
/* uvwxy is bits 9..6 of the high surrogate plus one, a 5-bit value \
   that is spread over output bytes 0 and 1.  */ \
442 uvwxy = ((c >> 6) & 0xf) + 1; \
444 outptr[0] |= uvwxy >> 2; \
447 outptr[1] |= (uvwxy << 4) & 0x30; \
448 outptr[1] |= (c >> 2) & 0x0f; \
451 outptr[2] |= (c & 0x03) << 4; \
452 outptr[2] |= (low >> 6) & 0x0f; \
455 outptr[3] |= low & 0x3f; \
/* Remaining case after the branches above: c is in 0xdc00..0xdfff, a low \
   surrogate with no preceding high surrogate.  */ \
461 STANDARD_TO_LOOP_ERR_HANDLER (2); \
465 #define LOOP_NEED_FLAGS
466 #include <iconv/loop.c>
468 #include <iconv/skeleton.c>