1 /* Conversion between UTF-16 and UTF-32 BE/internal.
3 This module uses the Z9-109 variants of the Convert Unicode instructions.
5 Copyright (C) 1997-2014 Free Software Foundation, Inc.
7 Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
8 Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997.
10 Thanks to Daniel Appich who covered the relevant performance work
11 in his diploma thesis.
13 This is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Lesser General Public
15 License as published by the Free Software Foundation; either
16 version 2.1 of the License, or (at your option) any later version.
18 This is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Lesser General Public License for more details.
23 You should have received a copy of the GNU Lesser General Public
24 License along with the GNU C Library; if not, see
25 <http://www.gnu.org/licenses/>. */
30 #include <dl-procinfo.h>
33 /* UTF-32 big endian byte order mark. */
34 #define BOM_UTF32 0x0000feffu
36 /* UTF-16 big endian byte order mark. */
37 #define BOM_UTF16 0xfeff
41 #define MIN_NEEDED_FROM 2
42 #define MAX_NEEDED_FROM 4
43 #define MIN_NEEDED_TO 4
44 #define FROM_LOOP from_utf16_loop
45 #define TO_LOOP to_utf16_loop
46 #define FROM_DIRECTION (dir == from_utf16)
47 #define PREPARE_LOOP \
48 enum direction dir = ((struct utf16_data *) step->__data)->dir; \
49 int emit_bom = ((struct utf16_data *) step->__data)->emit_bom; \
51 if (emit_bom && !data->__internal_use \
52 && data->__invocation_counter == 0) \
54 if (dir == to_utf16) \
56 /* Emit the UTF-16 Byte Order Mark. */ \
57 if (__builtin_expect (outbuf + 2 > outend, 0)) \
58 return __GCONV_FULL_OUTPUT; \
60 put16u (outbuf, BOM_UTF16); \
65 /* Emit the UTF-32 Byte Order Mark. */ \
66 if (__builtin_expect (outbuf + 4 > outend, 0)) \
67 return __GCONV_FULL_OUTPUT; \
69 put32u (outbuf, BOM_UTF32); \
74 /* Direction of the transformation. */
89 extern int gconv_init (struct __gconv_step
*step
);
91 gconv_init (struct __gconv_step
*step
)
93 /* Determine which direction. */
94 struct utf16_data
*new_data
;
95 enum direction dir
= illegal_dir
;
99 emit_bom
= (__strcasecmp (step
->__to_name
, "UTF-32//") == 0
100 || __strcasecmp (step
->__to_name
, "UTF-16//") == 0);
102 if (__strcasecmp (step
->__from_name
, "UTF-16BE//") == 0
103 && (__strcasecmp (step
->__to_name
, "UTF-32//") == 0
104 || __strcasecmp (step
->__to_name
, "UTF-32BE//") == 0
105 || __strcasecmp (step
->__to_name
, "INTERNAL") == 0))
109 else if ((__strcasecmp (step
->__to_name
, "UTF-16//") == 0
110 || __strcasecmp (step
->__to_name
, "UTF-16BE//") == 0)
111 && (__strcasecmp (step
->__from_name
, "UTF-32BE//") == 0
112 || __strcasecmp (step
->__from_name
, "INTERNAL") == 0))
117 result
= __GCONV_NOCONV
;
118 if (dir
!= illegal_dir
)
120 new_data
= (struct utf16_data
*) malloc (sizeof (struct utf16_data
));
122 result
= __GCONV_NOMEM
;
123 if (new_data
!= NULL
)
126 new_data
->emit_bom
= emit_bom
;
127 step
->__data
= new_data
;
129 if (dir
== from_utf16
)
131 step
->__min_needed_from
= MIN_NEEDED_FROM
;
132 step
->__max_needed_from
= MIN_NEEDED_FROM
;
133 step
->__min_needed_to
= MIN_NEEDED_TO
;
134 step
->__max_needed_to
= MIN_NEEDED_TO
;
138 step
->__min_needed_from
= MIN_NEEDED_TO
;
139 step
->__max_needed_from
= MIN_NEEDED_TO
;
140 step
->__min_needed_to
= MIN_NEEDED_FROM
;
141 step
->__max_needed_to
= MIN_NEEDED_FROM
;
144 step
->__stateful
= 0;
154 extern void gconv_end (struct __gconv_step
*data
);
156 gconv_end (struct __gconv_step
*data
)
161 /* The macro for the hardware loop. This is used for both directions. */
163 #define HARDWARE_CONVERT(INSTRUCTION) \
165 register const unsigned char* pInput asm ("8") = inptr; \
166 register unsigned long long inlen asm ("9") = inend - inptr; \
167 register unsigned char* pOutput asm ("10") = outptr; \
168 register unsigned long long outlen asm("11") = outend - outptr; \
171 asm volatile (".machine push \n\t" \
172 ".machine \"z9-109\" \n\t" \
173 "0: " INSTRUCTION " \n\t" \
174 ".machine pop \n\t" \
177 : "+a" (pOutput), "+a" (pInput), "+d" (cc), \
178 "+d" (outlen), "+d" (inlen) \
188 result = __GCONV_FULL_OUTPUT; \
193 result = __GCONV_ILLEGAL_INPUT; \
198 /* Conversion function from UTF-16 to UTF-32 internal/BE. */
200 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
201 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
202 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
203 #define LOOPFCT FROM_LOOP
/* The software routine is copied from utf-16.c (minus byte
   swapping).

   Each pass converts one UTF-16 character at INPTR into one 32-bit
   value at OUTPTR.  A unit outside 0xd800..0xdfff is copied as is.  A
   high surrogate must be followed by a low surrogate (0xdc00..0xdfff);
   an isolated low surrogate, or a high surrogate followed by anything
   else, is handed to STANDARD_FROM_LOOP_ERR_HANDLER.  If a high
   surrogate is seen but fewer than four input bytes remain, RESULT is
   set to __GCONV_INCOMPLETE_INPUT and the loop is left.  */
#define BODY \
  {									\
    /* The hardware instruction currently fails to report an error for	\
       isolated low surrogates so we have to disable the instruction	\
       until this gets resolved.  */					\
    if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */			\
      {									\
	HARDWARE_CONVERT ("cu24 %0, %1, 1");				\
	if (inptr != inend)						\
	  {								\
	    /* Check if the third byte is				\
	       a valid start of a UTF-16 surrogate.  With exactly	\
	       three bytes left the valid indices are 0..2, so the	\
	       third byte is inptr[2]; reading inptr[3] would access	\
	       memory behind the input buffer.  */			\
	    if (inend - inptr == 3 && (inptr[2] & 0xfc) != 0xdc)	\
	      STANDARD_FROM_LOOP_ERR_HANDLER (3);			\
									\
	    result = __GCONV_INCOMPLETE_INPUT;				\
	    break;							\
	  }								\
	continue;							\
      }									\
									\
    uint16_t u1 = get16 (inptr);					\
									\
    if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)		\
      {									\
	/* No surrogate.  */						\
	put32 (outptr, u1);						\
	inptr += 2;							\
      }									\
    else								\
      {									\
	/* An isolated low-surrogate was found.  This has to be		\
	   considered ill-formed.  */					\
	if (__builtin_expect (u1 >= 0xdc00, 0))				\
	  {								\
	    STANDARD_FROM_LOOP_ERR_HANDLER (2);				\
	  }								\
	/* It's a surrogate character.  At least the first word says	\
	   it is.  */							\
	if (__builtin_expect (inptr + 4 > inend, 0))			\
	  {								\
	    /* We don't have enough input for another complete input	\
	       character.  */						\
	    result = __GCONV_INCOMPLETE_INPUT;				\
	    break;							\
	  }								\
									\
	inptr += 2;							\
	uint16_t u2 = get16 (inptr);					\
	if (__builtin_expect (u2 < 0xdc00, 0)				\
	    || __builtin_expect (u2 > 0xdfff, 0))			\
	  {								\
	    /* This is no valid second word for a surrogate.  Step	\
	       back so the error handler sees the high surrogate.  */	\
	    inptr -= 2;							\
	    STANDARD_FROM_LOOP_ERR_HANDLER (2);				\
	  }								\
									\
	/* Combine both halves: subtracting 0xd7c0 removes the 0xd800	\
	   tag and adds the 0x10000 offset in one step.  */		\
	put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));		\
	inptr += 2;							\
      }									\
    outptr += 4;							\
  }
268 #define LOOP_NEED_FLAGS
269 #include <iconv/loop.c>
271 /* Conversion from UTF-32 internal/BE to UTF-16. */
273 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
274 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
275 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
276 #define LOOPFCT TO_LOOP
/* The software routine is copied from utf-16.c (minus byte
   swapping).

   Each pass converts one 32-bit value at INPTR into UTF-16 at OUTPTR.
   Values up to 0xd7ff and from 0xe000 to 0xffff are written as a
   single 16-bit unit, values from 0x10000 to 0x10ffff as a surrogate
   pair.  Everything else, including the surrogate code points
   0xd800..0xdfff, is handed to STANDARD_TO_LOOP_ERR_HANDLER.  If a
   surrogate pair does not fit into the output buffer, RESULT is set to
   __GCONV_FULL_OUTPUT and the loop is left.  */
#define BODY \
  {									\
    /* NOTE(review): the software path below rejects surrogate code	\
       points; confirm that the cu42 instruction does the same,	\
       otherwise this hardware path needs to be restricted too.  */	\
    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)				\
      {									\
	HARDWARE_CONVERT ("cu42 %0, %1");				\
									\
	if (inptr != inend)						\
	  {								\
	    result = __GCONV_INCOMPLETE_INPUT;				\
	    break;							\
	  }								\
	continue;							\
      }									\
									\
    uint32_t c = get32 (inptr);						\
									\
    /* 0xd800..0xdfff are surrogate code points and must not appear	\
       as characters, so the upper BMP range starts at 0xe000.  */	\
    if (__builtin_expect (c <= 0xd7ff, 1)				\
	|| (c >= 0xe000 && c <= 0xffff))				\
      {									\
	/* Two UTF-16 chars.  */					\
	put16 (outptr, c);						\
      }									\
    else if (__builtin_expect (c >= 0x10000, 1)				\
	     && __builtin_expect (c <= 0x10ffff, 1))			\
      {									\
	/* Four UTF-16 chars.  */					\
	uint16_t zabcd = ((c & 0x1f0000) >> 16) - 1;			\
	uint16_t out;							\
									\
	/* Generate a surrogate character.  */				\
	if (__builtin_expect (outptr + 4 > outend, 0))			\
	  {								\
	    /* Overflow in the output buffer.  */			\
	    result = __GCONV_FULL_OUTPUT;				\
	    break;							\
	  }								\
									\
	/* High surrogate: tag 0xd800 plus the upper ten bits.  */	\
	out = 0xd800;							\
	out |= (zabcd & 0xff) << 6;					\
	out |= (c >> 10) & 0x3f;					\
	put16 (outptr, out);						\
	outptr += 2;							\
									\
	/* Low surrogate: tag 0xdc00 plus the lower ten bits.  */	\
	out = 0xdc00;							\
	out |= c & 0x3ff;						\
	put16 (outptr, out);						\
      }									\
    else								\
      {									\
	STANDARD_TO_LOOP_ERR_HANDLER (4);				\
      }									\
    outptr += 2;							\
    inptr += 4;								\
  }
333 #define LOOP_NEED_FLAGS
334 #include <iconv/loop.c>
336 #include <iconv/skeleton.c>