1 /* Conversion between UTF-8 and UTF-16.
3 This module uses the Z9-109 variants of the Convert Unicode instructions.
5 Copyright (C) 1997-2015 Free Software Foundation, Inc.
7 Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
8 Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997.
10 Thanks to Daniel Appich who covered the relevant performance work
11 in his diploma thesis.
13 This is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Lesser General Public
15 License as published by the Free Software Foundation; either
16 version 2.1 of the License, or (at your option) any later version.
18 This is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Lesser General Public License for more details.
23 You should have received a copy of the GNU Lesser General Public
24 License along with the GNU C Library; if not, see
25 <http://www.gnu.org/licenses/>. */
30 #include <dl-procinfo.h>
33 /* UTF-16 big endian byte order mark. */
34 #define BOM_UTF16 0xfeff
/* Bytes needed per character on each side.  The "FROM" side is UTF-8 and
   the "TO" side is UTF-16, as the loop names and FROM_DIRECTION below
   show.  */
38 #define MIN_NEEDED_FROM 1
39 #define MAX_NEEDED_FROM 4
40 #define MIN_NEEDED_TO 2
41 #define MAX_NEEDED_TO 4
/* Names of the two loop functions, and the predicate that selects the
   from-UTF-8 loop based on the local DIR set up by PREPARE_LOOP.  */
42 #define FROM_LOOP from_utf8_loop
43 #define TO_LOOP to_utf8_loop
44 #define FROM_DIRECTION (dir == from_utf8)
45 #define ONE_DIRECTION 0
/* Per-call setup: read DIR and EMIT_BOM from the struct utf8_data hanging
   off step->__data.  When EMIT_BOM is set, the call is not for internal
   use and the invocation counter is still zero, the byte order mark is
   written to OUTBUF; if fewer than two bytes are free the macro returns
   __GCONV_FULL_OUTPUT instead.
   NOTE(review): the braces and the line advancing OUTBUF are not visible
   in this excerpt.  */
46 #define PREPARE_LOOP \
47 enum direction dir = ((struct utf8_data *) step->__data)->dir; \
48 int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \
50 if (emit_bom && !data->__internal_use \
51 && data->__invocation_counter == 0) \
53 /* Emit the UTF-16 Byte Order Mark. */ \
54 if (__glibc_unlikely (outbuf + 2 > outend)) \
55 return __GCONV_FULL_OUTPUT; \
57 put16u (outbuf, BOM_UTF16); \
61 /* Direction of the transformation. */
/* Module initialisation hook: inspect the charset names in STEP, pick the
   conversion direction and attach a freshly allocated struct utf8_data to
   STEP->__data.

   A byte order mark is requested only when the target name is exactly
   "UTF-16//".  RESULT starts as __GCONV_NOCONV, is set to __GCONV_NOMEM
   once a direction was recognised, and stays there if malloc fails.

   The __max_needed_* fields are filled from the MAX_NEEDED_* constants
   (4 bytes for either encoding); previously they were filled from the
   MIN_NEEDED_* constants and so under-reported the per-character maximum.

   NOTE(review): the statements that assign DIR, store it in NEW_DATA and
   return RESULT are not visible in this excerpt.  */
76 extern int gconv_init (struct __gconv_step
*step
);
78 gconv_init (struct __gconv_step
*step
)
80 /* Determine which direction. */
81 struct utf8_data
*new_data
;
82 enum direction dir
= illegal_dir
;
/* Only the generic "UTF-16//" target gets a byte order mark.  */
86 emit_bom
= (__strcasecmp (step
->__to_name
, "UTF-16//") == 0);
/* UTF-8 -> UTF-16 (generic or explicit big endian target).  */
88 if (__strcasecmp (step
->__from_name
, "ISO-10646/UTF8/") == 0
89 && (__strcasecmp (step
->__to_name
, "UTF-16//") == 0
90 || __strcasecmp (step
->__to_name
, "UTF-16BE//") == 0))
/* UTF-16BE -> UTF-8.  */
94 else if (__strcasecmp (step
->__from_name
, "UTF-16BE//") == 0
95 && __strcasecmp (step
->__to_name
, "ISO-10646/UTF8/") == 0)
100 result
= __GCONV_NOCONV
;
101 if (dir
!= illegal_dir
)
103 new_data
= (struct utf8_data
*) malloc (sizeof (struct utf8_data
));
105 result
= __GCONV_NOMEM
;
106 if (new_data
!= NULL
)
109 new_data
->emit_bom
= emit_bom
;
110 step
->__data
= new_data
;
/* Input is UTF-8 (1..4 bytes), output is UTF-16 (2..4 bytes).  */
112 if (dir
== from_utf8
)
114 step
->__min_needed_from
= MIN_NEEDED_FROM
;
115 step
->__max_needed_from
= MAX_NEEDED_FROM
;
116 step
->__min_needed_to
= MIN_NEEDED_TO
;
117 step
->__max_needed_to
= MAX_NEEDED_TO
;
/* Reverse direction (the `else' line is not visible in this excerpt):
   input is UTF-16 (2..4 bytes), output is UTF-8 (1..4 bytes).  */
121 step
->__min_needed_from
= MIN_NEEDED_TO
;
122 step
->__max_needed_from
= MAX_NEEDED_TO
;
123 step
->__min_needed_to
= MIN_NEEDED_FROM
;
124 step
->__max_needed_to
= MAX_NEEDED_FROM
;
/* The step is marked as not stateful.  */
127 step
->__stateful
= 0;
/* Teardown hook of the module, taking the conversion step as DATA.
   Only the prototype and the definition header are visible in this
   excerpt; presumably the body releases the struct utf8_data that
   gconv_init stored in __data -- TODO confirm against the full source.  */
137 extern void gconv_end (struct __gconv_step
*data
);
139 gconv_end (struct __gconv_step
*data
)
144 /* The macro for the hardware loop. This is used for both
conversion directions. */
/* HARDWARE_CONVERT (INSTRUCTION): run one conversion instruction over the
   current buffers.

   The input pointer, input length, output pointer and output length are
   bound to registers 8, 9, 10 and 11 and initialised from INPTR/INEND and
   OUTPTR/OUTEND.  The asm statement switches the assembler to the z9-109
   machine level (push/pop around it), places INSTRUCTION at local label 0
   and passes the pointers, the lengths and CC as read-write operands.

   RESULT is set to __GCONV_FULL_OUTPUT or __GCONV_ILLEGAL_INPUT on paths
   whose conditions are not visible in this excerpt.
   NOTE(review): the declaration of CC, the branch back to label 0, the
   clobber list and the write-back of the pointers are also not visible
   here -- confirm against the full source.  */
146 #define HARDWARE_CONVERT(INSTRUCTION) \
148 register const unsigned char* pInput asm ("8") = inptr; \
149 register unsigned long long inlen asm ("9") = inend - inptr; \
150 register unsigned char* pOutput asm ("10") = outptr; \
151 register unsigned long long outlen asm("11") = outend - outptr; \
154 asm volatile (".machine push \n\t" \
155 ".machine \"z9-109\" \n\t" \
156 "0: " INSTRUCTION " \n\t" \
157 ".machine pop \n\t" \
160 : "+a" (pOutput), "+a" (pInput), "+d" (cc), \
161 "+d" (outlen), "+d" (inlen) \
171 result = __GCONV_FULL_OUTPUT; \
176 result = __GCONV_ILLEGAL_INPUT; \
181 /* Conversion function from UTF-8 to UTF-16. */
/* Loop parameters for this direction: input is UTF-8 (1 to 4 bytes per
   character), output is UTF-16 (at least 2 bytes), and the generated loop
   function is FROM_LOOP.

   Outline of the loop body as far as it is visible in this excerpt:
   - if the hardware capability bit HWCAP_S390_ETF3EH is set, the cu12
     instruction is run through HARDWARE_CONVERT; leftover input is then
     scanned for continuation bytes and reported as incomplete input or
     handed to the error handler;
   - otherwise the lead byte selects a 1, 2, 3 or 4 byte sequence, the
     continuation bytes are validated, and the result is stored either as
     one UTF-16 unit or, for 4 byte input, as a high/low surrogate pair.

   The single-unit store goes through put16, the same helper the surrogate
   pair path uses, rather than through a uint16_t pointer cast of the byte
   buffer.
   NOTE(review): the macro header and most braces, breaks and pointer
   updates are not visible in this excerpt.  */
183 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
184 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
185 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
186 #define LOOPFCT FROM_LOOP
187 /* The software implementation is based on the code in gconv_simple.c. */
190 if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
192 HARDWARE_CONVERT ("cu12 %0, %1, 1"); \
194 if (inptr != inend) \
197 for (i = 1; inptr + i < inend; ++i) \
198 if ((inptr[i] & 0xc0) != 0x80) \
201 if (__glibc_likely (inptr + i == inend)) \
203 result = __GCONV_INCOMPLETE_INPUT; \
206 STANDARD_FROM_LOOP_ERR_HANDLER (i); \
211 /* Next input byte. */ \
212 uint16_t ch = *inptr; \
214 if (__glibc_likely (ch < 0x80)) \
216 /* One byte sequence. */ \
224 if (ch >= 0xc2 && ch < 0xe0) \
226 /* We expect two bytes. The first byte cannot be 0xc0 \
227 or 0xc1, otherwise the wide character could have been \
228 represented using a single byte. */ \
232 else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
234 /* We expect three bytes. */ \
238 else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
240 /* We expect four bytes. */ \
246 /* Search the end of this ill-formed UTF-8 character. This \
247 is the next byte with (x & 0xc0) != 0x80. */ \
251 while (inptr + i < inend \
252 && (*(inptr + i) & 0xc0) == 0x80 \
256 STANDARD_FROM_LOOP_ERR_HANDLER (i); \
259 if (__glibc_unlikely (inptr + cnt > inend)) \
261 /* We don't have enough input. But before we report \
262 that check that all the bytes are correct. */ \
263 for (i = 1; inptr + i < inend; ++i) \
264 if ((inptr[i] & 0xc0) != 0x80) \
267 if (__glibc_likely (inptr + i == inend)) \
269 result = __GCONV_INCOMPLETE_INPUT; \
278 /* For 4 byte UTF-8 chars two UTF-16 chars (high and \
279 low) are needed. */ \
280 uint16_t zabcd, high, low; \
282 if (__glibc_unlikely (outptr + 4 > outend)) \
284 /* Overflow in the output buffer. */ \
285 result = __GCONV_FULL_OUTPUT; \
289 /* See Principles of Operations cu12. */ \
290 zabcd = (((inptr[0] & 0x7) << 2) | \
291 ((inptr[1] & 0x30) >> 4)) - 1; \
293 /* z-bit must be zero after subtracting 1. */ \
295 STANDARD_FROM_LOOP_ERR_HANDLER (4) \
297 high = (uint16_t)(0xd8 << 8); /* high surrogate id */ \
298 high |= zabcd << 6; /* abcd bits */ \
299 high |= (inptr[1] & 0xf) << 2; /* efgh bits */ \
300 high |= (inptr[2] & 0x30) >> 4; /* ij bits */ \
302 low = (uint16_t)(0xdc << 8); /* low surrogate id */ \
303 low |= ((uint16_t)inptr[2] & 0xc) << 6; /* kl bits */ \
304 low |= (inptr[2] & 0x3) << 6; /* mn bits */ \
305 low |= inptr[3] & 0x3f; /* opqrst bits */ \
307 put16 (outptr, high); \
309 put16 (outptr, low); \
316 /* Read the possible remaining bytes. */ \
317 for (i = 1; i < cnt; ++i) \
319 uint16_t byte = inptr[i]; \
321 if ((byte & 0xc0) != 0x80) \
322 /* This is an illegal encoding. */ \
332 /* Now adjust the pointers and store the result. Use put16, as \
the surrogate pair path does, instead of a type-punned store \
through a uint16_t pointer into the byte buffer. */ \
333 put16 (outptr, ch); \
334 outptr += sizeof (uint16_t); \
337 #define LOOP_NEED_FLAGS
338 #include <iconv/loop.c>
340 /* Conversion from UTF-16 to UTF-8. */
/* Loop parameters for this direction: input is UTF-16 (at least 2 bytes
   per unit), output is UTF-8 (1 to 4 bytes per character), and the
   generated loop function is TO_LOOP.

   Outline of the loop body as far as it is visible in this excerpt:
   - the cu21 hardware path is compiled in but disabled with `if (0)';
   - the software path reads one 16 bit unit and emits a 1, 2 or 3 byte
     sequence, or, for a high surrogate followed by a low surrogate, a
     4 byte sequence; any other unit goes to the error handler;
   - each multi-byte path first checks the remaining output space and
     reports __GCONV_FULL_OUTPUT when it is too small.

   NOTE(review): the macro header, the lead-byte constants of the output
   sequences and most braces, breaks and pointer updates are not visible
   in this excerpt.  */
342 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
343 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
344 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
345 #define LOOPFCT TO_LOOP
346 /* The software routine is based on the functionality of the S/390
347 hardware instruction (cu21) as described in the Principles of
Operation. */
351 /* The hardware instruction currently fails to report an error for \
352 isolated low surrogates so we have to disable the instruction \
353 until this gets resolved. */ \
354 if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */ \
356 HARDWARE_CONVERT ("cu21 %0, %1, 1"); \
357 if (inptr != inend) \
359 /* Check if the third byte is \
360 a valid start of a UTF-16 surrogate. With exactly three \
bytes left the valid indices are 0..2, so the third byte is \
inptr[2]; inptr[3] would be *inend, one past the input. */ \
361 if (inend - inptr == 3 && (inptr[2] & 0xfc) != 0xdc) \
362 STANDARD_TO_LOOP_ERR_HANDLER (3); \
364 result = __GCONV_INCOMPLETE_INPUT; \
370 uint16_t c = get16 (inptr); \
372 if (__glibc_likely (c <= 0x007f)) \
374 /* Single byte UTF-8 char. */ \
375 *outptr = c & 0xff; \
378 else if (c >= 0x0080 && c <= 0x07ff) \
380 /* Two byte UTF-8 char. */ \
382 if (__glibc_unlikely (outptr + 2 > outend)) \
384 /* Overflow in the output buffer. */ \
385 result = __GCONV_FULL_OUTPUT; \
390 outptr[0] |= c >> 6; \
393 outptr[1] |= c & 0x3f; \
397 else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff) \
399 /* Three byte UTF-8 char. */ \
401 if (__glibc_unlikely (outptr + 3 > outend)) \
403 /* Overflow in the output buffer. */ \
404 result = __GCONV_FULL_OUTPUT; \
408 outptr[0] |= c >> 12; \
411 outptr[1] |= (c >> 6) & 0x3f; \
414 outptr[2] |= c & 0x3f; \
418 else if (c >= 0xd800 && c <= 0xdbff) \
420 /* Four byte UTF-8 char. */ \
421 uint16_t low, uvwxy; \
423 if (__glibc_unlikely (outptr + 4 > outend)) \
425 /* Overflow in the output buffer. */ \
426 result = __GCONV_FULL_OUTPUT; \
430 if (__glibc_unlikely (inptr + 2 > inend)) \
432 result = __GCONV_INCOMPLETE_INPUT; \
436 low = get16 (inptr); \
438 if ((low & 0xfc00) != 0xdc00) \
441 STANDARD_TO_LOOP_ERR_HANDLER (2); \
443 uvwxy = ((c >> 6) & 0xf) + 1; \
445 outptr[0] |= uvwxy >> 2; \
448 outptr[1] |= (uvwxy << 4) & 0x30; \
449 outptr[1] |= (c >> 2) & 0x0f; \
452 outptr[2] |= (c & 0x03) << 4; \
453 outptr[2] |= (low >> 6) & 0x0f; \
456 outptr[3] |= low & 0x3f; \
462 STANDARD_TO_LOOP_ERR_HANDLER (2); \
466 #define LOOP_NEED_FLAGS
467 #include <iconv/loop.c>
469 #include <iconv/skeleton.c>