Replace FSF snail mail address with URLs.
[glibc.git] / sysdeps / s390 / s390-64 / utf8-utf16-z9.c
blob9f5917769fe7cd325387fe1018f5fd62bae4f749
1 /* Conversion between UTF-8 and UTF-16 - s390 version.
3 This module uses the Z9-109 variants of the Convert Unicode
4 instructions.
5 Copyright (C) 1997-2009 Free Software Foundation, Inc.
7 Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
8 Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997.
10 Thanks to Daniel Appich who covered the relevant performance work
11 in his diploma thesis.
13 This is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Lesser General Public
15 License as published by the Free Software Foundation; either
16 version 2.1 of the License, or (at your option) any later version.
18 This is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Lesser General Public License for more details.
23 You should have received a copy of the GNU Lesser General Public
24 License along with the GNU C Library; if not, see
25 <http://www.gnu.org/licenses/>. */
27 #include <dlfcn.h>
28 #include <stdint.h>
29 #include <unistd.h>
30 #include <dl-procinfo.h>
31 #include <gconv.h>
33 /* UTF-16 big endian byte order mark. */
34 #define BOM_UTF16 0xfeff
/* DEFINE_INIT and DEFINE_FINI are 0: this file defines gconv_init and
   gconv_end itself (presumably so that <iconv/skeleton.c> does not
   generate its own versions -- confirm against skeleton.c).  */
36 #define DEFINE_INIT 0
37 #define DEFINE_FINI 0
/* Bytes per character on each side.  The "FROM" side is UTF-8 (1 to 4
   bytes); the "TO" side is UTF-16 (2 bytes, or 4 bytes when a surrogate
   pair is needed).  */
38 #define MIN_NEEDED_FROM 1
39 #define MAX_NEEDED_FROM 4
40 #define MIN_NEEDED_TO 2
41 #define MAX_NEEDED_TO 4
/* Names handed to LOOPFCT for the two conversion loops further down, and
   the direction test built on the dir variable that PREPARE_LOOP
   declares (presumably evaluated by skeleton.c -- confirm).  */
42 #define FROM_LOOP from_utf8_loop
43 #define TO_LOOP to_utf8_loop
44 #define FROM_DIRECTION (dir == from_utf8)
/* PREPARE_LOOP copies dir and emit_bom out of the struct utf8_data stored
   in step->__data.  When emit_bom is set, data->__internal_use is clear
   and data->__invocation_counter is 0, it writes BOM_UTF16 at outbuf and
   advances outbuf by 2; if fewer than 2 bytes are left before outend it
   returns __GCONV_FULL_OUTPUT instead.  It relies on step, data, outbuf
   and outend being in scope where it is expanded.
   NOTE(review): in this copy the last line of the macro still ends in a
   continuation backslash and the block opened below is never closed --
   verify against the upstream file.  */
45 #define PREPARE_LOOP \
46 enum direction dir = ((struct utf8_data *) step->__data)->dir; \
47 int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \
49 if (emit_bom && !data->__internal_use \
50 && data->__invocation_counter == 0) \
51 { \
52 /* Emit the UTF-16 Byte Order Mark. */ \
53 if (__builtin_expect (outbuf + 2 > outend, 0)) \
54 return __GCONV_FULL_OUTPUT; \
56 put16u (outbuf, BOM_UTF16); \
57 outbuf += 2; \
/* Which way a conversion step translates.  The numeric values match the
   implicit numbering of the enumerators (0, 1, 2).  */
enum direction
{
  illegal_dir = 0,  /* No supported direction was recognised.  */
  to_utf8 = 1,      /* UTF-16 input, UTF-8 output.  */
  from_utf8 = 2     /* UTF-8 input, UTF-16 output.  */
};
68 struct utf8_data
70 enum direction dir;
71 int emit_bom;
75 extern int gconv_init (struct __gconv_step *step);
76 int
77 gconv_init (struct __gconv_step *step)
79 /* Determine which direction. */
80 struct utf8_data *new_data;
81 enum direction dir = illegal_dir;
82 int emit_bom;
83 int result;
85 emit_bom = (__strcasecmp (step->__to_name, "UTF-16//") == 0);
87 if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0
88 && (__strcasecmp (step->__to_name, "UTF-16//") == 0
89 || __strcasecmp (step->__to_name, "UTF-16BE//") == 0))
91 dir = from_utf8;
93 else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0
94 && __strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0)
96 dir = to_utf8;
99 result = __GCONV_NOCONV;
100 if (dir != illegal_dir)
102 new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data));
104 result = __GCONV_NOMEM;
105 if (new_data != NULL)
107 new_data->dir = dir;
108 new_data->emit_bom = emit_bom;
109 step->__data = new_data;
111 if (dir == from_utf8)
113 step->__min_needed_from = MIN_NEEDED_FROM;
114 step->__max_needed_from = MIN_NEEDED_FROM;
115 step->__min_needed_to = MIN_NEEDED_TO;
116 step->__max_needed_to = MIN_NEEDED_TO;
118 else
120 step->__min_needed_from = MIN_NEEDED_TO;
121 step->__max_needed_from = MIN_NEEDED_TO;
122 step->__min_needed_to = MIN_NEEDED_FROM;
123 step->__max_needed_to = MIN_NEEDED_FROM;
126 step->__stateful = 0;
128 result = __GCONV_OK;
132 return result;
136 extern void gconv_end (struct __gconv_step *data);
137 void
138 gconv_end (struct __gconv_step *data)
140 free (data->__data);
143 /* The macro for the hardware loop. This is used for both
144 directions.

   INSTRUCTION is the assembler text of the conversion instruction; the
   two users in this file pass "cu12 %0, %1, 1" and "cu21 %0, %1, 1".

   The expansion site must provide inptr, inend, outptr, outend and
   result, and must sit inside a loop because both error paths use
   break.  The input pointer and remaining input length are pinned to
   registers 8 and 9, the output pointer and remaining output space to
   registers 10 and 11; %0 and %1 in INSTRUCTION are the output and the
   input pointer operand, respectively.  "jo 0b" branches back to
   re-issue the instruction (presumably the CPU-determined
   partial-completion case -- confirm against the Principles of
   Operation).  Afterwards inptr and outptr are updated from the
   registers, "ipm" places the condition code into cc, and after the
   shift by 28 a value of 1 is reported as __GCONV_FULL_OUTPUT and a
   value of 2 as __GCONV_ILLEGAL_INPUT.  Any other value falls through
   to the code that follows the macro.

   NOTE(review): in this copy the asm statement has only two ':'
   sections, which puts "cc", "memory" in the input-operand position
   rather than the clobber list, and the block braces are absent --
   verify against the upstream file.  */
145 #define HARDWARE_CONVERT(INSTRUCTION) \
147 register const unsigned char* pInput asm ("8") = inptr; \
148 register unsigned long long inlen asm ("9") = inend - inptr; \
149 register unsigned char* pOutput asm ("10") = outptr; \
150 register unsigned long long outlen asm("11") = outend - outptr; \
151 uint64_t cc = 0; \
153 asm volatile (".machine push \n\t" \
154 ".machine \"z9-109\" \n\t" \
155 "0: " INSTRUCTION " \n\t" \
156 ".machine pop \n\t" \
157 " jo 0b \n\t" \
158 " ipm %2 \n" \
159 : "+a" (pOutput), "+a" (pInput), "+d" (cc), \
160 "+d" (outlen), "+d" (inlen) \
162 : "cc", "memory"); \
164 inptr = pInput; \
165 outptr = pOutput; \
166 cc >>= 28; \
168 if (cc == 1) \
170 result = __GCONV_FULL_OUTPUT; \
171 break; \
173 else if (cc == 2) \
175 result = __GCONV_ILLEGAL_INPUT; \
176 break; \
180 /* Conversion function from UTF-8 to UTF-16. */
182 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
183 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
184 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
185 #define LOOPFCT FROM_LOOP
186 /* The software implementation is based on the code in gconv_simple.c. */
187 #define BODY \
189 if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
191 HARDWARE_CONVERT ("cu12 %0, %1, 1"); \
193 if (inptr != inend) \
195 int i; \
196 for (i = 1; inptr + i < inend; ++i) \
197 if ((inptr[i] & 0xc0) != 0x80) \
198 break; \
200 if (__builtin_expect (inptr + i == inend, 1)) \
202 result = __GCONV_INCOMPLETE_INPUT; \
203 break; \
205 STANDARD_FROM_LOOP_ERR_HANDLER (i); \
207 continue; \
210 /* Next input byte. */ \
211 uint16_t ch = *inptr; \
213 if (__builtin_expect (ch < 0x80, 1)) \
215 /* One byte sequence. */ \
216 ++inptr; \
218 else \
220 uint_fast32_t cnt; \
221 uint_fast32_t i; \
223 if (ch >= 0xc2 && ch < 0xe0) \
225 /* We expect two bytes. The first byte cannot be 0xc0 \
226 or 0xc1, otherwise the wide character could have been \
227 represented using a single byte. */ \
228 cnt = 2; \
229 ch &= 0x1f; \
231 else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \
233 /* We expect three bytes. */ \
234 cnt = 3; \
235 ch &= 0x0f; \
237 else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \
239 /* We expect four bytes. */ \
240 cnt = 4; \
241 ch &= 0x07; \
243 else \
245 /* Search the end of this ill-formed UTF-8 character. This \
246 is the next byte with (x & 0xc0) != 0x80. */ \
247 i = 0; \
248 do \
249 ++i; \
250 while (inptr + i < inend \
251 && (*(inptr + i) & 0xc0) == 0x80 \
252 && i < 5); \
254 errout: \
255 STANDARD_FROM_LOOP_ERR_HANDLER (i); \
258 if (__builtin_expect (inptr + cnt > inend, 0)) \
260 /* We don't have enough input. But before we report \
261 that check that all the bytes are correct. */ \
262 for (i = 1; inptr + i < inend; ++i) \
263 if ((inptr[i] & 0xc0) != 0x80) \
264 break; \
266 if (__builtin_expect (inptr + i == inend, 1)) \
268 result = __GCONV_INCOMPLETE_INPUT; \
269 break; \
272 goto errout; \
275 if (cnt == 4) \
277 /* For 4 byte UTF-8 chars two UTF-16 chars (high and \
278 low) are needed. */ \
279 uint16_t zabcd, high, low; \
281 if (__builtin_expect (outptr + 4 > outend, 0)) \
283 /* Overflow in the output buffer. */ \
284 result = __GCONV_FULL_OUTPUT; \
285 break; \
288 /* See Principles of Operations cu12. */ \
289 zabcd = (((inptr[0] & 0x7) << 2) | \
290 ((inptr[1] & 0x30) >> 4)) - 1; \
292 /* z-bit must be zero after subtracting 1. */ \
293 if (zabcd & 0x10) \
294 STANDARD_FROM_LOOP_ERR_HANDLER (4) \
296 high = (uint16_t)(0xd8 << 8); /* high surrogate id */ \
297 high |= zabcd << 6; /* abcd bits */ \
298 high |= (inptr[1] & 0xf) << 2; /* efgh bits */ \
299 high |= (inptr[2] & 0x30) >> 4; /* ij bits */ \
301 low = (uint16_t)(0xdc << 8); /* low surrogate id */ \
302 low |= ((uint16_t)inptr[2] & 0xc) << 6; /* kl bits */ \
303 low |= (inptr[2] & 0x3) << 6; /* mn bits */ \
304 low |= inptr[3] & 0x3f; /* opqrst bits */ \
306 put16 (outptr, high); \
307 outptr += 2; \
308 put16 (outptr, low); \
309 outptr += 2; \
310 inptr += 4; \
311 continue; \
313 else \
315 /* Read the possible remaining bytes. */ \
316 for (i = 1; i < cnt; ++i) \
318 uint16_t byte = inptr[i]; \
320 if ((byte & 0xc0) != 0x80) \
321 /* This is an illegal encoding. */ \
322 break; \
324 ch <<= 6; \
325 ch |= byte & 0x3f; \
327 inptr += cnt; \
331 /* Now adjust the pointers and store the result. */ \
332 *((uint16_t *) outptr) = ch; \
333 outptr += sizeof (uint16_t); \
336 #define LOOP_NEED_FLAGS
337 #include <iconv/loop.c>
339 /* Conversion from UTF-16 to UTF-8. */
341 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
342 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
343 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
344 #define LOOPFCT TO_LOOP
345 /* The software routine is based on the functionality of the S/390
346 hardware instruction (cu21) as described in the Principles of
347 Operation. */
348 #define BODY \
350 /* The hardware instruction currently fails to report an error for \
351 isolated low surrogates so we have to disable the instruction \
352 until this gets resolved. */ \
353 if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */ \
355 HARDWARE_CONVERT ("cu21 %0, %1, 1"); \
356 if (inptr != inend) \
358 /* Check if the third byte is \
359 a valid start of a UTF-16 surrogate. */ \
360 if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc) \
361 STANDARD_TO_LOOP_ERR_HANDLER (3); \
363 result = __GCONV_INCOMPLETE_INPUT; \
364 break; \
366 continue; \
369 uint16_t c = get16 (inptr); \
371 if (__builtin_expect (c <= 0x007f, 1)) \
373 /* Single byte UTF-8 char. */ \
374 *outptr = c & 0xff; \
375 outptr++; \
377 else if (c >= 0x0080 && c <= 0x07ff) \
379 /* Two byte UTF-8 char. */ \
381 if (__builtin_expect (outptr + 2 > outend, 0)) \
383 /* Overflow in the output buffer. */ \
384 result = __GCONV_FULL_OUTPUT; \
385 break; \
388 outptr[0] = 0xc0; \
389 outptr[0] |= c >> 6; \
391 outptr[1] = 0x80; \
392 outptr[1] |= c & 0x3f; \
394 outptr += 2; \
396 else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff) \
398 /* Three byte UTF-8 char. */ \
400 if (__builtin_expect (outptr + 3 > outend, 0)) \
402 /* Overflow in the output buffer. */ \
403 result = __GCONV_FULL_OUTPUT; \
404 break; \
406 outptr[0] = 0xe0; \
407 outptr[0] |= c >> 12; \
409 outptr[1] = 0x80; \
410 outptr[1] |= (c >> 6) & 0x3f; \
412 outptr[2] = 0x80; \
413 outptr[2] |= c & 0x3f; \
415 outptr += 3; \
417 else if (c >= 0xd800 && c <= 0xdbff) \
419 /* Four byte UTF-8 char. */ \
420 uint16_t low, uvwxy; \
422 if (__builtin_expect (outptr + 4 > outend, 0)) \
424 /* Overflow in the output buffer. */ \
425 result = __GCONV_FULL_OUTPUT; \
426 break; \
428 inptr += 2; \
429 if (__builtin_expect (inptr + 2 > inend, 0)) \
431 result = __GCONV_INCOMPLETE_INPUT; \
432 break; \
435 low = get16 (inptr); \
437 if ((low & 0xfc00) != 0xdc00) \
439 inptr -= 2; \
440 STANDARD_TO_LOOP_ERR_HANDLER (2); \
442 uvwxy = ((c >> 6) & 0xf) + 1; \
443 outptr[0] = 0xf0; \
444 outptr[0] |= uvwxy >> 2; \
446 outptr[1] = 0x80; \
447 outptr[1] |= (uvwxy << 4) & 0x30; \
448 outptr[1] |= (c >> 2) & 0x0f; \
450 outptr[2] = 0x80; \
451 outptr[2] |= (c & 0x03) << 4; \
452 outptr[2] |= (low >> 6) & 0x0f; \
454 outptr[3] = 0x80; \
455 outptr[3] |= low & 0x3f; \
457 outptr += 4; \
459 else \
461 STANDARD_TO_LOOP_ERR_HANDLER (2); \
463 inptr += 2; \
465 #define LOOP_NEED_FLAGS
466 #include <iconv/loop.c>
468 #include <iconv/skeleton.c>