Update copyright dates with scripts/update-copyrights.
[glibc.git] / sysdeps / s390 / s390-64 / utf8-utf16-z9.c
blob1425cb116a41c66efce5f6b91bb459396db99c58
/* Conversion between UTF-8 and UTF-16 - s390 version.

   This module uses the Z9-109 variants of the Convert Unicode
   instructions.
   Copyright (C) 1997-2015 Free Software Foundation, Inc.

   Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
   Based on the work by Ulrich Drepper  <drepper@cygnus.com>, 1997.

   Thanks to Daniel Appich who covered the relevant performance work
   in his diploma thesis.

   This is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   This is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <dlfcn.h>
#include <stdint.h>
#include <unistd.h>
#include <dl-procinfo.h>
#include <gconv.h>
/* UTF-16 big endian byte order mark.  */
#define BOM_UTF16	0xfeff

/* Parameters consumed by the generic conversion skeleton included at
   the end of this file.  The "from" side is UTF-8 (1 to 4 bytes per
   character), the "to" side is UTF-16 (2 or 4 bytes per character).
   gconv_init and gconv_end are provided by this file, hence
   DEFINE_INIT and DEFINE_FINI are 0.  */
#define DEFINE_INIT		0
#define DEFINE_FINI		0
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		4
#define MIN_NEEDED_TO		2
#define MAX_NEEDED_TO		4
#define FROM_LOOP		from_utf8_loop
#define TO_LOOP			to_utf8_loop
/* `dir' is the local variable declared by PREPARE_LOOP.  */
#define FROM_DIRECTION		(dir == from_utf8)
#define ONE_DIRECTION           0
/* Code run by the conversion skeleton before it enters the loop.
   It loads the direction and the BOM flag stored by gconv_init and,
   on the very first invocation of a non-internal conversion that
   requested a BOM, writes the UTF-16 byte order mark.  Returns
   __GCONV_FULL_OUTPUT from the enclosing function when the output
   buffer cannot hold the two BOM bytes.  */
#define PREPARE_LOOP							\
  enum direction dir = ((struct utf8_data *) step->__data)->dir;	\
  int emit_bom = ((struct utf8_data *) step->__data)->emit_bom;		\
									\
  if (emit_bom && !data->__internal_use					\
      && data->__invocation_counter == 0)				\
    {									\
      /* Emit the UTF-16 Byte Order Mark.  */				\
      if (__glibc_unlikely (outbuf + 2 > outend))			\
	return __GCONV_FULL_OUTPUT;					\
									\
      put16u (outbuf, BOM_UTF16);					\
      outbuf += 2;							\
    }
/* Direction of the transformation.  illegal_dir marks a name pair
   that this module does not handle.  */
enum direction
{
  illegal_dir,
  to_utf8,
  from_utf8
};
69 struct utf8_data
71 enum direction dir;
72 int emit_bom;
76 extern int gconv_init (struct __gconv_step *step);
77 int
78 gconv_init (struct __gconv_step *step)
80 /* Determine which direction. */
81 struct utf8_data *new_data;
82 enum direction dir = illegal_dir;
83 int emit_bom;
84 int result;
86 emit_bom = (__strcasecmp (step->__to_name, "UTF-16//") == 0);
88 if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0
89 && (__strcasecmp (step->__to_name, "UTF-16//") == 0
90 || __strcasecmp (step->__to_name, "UTF-16BE//") == 0))
92 dir = from_utf8;
94 else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0
95 && __strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0)
97 dir = to_utf8;
100 result = __GCONV_NOCONV;
101 if (dir != illegal_dir)
103 new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data));
105 result = __GCONV_NOMEM;
106 if (new_data != NULL)
108 new_data->dir = dir;
109 new_data->emit_bom = emit_bom;
110 step->__data = new_data;
112 if (dir == from_utf8)
114 step->__min_needed_from = MIN_NEEDED_FROM;
115 step->__max_needed_from = MIN_NEEDED_FROM;
116 step->__min_needed_to = MIN_NEEDED_TO;
117 step->__max_needed_to = MIN_NEEDED_TO;
119 else
121 step->__min_needed_from = MIN_NEEDED_TO;
122 step->__max_needed_from = MIN_NEEDED_TO;
123 step->__min_needed_to = MIN_NEEDED_FROM;
124 step->__max_needed_to = MIN_NEEDED_FROM;
127 step->__stateful = 0;
129 result = __GCONV_OK;
133 return result;
137 extern void gconv_end (struct __gconv_step *data);
138 void
139 gconv_end (struct __gconv_step *data)
141 free (data->__data);
/* The macro for the hardware loop.  This is used for both
   directions.

   INSTRUCTION is the assembler text of the conversion instruction;
   %0 names the output pointer operand and %1 the input pointer
   operand.  The operands are pinned to registers 8/9 (input address
   and remaining length) and 10/11 (output address and remaining
   length).  The instruction is re-issued as long as `jo' branches;
   afterwards `ipm' copies the condition code into CC, which ends up
   in the low bits after the shift by 28.

   The macro expands to a bare block rather than do { } while (0)
   because the `break' statements must leave the conversion loop that
   surrounds the expansion:
     cc == 1  ->  result = __GCONV_FULL_OUTPUT
     cc == 2  ->  result = __GCONV_ILLEGAL_INPUT
   For any other value execution continues after the macro with inptr
   and outptr updated.  */
#define HARDWARE_CONVERT(INSTRUCTION)					\
  {									\
    register const unsigned char* pInput asm ("8") = inptr;		\
    register unsigned long long inlen asm ("9") = inend - inptr;	\
    register unsigned char* pOutput asm ("10") = outptr;		\
    register unsigned long long outlen asm("11") = outend - outptr;	\
    uint64_t cc = 0;							\
									\
    asm volatile (".machine push \n\t"					\
		  ".machine \"z9-109\" \n\t"				\
		  "0: " INSTRUCTION " \n\t"				\
		  ".machine pop \n\t"					\
		  " jo 0b \n\t"						\
		  " ipm %2 \n"						\
		  : "+a" (pOutput), "+a" (pInput), "+d" (cc),		\
		    "+d" (outlen), "+d" (inlen)				\
		  :							\
		  : "cc", "memory");					\
									\
    inptr = pInput;							\
    outptr = pOutput;							\
    cc >>= 28;								\
									\
    if (cc == 1)							\
      {									\
	result = __GCONV_FULL_OUTPUT;					\
	break;								\
      }									\
    else if (cc == 2)							\
      {									\
	result = __GCONV_ILLEGAL_INPUT;					\
	break;								\
      }									\
  }
/* Conversion function from UTF-8 to UTF-16.  */

#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
#define LOOPFCT			FROM_LOOP
/* The software implementation is based on the code in gconv_simple.c.

   One expansion of BODY converts one UTF-8 character (or, on the
   hardware path, as much of the buffer as the instruction accepts).
   It must be expanded inside a loop: `break' leaves the loop with
   `result' set, `continue' starts the next character.

   The software path rejects ill-formed input through the `errout'
   label: invalid lead bytes, missing or invalid continuation bytes,
   overlong three-byte forms, encoded surrogates and four-byte
   sequences outside U+10000..U+10FFFF.  Truncated but so far valid
   sequences at the end of the buffer yield
   __GCONV_INCOMPLETE_INPUT.  */
#define BODY								\
  {									\
    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)				\
      {									\
	HARDWARE_CONVERT ("cu12 %0, %1, 1");				\
									\
	if (inptr != inend)						\
	  {								\
	    /* Input is left over: find out whether it is merely a	\
	       truncated character or really ill-formed.  */		\
	    int i;							\
	    for (i = 1; inptr + i < inend; ++i)				\
	      if ((inptr[i] & 0xc0) != 0x80)				\
		break;							\
									\
	    if (__glibc_likely (inptr + i == inend))			\
	      {								\
		result = __GCONV_INCOMPLETE_INPUT;			\
		break;							\
	      }								\
	    STANDARD_FROM_LOOP_ERR_HANDLER (i);				\
	  }								\
	continue;							\
      }									\
									\
    /* Next input byte.  */						\
    uint16_t ch = *inptr;						\
									\
    if (__glibc_likely (ch < 0x80))					\
      {									\
	/* One byte sequence.  */					\
	++inptr;							\
      }									\
    else								\
      {									\
	uint_fast32_t cnt;						\
	uint_fast32_t i;						\
									\
	if (ch >= 0xc2 && ch < 0xe0)					\
	  {								\
	    /* We expect two bytes.  The first byte cannot be 0xc0	\
	       or 0xc1, otherwise the wide character could have been	\
	       represented using a single byte.  */			\
	    cnt = 2;							\
	    ch &= 0x1f;							\
	  }								\
	else if (__glibc_likely ((ch & 0xf0) == 0xe0))			\
	  {								\
	    /* We expect three bytes.  */				\
	    cnt = 3;							\
	    ch &= 0x0f;							\
	  }								\
	else if (__glibc_likely ((ch & 0xf8) == 0xf0))			\
	  {								\
	    /* We expect four bytes.  */				\
	    cnt = 4;							\
	    ch &= 0x07;							\
	  }								\
	else								\
	  {								\
	    /* Search the end of this ill-formed UTF-8 character.  This	\
	       is the next byte with (x & 0xc0) != 0x80.  */		\
	    i = 0;							\
	    do								\
	      ++i;							\
	    while (inptr + i < inend					\
		   && (*(inptr + i) & 0xc0) == 0x80			\
		   && i < 5);						\
									\
	  errout:							\
	    /* The handler always ends with break or continue, so	\
	       control never falls out of this block.  */		\
	    STANDARD_FROM_LOOP_ERR_HANDLER (i);				\
	  }								\
									\
	if (__glibc_unlikely (inptr + cnt > inend))			\
	  {								\
	    /* We don't have enough input.  But before we report	\
	       that check that all the bytes are correct.  */		\
	    for (i = 1; inptr + i < inend; ++i)				\
	      if ((inptr[i] & 0xc0) != 0x80)				\
		break;							\
									\
	    if (__glibc_likely (inptr + i == inend))			\
	      {								\
		result = __GCONV_INCOMPLETE_INPUT;			\
		break;							\
	      }								\
									\
	    goto errout;						\
	  }								\
									\
	if (cnt == 4)							\
	  {								\
	    /* For 4 byte UTF-8 chars two UTF-16 chars (high and	\
	       low) are needed.  */					\
	    uint16_t zabcd, high, low;					\
									\
	    if (__glibc_unlikely (outptr + 4 > outend))			\
	      {								\
		/* Overflow in the output buffer.  */			\
		result = __GCONV_FULL_OUTPUT;				\
		break;							\
	      }								\
									\
	    /* All three tail bytes must be continuation bytes;	\
	       otherwise skip up to the first offending byte.  */	\
	    for (i = 1; i < cnt; ++i)					\
	      if ((inptr[i] & 0xc0) != 0x80)				\
		goto errout;						\
									\
	    /* See Principles of Operations cu12.  */			\
	    zabcd = (((inptr[0] & 0x7) << 2) |				\
		     ((inptr[1] & 0x30) >> 4)) - 1;			\
									\
	    /* z-bit must be zero after subtracting 1.  This rejects	\
	       both values below U+10000 and above U+10FFFF.  */	\
	    if (zabcd & 0x10)						\
	      STANDARD_FROM_LOOP_ERR_HANDLER (4)			\
									\
	    high = (uint16_t)(0xd8 << 8);       /* high surrogate id */ \
	    high |= zabcd << 6;                 /* abcd bits */	\
	    high |= (inptr[1] & 0xf) << 2;      /* efgh bits */	\
	    high |= (inptr[2] & 0x30) >> 4;     /* ij bits */		\
									\
	    low = (uint16_t)(0xdc << 8);        /* low surrogate id */	\
	    low |= (inptr[2] & 0xf) << 6;       /* klmn bits */	\
	    low |= inptr[3] & 0x3f;             /* opqrst bits */	\
									\
	    put16 (outptr, high);					\
	    outptr += 2;						\
	    put16 (outptr, low);					\
	    outptr += 2;						\
	    inptr += 4;							\
	    continue;							\
	  }								\
	else								\
	  {								\
	    /* Read the possible remaining bytes.  */			\
	    for (i = 1; i < cnt; ++i)					\
	      {								\
		uint16_t byte = inptr[i];				\
									\
		if ((byte & 0xc0) != 0x80)				\
		  /* This is an illegal encoding.  */			\
		  break;						\
									\
		ch <<= 6;						\
		ch |= byte & 0x3f;					\
	      }								\
									\
	    /* Reject a sequence that stopped early (i < cnt), an	\
	       overlong three-byte form and an encoded surrogate	\
	       instead of emitting a bogus code unit.  */		\
	    if (i < cnt							\
		|| (cnt == 3 && ch < 0x0800)				\
		|| (ch & 0xf800) == 0xd800)				\
	      goto errout;						\
									\
	    inptr += cnt;						\
	  }								\
      }									\
    /* Now adjust the pointers and store the result.  */		\
    put16 (outptr, ch);							\
    outptr += sizeof (uint16_t);					\
  }
337 #define LOOP_NEED_FLAGS
338 #include <iconv/loop.c>
/* Conversion from UTF-16 to UTF-8.  */

#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
#define LOOPFCT			TO_LOOP
/* The software routine is based on the functionality of the S/390
   hardware instruction (cu21) as described in the Principles of
   Operation.

   One expansion of BODY converts one UTF-16 character (one code unit
   or one surrogate pair).  It must be expanded inside a loop: `break'
   leaves the loop with `result' set.  inptr is only advanced once a
   character has been converted completely, so on
   __GCONV_INCOMPLETE_INPUT and __GCONV_FULL_OUTPUT it still points at
   the start of the unconverted character.  */
#define BODY								\
  {									\
    /* The hardware instruction currently fails to report an error for	\
       isolated low surrogates so we have to disable the instruction	\
       until this gets resolved.  */					\
    if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */			\
      {									\
	HARDWARE_CONVERT ("cu21 %0, %1, 1");				\
	if (inptr != inend)						\
	  {								\
	    /* Check if the third byte is				\
	       a valid start of a UTF-16 surrogate.  With three	\
	       bytes left that byte is inptr[2].  */			\
	    if (inend - inptr == 3 && (inptr[2] & 0xfc) != 0xdc)	\
	      STANDARD_TO_LOOP_ERR_HANDLER (3);				\
									\
	    result = __GCONV_INCOMPLETE_INPUT;				\
	    break;							\
	  }								\
	continue;							\
      }									\
									\
    uint16_t c = get16 (inptr);						\
									\
    if (__glibc_likely (c <= 0x007f))					\
      {									\
	/* Single byte UTF-8 char.  */					\
	*outptr = c & 0xff;						\
	outptr++;							\
      }									\
    else if (c >= 0x0080 && c <= 0x07ff)				\
      {									\
	/* Two byte UTF-8 char.  */					\
									\
	if (__glibc_unlikely (outptr + 2 > outend))			\
	  {								\
	    /* Overflow in the output buffer.  */			\
	    result = __GCONV_FULL_OUTPUT;				\
	    break;							\
	  }								\
									\
	outptr[0] = 0xc0;						\
	outptr[0] |= c >> 6;						\
									\
	outptr[1] = 0x80;						\
	outptr[1] |= c & 0x3f;						\
									\
	outptr += 2;							\
      }									\
    else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff)		\
      {									\
	/* Three byte UTF-8 char.  */					\
									\
	if (__glibc_unlikely (outptr + 3 > outend))			\
	  {								\
	    /* Overflow in the output buffer.  */			\
	    result = __GCONV_FULL_OUTPUT;				\
	    break;							\
	  }								\
	outptr[0] = 0xe0;						\
	outptr[0] |= c >> 12;						\
									\
	outptr[1] = 0x80;						\
	outptr[1] |= (c >> 6) & 0x3f;					\
									\
	outptr[2] = 0x80;						\
	outptr[2] |= c & 0x3f;						\
									\
	outptr += 3;							\
      }									\
    else if (c >= 0xd800 && c <= 0xdbff)				\
      {									\
	/* Four byte UTF-8 char.  */					\
	uint16_t low, uvwxy;						\
									\
	if (__glibc_unlikely (outptr + 4 > outend))			\
	  {								\
	    /* Overflow in the output buffer.  */			\
	    result = __GCONV_FULL_OUTPUT;				\
	    break;							\
	  }								\
									\
	/* Both halves of the pair must be available.  Check this	\
	   before moving inptr so that the high surrogate is not	\
	   dropped when the pair is split across two buffers.  */	\
	if (__glibc_unlikely (inptr + 4 > inend))			\
	  {								\
	    result = __GCONV_INCOMPLETE_INPUT;				\
	    break;							\
	  }								\
									\
	low = get16 (inptr + 2);					\
									\
	if ((low & 0xfc00) != 0xdc00)					\
	  {								\
	    /* High surrogate without a low one: skip only the	\
	       high surrogate.  */					\
	    STANDARD_TO_LOOP_ERR_HANDLER (2);				\
	  }								\
									\
	/* Step over the high surrogate; the low one is consumed	\
	   by the common increment below.  */				\
	inptr += 2;							\
									\
	uvwxy = ((c >> 6) & 0xf) + 1;					\
	outptr[0] = 0xf0;						\
	outptr[0] |= uvwxy >> 2;					\
									\
	outptr[1] = 0x80;						\
	outptr[1] |= (uvwxy << 4) & 0x30;				\
	outptr[1] |= (c >> 2) & 0x0f;					\
									\
	outptr[2] = 0x80;						\
	outptr[2] |= (c & 0x03) << 4;					\
	outptr[2] |= (low >> 6) & 0x0f;					\
									\
	outptr[3] = 0x80;						\
	outptr[3] |= low & 0x3f;					\
									\
	outptr += 4;							\
      }									\
    else								\
      {									\
	/* Isolated low surrogate.  */					\
	STANDARD_TO_LOOP_ERR_HANDLER (2);				\
      }									\
    inptr += 2;								\
  }
466 #define LOOP_NEED_FLAGS
467 #include <iconv/loop.c>
469 #include <iconv/skeleton.c>