5158 sed dumps core in new multibyte code
[illumos-gate.git] / usr / src / lib / libc / port / locale / euc.c
blob36aad83ac2ab0174cdc3c8eac142252b369536b1
1 /*
2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
4 * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
5 * Copyright (c) 1993
6 * The Regents of the University of California. All rights reserved.
8 * This code is derived from software contributed to Berkeley by
9 * Paul Borman at Krystal Technologies.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 4. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
36 #include "lint.h"
37 #include <errno.h>
38 #include <limits.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <wchar.h>
42 #include <sys/types.h>
43 #include <sys/euc.h>
44 #include "mblocal.h"
45 #include "lctype.h"
47 static size_t _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD,
48 const char *_RESTRICT_KYWD,
49 size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
50 static size_t _EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t,
51 mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
53 static size_t _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD,
54 const char *_RESTRICT_KYWD,
55 size_t, mbstate_t *_RESTRICT_KYWD);
56 static size_t _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD,
57 const char *_RESTRICT_KYWD,
58 size_t, mbstate_t *_RESTRICT_KYWD);
59 static size_t _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD,
60 const char *_RESTRICT_KYWD,
61 size_t, mbstate_t *_RESTRICT_KYWD);
62 static size_t _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD,
63 const char *_RESTRICT_KYWD,
64 size_t, mbstate_t *_RESTRICT_KYWD);
66 static size_t _EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
67 mbstate_t *_RESTRICT_KYWD);
68 static size_t _EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
69 mbstate_t *_RESTRICT_KYWD);
70 static size_t _EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
71 mbstate_t *_RESTRICT_KYWD);
72 static size_t _EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
73 mbstate_t *_RESTRICT_KYWD);
75 static size_t _EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
76 const char **_RESTRICT_KYWD, size_t, size_t,
77 mbstate_t *_RESTRICT_KYWD);
78 static size_t _EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
79 const char **_RESTRICT_KYWD, size_t, size_t,
80 mbstate_t *_RESTRICT_KYWD);
81 static size_t _EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
82 const char **_RESTRICT_KYWD, size_t, size_t,
83 mbstate_t *_RESTRICT_KYWD);
84 static size_t _EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
85 const char **_RESTRICT_KYWD, size_t, size_t,
86 mbstate_t *_RESTRICT_KYWD);
88 static size_t _EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD,
89 const wchar_t **_RESTRICT_KYWD, size_t, size_t,
90 mbstate_t *_RESTRICT_KYWD);
91 static size_t _EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD,
92 const wchar_t **_RESTRICT_KYWD, size_t, size_t,
93 mbstate_t *_RESTRICT_KYWD);
94 static size_t _EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD,
95 const wchar_t **_RESTRICT_KYWD, size_t, size_t,
96 mbstate_t *_RESTRICT_KYWD);
97 static size_t _EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD,
98 const wchar_t **_RESTRICT_KYWD, size_t, size_t,
99 mbstate_t *_RESTRICT_KYWD);
101 static int _EUC_mbsinit(const mbstate_t *);
/*
 * Conversion state kept inside the caller's mbstate_t (the conversion
 * routines cast the mbstate_t pointer to this type).
 */
typedef struct {
	wchar_t	ch;	/* bytes gathered so far for a partial character */
	int	set;	/* not referenced by the routines in this file */
	int	want;	/* bytes still needed; 0 is the initial state */
} _EucState;
110 _EUC_mbsinit(const mbstate_t *ps)
113 return (ps == NULL || ((const _EucState *)ps)->want == 0);
/*
 * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
 */
119 void
120 _EUC_CN_init(struct lc_ctype *lct)
122 lct->lc_mbrtowc = _EUC_CN_mbrtowc;
123 lct->lc_wcrtomb = _EUC_CN_wcrtomb;
124 lct->lc_mbsnrtowcs = _EUC_CN_mbsnrtowcs;
125 lct->lc_wcsnrtombs = _EUC_CN_wcsnrtombs;
126 lct->lc_mbsinit = _EUC_mbsinit;
128 lct->lc_max_mblen = 4;
129 lct->lc_is_ascii = 0;
132 static size_t
133 _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
134 size_t n, mbstate_t *_RESTRICT_KYWD ps)
136 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
139 static size_t
140 _EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
141 const char **_RESTRICT_KYWD src,
142 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
144 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc));
147 static size_t
148 _EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
149 mbstate_t *_RESTRICT_KYWD ps)
151 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
154 static size_t
155 _EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
156 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
158 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb));
/*
 * EUC-KR uses only CS0 and CS1.
 */
164 void
165 _EUC_KR_init(struct lc_ctype *lct)
167 lct->lc_mbrtowc = _EUC_KR_mbrtowc;
168 lct->lc_wcrtomb = _EUC_KR_wcrtomb;
169 lct->lc_mbsnrtowcs = _EUC_KR_mbsnrtowcs;
170 lct->lc_wcsnrtombs = _EUC_KR_wcsnrtombs;
171 lct->lc_mbsinit = _EUC_mbsinit;
173 lct->lc_max_mblen = 2;
174 lct->lc_is_ascii = 0;
177 static size_t
178 _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
179 size_t n, mbstate_t *_RESTRICT_KYWD ps)
181 return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0));
184 static size_t
185 _EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
186 const char **_RESTRICT_KYWD src,
187 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
189 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc));
192 static size_t
193 _EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
194 mbstate_t *_RESTRICT_KYWD ps)
196 return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0));
199 static size_t
200 _EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
201 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
203 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb));
/*
 * EUC-JP uses CS0, CS1, CS2, and CS3.
 */
209 void
210 _EUC_JP_init(struct lc_ctype *lct)
212 lct->lc_mbrtowc = _EUC_JP_mbrtowc;
213 lct->lc_wcrtomb = _EUC_JP_wcrtomb;
214 lct->lc_mbsnrtowcs = _EUC_JP_mbsnrtowcs;
215 lct->lc_wcsnrtombs = _EUC_JP_wcsnrtombs;
216 lct->lc_mbsinit = _EUC_mbsinit;
218 lct->lc_max_mblen = 3;
219 lct->lc_is_ascii = 0;
222 static size_t
223 _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
224 size_t n, mbstate_t *_RESTRICT_KYWD ps)
226 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
229 static size_t
230 _EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
231 const char **_RESTRICT_KYWD src,
232 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
234 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc));
237 static size_t
238 _EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
239 mbstate_t *_RESTRICT_KYWD ps)
241 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
244 static size_t
245 _EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
246 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
248 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb));
/*
 * EUC-TW uses CS0, CS1, and CS2.
 */
254 void
255 _EUC_TW_init(struct lc_ctype *lct)
257 lct->lc_mbrtowc = _EUC_TW_mbrtowc;
258 lct->lc_wcrtomb = _EUC_TW_wcrtomb;
259 lct->lc_mbsnrtowcs = _EUC_TW_mbsnrtowcs;
260 lct->lc_wcsnrtombs = _EUC_TW_wcsnrtombs;
261 lct->lc_mbsinit = _EUC_mbsinit;
263 lct->lc_max_mblen = 4;
264 lct->lc_is_ascii = 0;
267 static size_t
268 _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
269 size_t n, mbstate_t *_RESTRICT_KYWD ps)
271 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
274 static size_t
275 _EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
276 const char **_RESTRICT_KYWD src,
277 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
279 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc));
282 static size_t
283 _EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
284 mbstate_t *_RESTRICT_KYWD ps)
286 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
289 static size_t
290 _EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
291 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
293 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb));
/*
 * Common EUC code.
 */
300 static size_t
301 _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
302 size_t n, mbstate_t *_RESTRICT_KYWD ps,
303 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
305 _EucState *es;
306 int i, want;
307 wchar_t wc;
308 unsigned char ch;
310 es = (_EucState *)ps;
312 if (es->want < 0 || es->want > MB_CUR_MAX) {
313 errno = EINVAL;
314 return ((size_t)-1);
317 if (s == NULL) {
318 s = "";
319 n = 1;
320 pwc = NULL;
323 if (n == 0)
324 /* Incomplete multibyte sequence */
325 return ((size_t)-2);
327 if (es->want == 0) {
328 /* Fast path for plain ASCII (CS0) */
329 if (((ch = (unsigned char)*s) & 0x80) == 0) {
330 if (pwc != NULL)
331 *pwc = ch;
332 return (ch != '\0' ? 1 : 0);
335 if (ch >= 0xa1) {
336 /* CS1 */
337 want = 2;
338 } else if (ch == cs2) {
339 want = cs2width;
340 } else if (ch == cs3) {
341 want = cs3width;
342 } else {
343 errno = EILSEQ;
344 return ((size_t)-1);
348 es->want = want;
349 es->ch = 0;
350 } else {
351 want = es->want;
352 wc = es->ch;
355 for (i = 0; i < MIN(want, n); i++) {
356 wc <<= 8;
357 wc |= *s;
358 s++;
360 if (i < want) {
361 /* Incomplete multibyte sequence */
362 es->want = want - i;
363 es->ch = wc;
364 return ((size_t)-2);
366 if (pwc != NULL)
367 *pwc = wc;
368 es->want = 0;
369 return (wc == L'\0' ? 0 : want);
372 static size_t
373 _EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc,
374 mbstate_t *_RESTRICT_KYWD ps,
375 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
377 _EucState *es;
378 int i, len;
379 wchar_t nm;
381 es = (_EucState *)ps;
383 if (es->want != 0) {
384 errno = EINVAL;
385 return ((size_t)-1);
388 if (s == NULL)
389 /* Reset to initial shift state (no-op) */
390 return (1);
392 if ((wc & ~0x7f) == 0) {
393 /* Fast path for plain ASCII (CS0) */
394 *s = (char)wc;
395 return (1);
398 /* Determine the "length" */
399 if ((unsigned)wc > 0xffffff) {
400 len = 4;
401 } else if ((unsigned)wc > 0xffff) {
402 len = 3;
403 } else if ((unsigned)wc > 0xff) {
404 len = 2;
405 } else {
406 len = 1;
409 if (len > MB_CUR_MAX) {
410 errno = EILSEQ;
411 return ((size_t)-1);
414 /* This first check excludes CS1, which is implicitly valid. */
415 if ((wc < 0xa100) || (wc > 0xffff)) {
416 /* Check for valid CS2 or CS3 */
417 nm = (wc >> ((len - 1) * 8));
418 if (nm == cs2) {
419 if (len != cs2width) {
420 errno = EILSEQ;
421 return ((size_t)-1);
423 } else if (nm == cs3) {
424 if (len != cs3width) {
425 errno = EILSEQ;
426 return ((size_t)-1);
428 } else {
429 errno = EILSEQ;
430 return ((size_t)-1);
434 /* Stash the bytes, least significant last */
435 for (i = len - 1; i >= 0; i--) {
436 s[i] = (wc & 0xff);
437 wc >>= 8;
439 return (len);