5158 sed dumps core in new multibyte code
[illumos-gate.git] / usr / src / lib / libc / port / locale / gb18030.c
blob232daade50e5d4b760ca41f373d016a4c4f7a698
1 /*
2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright (c) 2002-2004 Tim J. Robbins
5 * All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
 */

/*
 * PRC National Standard GB 18030-2000 encoding of Chinese text.
 *
 * See gb18030(5) for details.
 */
35 #include "lint.h"
36 #include <sys/types.h>
37 #include <errno.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include "mblocal.h"
42 #include "lctype.h"
45 static size_t _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD,
46 const char *_RESTRICT_KYWD,
47 size_t, mbstate_t *_RESTRICT_KYWD);
48 static int _GB18030_mbsinit(const mbstate_t *);
49 static size_t _GB18030_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
50 mbstate_t *_RESTRICT_KYWD);
51 static size_t _GB18030_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
52 const char **_RESTRICT_KYWD, size_t, size_t,
53 mbstate_t *_RESTRICT_KYWD);
54 static size_t _GB18030_wcsnrtombs(char *_RESTRICT_KYWD,
55 const wchar_t **_RESTRICT_KYWD, size_t, size_t,
56 mbstate_t *_RESTRICT_KYWD);
59 typedef struct {
60 int count;
61 uchar_t bytes[4];
62 } _GB18030State;
64 void
65 _GB18030_init(struct lc_ctype *lct)
68 lct->lc_mbrtowc = _GB18030_mbrtowc;
69 lct->lc_wcrtomb = _GB18030_wcrtomb;
70 lct->lc_mbsinit = _GB18030_mbsinit;
71 lct->lc_mbsnrtowcs = _GB18030_mbsnrtowcs;
72 lct->lc_wcsnrtombs = _GB18030_wcsnrtombs;
73 lct->lc_max_mblen = 4;
74 lct->lc_is_ascii = 0;
77 static int
78 _GB18030_mbsinit(const mbstate_t *ps)
81 return (ps == NULL || ((const _GB18030State *)ps)->count == 0);
84 static size_t
85 _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
86 size_t n, mbstate_t *_RESTRICT_KYWD ps)
88 _GB18030State *gs;
89 wchar_t wch;
90 int ch, len, ocount;
91 size_t ncopy;
93 gs = (_GB18030State *)ps;
95 if (gs->count < 0 || gs->count > sizeof (gs->bytes)) {
96 errno = EINVAL;
97 return ((size_t)-1);
100 if (s == NULL) {
101 s = "";
102 n = 1;
103 pwc = NULL;
106 ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof (gs->bytes) - gs->count);
107 (void) memcpy(gs->bytes + gs->count, s, ncopy);
108 ocount = gs->count;
109 gs->count += ncopy;
110 s = (char *)gs->bytes;
111 n = gs->count;
113 if (n == 0)
114 /* Incomplete multibyte sequence */
115 return ((size_t)-2);
118 * Single byte: [00-7f]
119 * Two byte: [81-fe][40-7e,80-fe]
120 * Four byte: [81-fe][30-39][81-fe][30-39]
122 ch = (unsigned char)*s++;
123 if (ch <= 0x7f) {
124 len = 1;
125 wch = ch;
126 } else if (ch >= 0x81 && ch <= 0xfe) {
127 wch = ch;
128 if (n < 2)
129 return ((size_t)-2);
130 ch = (unsigned char)*s++;
131 if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) {
132 wch = (wch << 8) | ch;
133 len = 2;
134 } else if (ch >= 0x30 && ch <= 0x39) {
136 * Strip high bit off the wide character we will
137 * eventually output so that it is positive when
138 * cast to wint_t on 32-bit twos-complement machines.
140 wch = ((wch & 0x7f) << 8) | ch;
141 if (n < 3)
142 return ((size_t)-2);
143 ch = (unsigned char)*s++;
144 if (ch < 0x81 || ch > 0xfe)
145 goto ilseq;
146 wch = (wch << 8) | ch;
147 if (n < 4)
148 return ((size_t)-2);
149 ch = (unsigned char)*s++;
150 if (ch < 0x30 || ch > 0x39)
151 goto ilseq;
152 wch = (wch << 8) | ch;
153 len = 4;
154 } else
155 goto ilseq;
156 } else
157 goto ilseq;
159 if (pwc != NULL)
160 *pwc = wch;
161 gs->count = 0;
162 return (wch == L'\0' ? 0 : len - ocount);
163 ilseq:
164 errno = EILSEQ;
165 return ((size_t)-1);
168 static size_t
169 _GB18030_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
170 mbstate_t *_RESTRICT_KYWD ps)
172 _GB18030State *gs;
173 size_t len;
174 int c;
176 gs = (_GB18030State *)ps;
178 if (gs->count != 0) {
179 errno = EINVAL;
180 return ((size_t)-1);
183 if (s == NULL)
184 /* Reset to initial shift state (no-op) */
185 return (1);
186 if ((wc & ~0x7fffffff) != 0)
187 goto ilseq;
188 if (wc & 0x7f000000) {
189 /* Replace high bit that mbrtowc() removed. */
190 wc |= 0x80000000;
191 c = (wc >> 24) & 0xff;
192 if (c < 0x81 || c > 0xfe)
193 goto ilseq;
194 *s++ = c;
195 c = (wc >> 16) & 0xff;
196 if (c < 0x30 || c > 0x39)
197 goto ilseq;
198 *s++ = c;
199 c = (wc >> 8) & 0xff;
200 if (c < 0x81 || c > 0xfe)
201 goto ilseq;
202 *s++ = c;
203 c = wc & 0xff;
204 if (c < 0x30 || c > 0x39)
205 goto ilseq;
206 *s++ = c;
207 len = 4;
208 } else if (wc & 0x00ff0000)
209 goto ilseq;
210 else if (wc & 0x0000ff00) {
211 c = (wc >> 8) & 0xff;
212 if (c < 0x81 || c > 0xfe)
213 goto ilseq;
214 *s++ = c;
215 c = wc & 0xff;
216 if (c < 0x40 || c == 0x7f || c == 0xff)
217 goto ilseq;
218 *s++ = c;
219 len = 2;
220 } else if (wc <= 0x7f) {
221 *s++ = wc;
222 len = 1;
223 } else
224 goto ilseq;
226 return (len);
227 ilseq:
228 errno = EILSEQ;
229 return ((size_t)-1);
232 static size_t
233 _GB18030_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
234 const char **_RESTRICT_KYWD src, size_t nms, size_t len,
235 mbstate_t *_RESTRICT_KYWD ps)
237 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _GB18030_mbrtowc));
240 static size_t
241 _GB18030_wcsnrtombs(char *_RESTRICT_KYWD dst,
242 const wchar_t **_RESTRICT_KYWD src, size_t nwc, size_t len,
243 mbstate_t *_RESTRICT_KYWD ps)
245 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _GB18030_wcrtomb));