Merge commit 'ea01a15a654b9e1c7b37d958f4d1911882ed7781'
[unleashed.git] / kernel / os / kiconv.c
blob4bda7ebfcf5de3947ed1fa12ce477e3b7dc4bbb8
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "%Z%%M% %I% %E% SMI"
29 * Kernel iconv code conversion functions (PSARC/2007/173).
31 * Man pages: kiconv_open(9F), kiconv(9F), kiconv_close(9F), and kiconvstr(9F).
32 * Interface stability: Committed.
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/sysmacros.h>
38 #include <sys/systm.h>
39 #include <sys/debug.h>
40 #include <sys/kmem.h>
41 #include <sys/sunddi.h>
42 #include <sys/ksynch.h>
43 #include <sys/modctl.h>
44 #include <sys/byteorder.h>
45 #include <sys/errno.h>
46 #include <sys/kiconv.h>
47 #include <sys/kiconv_latin1.h>
51 * The following macros indicate ids to the correct code conversion mapping
52 * data tables to use. The actual tables are coming from <sys/kiconv_latin1.h>.
54 #define KICONV_TBLID_1252 (0x00)
55 #define KICONV_TBLID_8859_1 (0x01)
56 #define KICONV_TBLID_8859_15 (0x02)
57 #define KICONV_TBLID_850 (0x03)
59 #define KICONV_MAX_MAPPING_TBLID (0x03)
62 * The following tables are coming from u8_textprep.c. We use them to
63 * check on validity of UTF-8 characters and their bytes.
65 extern const int8_t u8_number_of_bytes[];
66 extern const uint8_t u8_valid_min_2nd_byte[];
67 extern const uint8_t u8_valid_max_2nd_byte[];
71 * The following four functions, open_to_1252(), open_to_88591(),
72 * open_to_885915(), and open_to_850(), are kiconv_open functions from
73 * UTF-8 to corresponding single byte codesets.
75 static void *
76 open_to_1252()
78 kiconv_state_t s;
80 s = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
81 s->id = KICONV_TBLID_1252;
82 s->bom_processed = 0;
84 return ((void *)s);
87 static void *
88 open_to_88591()
90 kiconv_state_t s;
92 s = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
93 s->id = KICONV_TBLID_8859_1;
94 s->bom_processed = 0;
96 return ((void *)s);
99 static void *
100 open_to_885915()
102 kiconv_state_t s;
104 s = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
105 s->id = KICONV_TBLID_8859_15;
106 s->bom_processed = 0;
108 return ((void *)s);
111 static void *
112 open_to_850()
114 kiconv_state_t s;
116 s = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
117 s->id = KICONV_TBLID_850;
118 s->bom_processed = 0;
120 return ((void *)s);
124 * The following four functions, open_fr_1252(), open_fr_88591(),
125 * open_fr_885915(), and open_fr_850(), are kiconv_open functions from
126 * corresponding single byte codesets to UTF-8.
128 static void *
129 open_fr_1252()
131 return ((void *)KICONV_TBLID_1252);
134 static void *
135 open_fr_88591()
137 return ((void *)KICONV_TBLID_8859_1);
140 static void *
141 open_fr_885915()
143 return ((void *)KICONV_TBLID_8859_15);
146 static void *
147 open_fr_850()
149 return ((void *)KICONV_TBLID_850);
/*
 * close_to_sb() is the kiconv_close function for the conversions from UTF-8
 * to the single byte codesets. close_fr_sb() is the kiconv_close function
 * for the conversions from the single byte codesets to UTF-8.
 */
158 static int
159 close_to_sb(void *s)
161 if (! s || s == (void *)-1)
162 return (EBADF);
164 kmem_free(s, sizeof (kiconv_state_data_t));
166 return (0);
169 static int
170 close_fr_sb(void *s)
172 if ((ulong_t)s > KICONV_MAX_MAPPING_TBLID)
173 return (EBADF);
175 return (0);
179 * The following is the common kiconv function for conversions from UTF-8
180 * to single byte codesets.
182 static size_t
183 kiconv_to_sb(void *kcd, char **inbuf, size_t *inbytesleft, char **outbuf,
184 size_t *outbytesleft, int *errno)
186 size_t id;
187 size_t ret_val;
188 uchar_t *ib;
189 uchar_t *oldib;
190 uchar_t *ob;
191 uchar_t *ibtail;
192 uchar_t *obtail;
193 uint32_t u8;
194 size_t i;
195 size_t l;
196 size_t h;
197 size_t init_h;
198 int8_t sz;
199 boolean_t second;
201 /* Check on the kiconv code conversion descriptor. */
202 if (! kcd || kcd == (void *)-1) {
203 *errno = EBADF;
204 return ((size_t)-1);
208 * Get the table id we are going to use for the code conversion
209 * and let's double check on it.
211 id = ((kiconv_state_t)kcd)->id;
212 if (id > KICONV_MAX_MAPPING_TBLID) {
213 *errno = EBADF;
214 return ((size_t)-1);
217 /* If this is a state reset request, process and return. */
218 if (! inbuf || ! (*inbuf)) {
219 ((kiconv_state_t)kcd)->bom_processed = 0;
220 return ((size_t)0);
223 ret_val = 0;
224 ib = (uchar_t *)*inbuf;
225 ob = (uchar_t *)*outbuf;
226 ibtail = ib + *inbytesleft;
227 obtail = ob + *outbytesleft;
230 * The inital high value for the binary search we will be using
231 * shortly is a literal constant as of today but to be future proof,
232 * let's calculate it like the following at here.
234 init_h = sizeof (to_sb_tbl[id]) / sizeof (kiconv_to_sb_tbl_comp_t) - 1;
237 * If we haven't checked on the UTF-8 signature BOM character in
238 * the beginning of the conversion data stream, we check it and if
239 * find one, we skip it since we have no use for it.
241 if (((kiconv_state_t)kcd)->bom_processed == 0 && (ibtail - ib) >= 3 &&
242 *ib == 0xef && *(ib + 1) == 0xbb && *(ib + 2) == 0xbf)
243 ib += 3;
244 ((kiconv_state_t)kcd)->bom_processed = 1;
246 while (ib < ibtail) {
247 sz = u8_number_of_bytes[*ib];
248 if (sz <= 0) {
249 *errno = EILSEQ;
250 ret_val = (size_t)-1;
251 break;
255 * If there is no room to write at the output buffer,
256 * issue E2BIG error.
258 if (ob >= obtail) {
259 *errno = E2BIG;
260 ret_val = (size_t)-1;
261 break;
265 * If it is a 7-bit ASCII character, we don't need to
266 * process further and we just copy the character over.
268 * If not, we collect the character bytes up to four bytes,
269 * validate the bytes, and binary search for the corresponding
270 * single byte codeset character byte. If we find it from
271 * the mapping table, we put that into the output buffer;
272 * otherwise, we put a replacement character instead as
273 * a non-identical conversion.
275 if (sz == 1) {
276 *ob++ = *ib++;
277 continue;
281 * Issue EINVAL error if input buffer has an incomplete
282 * character at the end of the buffer.
284 if ((ibtail - ib) < sz) {
285 *errno = EINVAL;
286 ret_val = (size_t)-1;
287 break;
291 * We collect UTF-8 character bytes and also check if
292 * this is a valid UTF-8 character without any bogus bytes
293 * based on the latest UTF-8 binary representation.
295 oldib = ib;
296 u8 = *ib++;
297 second = B_TRUE;
298 for (i = 1; i < sz; i++) {
299 if (second) {
300 if (*ib < u8_valid_min_2nd_byte[u8] ||
301 *ib > u8_valid_max_2nd_byte[u8]) {
302 *errno = EILSEQ;
303 ret_val = (size_t)-1;
304 ib = oldib;
305 goto TO_SB_ILLEGAL_CHAR_ERR;
307 second = B_FALSE;
308 } else if (*ib < 0x80 || *ib > 0xbf) {
309 *errno = EILSEQ;
310 ret_val = (size_t)-1;
311 ib = oldib;
312 goto TO_SB_ILLEGAL_CHAR_ERR;
314 u8 = (u8 << 8) | ((uint32_t)*ib);
315 ib++;
318 i = l = 0;
319 h = init_h;
320 while (l <= h) {
321 i = (l + h) / 2;
322 if (to_sb_tbl[id][i].u8 == u8)
323 break;
324 else if (to_sb_tbl[id][i].u8 < u8)
325 l = i + 1;
326 else
327 h = i - 1;
330 if (to_sb_tbl[id][i].u8 == u8) {
331 *ob++ = to_sb_tbl[id][i].sb;
332 } else {
334 * If we don't find a character in the target
335 * codeset, we insert an ASCII replacement character
336 * at the output buffer and indicate such
337 * "non-identical" conversion by increasing the
338 * return value which is the non-identical conversion
339 * counter if bigger than 0.
341 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
342 ret_val++;
346 TO_SB_ILLEGAL_CHAR_ERR:
347 *inbuf = (char *)ib;
348 *inbytesleft = ibtail - ib;
349 *outbuf = (char *)ob;
350 *outbytesleft = obtail - ob;
352 return (ret_val);
356 * The following is the common kiconv function from single byte codesets to
357 * UTF-8.
359 static size_t
360 kiconv_fr_sb(void *kcd, char **inbuf, size_t *inbytesleft, char **outbuf,
361 size_t *outbytesleft, int *errno)
363 size_t ret_val;
364 uchar_t *ib;
365 uchar_t *ob;
366 uchar_t *ibtail;
367 uchar_t *obtail;
368 size_t i;
369 size_t k;
370 int8_t sz;
372 /* Check on the kiconv code conversion descriptor validity. */
373 if ((ulong_t)kcd > KICONV_MAX_MAPPING_TBLID) {
374 *errno = EBADF;
375 return ((size_t)-1);
379 * If this is a state reset request, there is nothing to do and so
380 * we just return.
382 if (! inbuf || ! (*inbuf))
383 return ((size_t)0);
385 ret_val = 0;
386 ib = (uchar_t *)*inbuf;
387 ob = (uchar_t *)*outbuf;
388 ibtail = ib + *inbytesleft;
389 obtail = ob + *outbytesleft;
391 while (ib < ibtail) {
393 * If this is a 7-bit ASCII character, we just copy over and
394 * that's all we need to do for this character.
396 if (*ib < 0x80) {
397 if (ob >= obtail) {
398 *errno = E2BIG;
399 ret_val = (size_t)-1;
400 break;
403 *ob++ = *ib++;
404 continue;
408 * Otherwise, we get the corresponding UTF-8 character bytes
409 * from the mapping table and copy them over.
411 * We don't need to worry about if the UTF-8 character bytes
412 * at the mapping tables are valid or not since they are good.
414 k = *ib - 0x80;
415 sz = u8_number_of_bytes[to_u8_tbl[(ulong_t)kcd][k].u8[0]];
418 * If sz <= 0, that means we don't have any assigned character
419 * at the code point, k + 0x80, of the single byte codeset
420 * which is the fromcode. In other words, the input buffer
421 * has an illegal character.
423 if (sz <= 0) {
424 *errno = EILSEQ;
425 ret_val = (size_t)-1;
426 break;
429 if ((obtail - ob) < sz) {
430 *errno = E2BIG;
431 ret_val = (size_t)-1;
432 break;
435 for (i = 0; i < sz; i++)
436 *ob++ = to_u8_tbl[(ulong_t)kcd][k].u8[i];
438 ib++;
441 *inbuf = (char *)ib;
442 *inbytesleft = ibtail - ib;
443 *outbuf = (char *)ob;
444 *outbytesleft = obtail - ob;
446 return (ret_val);
450 * The following is the common kiconvstr function from UTF-8 to single byte
451 * codesets.
453 static size_t
454 kiconvstr_to_sb(size_t id, uchar_t *ib, size_t *inlen, uchar_t *ob,
455 size_t *outlen, int flag, int *errno)
457 size_t ret_val;
458 uchar_t *oldib;
459 uchar_t *ibtail;
460 uchar_t *obtail;
461 uint32_t u8;
462 size_t i;
463 size_t l;
464 size_t h;
465 size_t init_h;
466 int8_t sz;
467 boolean_t second;
468 boolean_t do_not_ignore_null;
470 /* Let's make sure that the table id is within the valid boundary. */
471 if (id > KICONV_MAX_MAPPING_TBLID) {
472 *errno = EBADF;
473 return ((size_t)-1);
476 ret_val = 0;
477 ibtail = ib + *inlen;
478 obtail = ob + *outlen;
479 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
480 init_h = sizeof (to_sb_tbl[id]) / sizeof (kiconv_to_sb_tbl_comp_t) - 1;
482 /* Skip any UTF-8 signature BOM character in the beginning. */
483 if ((ibtail - ib) >= 3 && *ib == 0xef && *(ib + 1) == 0xbb &&
484 *(ib + 2) == 0xbf)
485 ib += 3;
488 * Basically this is pretty much the same as kiconv_to_sb() except
489 * that we are now accepting two flag values and doing the processing
490 * accordingly.
492 while (ib < ibtail) {
493 sz = u8_number_of_bytes[*ib];
494 if (sz <= 0) {
495 if (flag & KICONV_REPLACE_INVALID) {
496 if (ob >= obtail) {
497 *errno = E2BIG;
498 ret_val = (size_t)-1;
499 break;
502 ib++;
503 goto STR_TO_SB_REPLACE_INVALID;
506 *errno = EILSEQ;
507 ret_val = (size_t)-1;
508 break;
511 if (*ib == '\0' && do_not_ignore_null)
512 break;
514 if (ob >= obtail) {
515 *errno = E2BIG;
516 ret_val = (size_t)-1;
517 break;
520 if (sz == 1) {
521 *ob++ = *ib++;
522 continue;
525 if ((ibtail - ib) < sz) {
526 if (flag & KICONV_REPLACE_INVALID) {
527 ib = ibtail;
528 goto STR_TO_SB_REPLACE_INVALID;
531 *errno = EINVAL;
532 ret_val = (size_t)-1;
533 break;
536 oldib = ib;
537 u8 = *ib++;
538 second = B_TRUE;
539 for (i = 1; i < sz; i++) {
540 if (second) {
541 if (*ib < u8_valid_min_2nd_byte[u8] ||
542 *ib > u8_valid_max_2nd_byte[u8]) {
543 if (flag & KICONV_REPLACE_INVALID) {
544 ib = oldib + sz;
545 goto STR_TO_SB_REPLACE_INVALID;
548 *errno = EILSEQ;
549 ret_val = (size_t)-1;
550 ib = oldib;
551 goto STR_TO_SB_ILLEGAL_CHAR_ERR;
553 second = B_FALSE;
554 } else if (*ib < 0x80 || *ib > 0xbf) {
555 if (flag & KICONV_REPLACE_INVALID) {
556 ib = oldib + sz;
557 goto STR_TO_SB_REPLACE_INVALID;
560 *errno = EILSEQ;
561 ret_val = (size_t)-1;
562 ib = oldib;
563 goto STR_TO_SB_ILLEGAL_CHAR_ERR;
565 u8 = (u8 << 8) | ((uint32_t)*ib);
566 ib++;
569 i = l = 0;
570 h = init_h;
571 while (l <= h) {
572 i = (l + h) / 2;
573 if (to_sb_tbl[id][i].u8 == u8)
574 break;
575 else if (to_sb_tbl[id][i].u8 < u8)
576 l = i + 1;
577 else
578 h = i - 1;
581 if (to_sb_tbl[id][i].u8 == u8) {
582 *ob++ = to_sb_tbl[id][i].sb;
583 } else {
584 STR_TO_SB_REPLACE_INVALID:
585 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
586 ret_val++;
590 STR_TO_SB_ILLEGAL_CHAR_ERR:
591 *inlen = ibtail - ib;
592 *outlen = obtail - ob;
594 return (ret_val);
/*
 * The following four functions are the kiconvstr entry points recorded in
 * the conv_list[] defined below.
 */
601 static size_t
602 kiconvstr_to_1252(char *inarray, size_t *inlen, char *outarray,
603 size_t *outlen, int flag, int *errno)
605 return (kiconvstr_to_sb(KICONV_TBLID_1252, (uchar_t *)inarray,
606 inlen, (uchar_t *)outarray, outlen, flag, errno));
609 static size_t
610 kiconvstr_to_1(char *inarray, size_t *inlen, char *outarray,
611 size_t *outlen, int flag, int *errno)
613 return (kiconvstr_to_sb(KICONV_TBLID_8859_1, (uchar_t *)inarray,
614 inlen, (uchar_t *)outarray, outlen, flag, errno));
617 static size_t
618 kiconvstr_to_15(char *inarray, size_t *inlen, char *outarray,
619 size_t *outlen, int flag, int *errno)
621 return (kiconvstr_to_sb(KICONV_TBLID_8859_15, (uchar_t *)inarray,
622 inlen, (uchar_t *)outarray, outlen, flag, errno));
625 static size_t
626 kiconvstr_to_850(char *inarray, size_t *inlen, char *outarray,
627 size_t *outlen, int flag, int *errno)
629 return (kiconvstr_to_sb(KICONV_TBLID_850, (uchar_t *)inarray,
630 inlen, (uchar_t *)outarray, outlen, flag, errno));
634 * The following is the common kiconvstr function for conversions from
635 * single byte codesets to UTF-8.
637 static size_t
638 kiconvstr_fr_sb(size_t id, uchar_t *ib, size_t *inlen, uchar_t *ob,
639 size_t *outlen, int flag, int *errno)
641 size_t ret_val;
642 uchar_t *ibtail;
643 uchar_t *obtail;
644 size_t i;
645 size_t k;
646 int8_t sz;
647 boolean_t do_not_ignore_null;
649 ret_val = 0;
650 ibtail = ib + *inlen;
651 obtail = ob + *outlen;
652 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
654 while (ib < ibtail) {
655 if (*ib == '\0' && do_not_ignore_null)
656 break;
658 if (*ib < 0x80) {
659 if (ob >= obtail) {
660 *errno = E2BIG;
661 ret_val = (size_t)-1;
662 break;
664 *ob++ = *ib++;
665 continue;
668 k = *ib - 0x80;
669 sz = u8_number_of_bytes[to_u8_tbl[id][k].u8[0]];
671 if (sz <= 0) {
672 if (flag & KICONV_REPLACE_INVALID) {
673 if ((obtail - ob) < 3) {
674 *errno = E2BIG;
675 ret_val = (size_t)-1;
676 break;
679 /* Save KICONV_UTF8_REPLACEMENT_CHAR. */
680 *ob++ = 0xef;
681 *ob++ = 0xbf;
682 *ob++ = 0xbd;
683 ret_val++;
684 ib++;
686 continue;
689 *errno = EILSEQ;
690 ret_val = (size_t)-1;
691 break;
694 if ((obtail - ob) < sz) {
695 *errno = E2BIG;
696 ret_val = (size_t)-1;
697 break;
700 for (i = 0; i < sz; i++)
701 *ob++ = to_u8_tbl[id][k].u8[i];
703 ib++;
706 *inlen = ibtail - ib;
707 *outlen = obtail - ob;
709 return (ret_val);
/*
 * The following four functions are also kiconvstr entry points recorded in
 * the conv_list[] below.
 */
716 static size_t
717 kiconvstr_fr_1252(char *inarray, size_t *inlen, char *outarray,
718 size_t *outlen, int flag, int *errno)
720 return (kiconvstr_fr_sb(KICONV_TBLID_1252, (uchar_t *)inarray,
721 inlen, (uchar_t *)outarray, outlen, flag, errno));
724 static size_t
725 kiconvstr_fr_1(char *inarray, size_t *inlen, char *outarray,
726 size_t *outlen, int flag, int *errno)
728 return (kiconvstr_fr_sb(KICONV_TBLID_8859_1, (uchar_t *)inarray,
729 inlen, (uchar_t *)outarray, outlen, flag, errno));
732 static size_t
733 kiconvstr_fr_15(char *inarray, size_t *inlen, char *outarray,
734 size_t *outlen, int flag, int *errno)
736 return (kiconvstr_fr_sb(KICONV_TBLID_8859_15, (uchar_t *)inarray,
737 inlen, (uchar_t *)outarray, outlen, flag, errno));
740 static size_t
741 kiconvstr_fr_850(char *inarray, size_t *inlen, char *outarray,
742 size_t *outlen, int flag, int *errno)
744 return (kiconvstr_fr_sb(KICONV_TBLID_850, (uchar_t *)inarray,
745 inlen, (uchar_t *)outarray, outlen, flag, errno));
749 * The following static vector contains the normalized code names
750 * and their corresponding code ids. They are somewhat arbitrarily ordered
751 * based on marketing data available. A code id could repeat for aliases.
753 * The vector was generated by using a small utility program called
754 * codeidlistgen.c that you can find from PSARC/2007/173/materials/util/.
756 * The code ids must be portable, i.e., if needed, you can always generate
757 * the code_list[] again with different code ids. You'll also need to
758 * update the conv_list[] at below.
760 #define KICONV_MAX_CODEID_ENTRY 68
761 #define KICONV_MAX_CODEID 42
763 static kiconv_code_list_t code_list[KICONV_MAX_CODEID_ENTRY] = {
764 { "utf8", 0 },
765 { "cp1252", 1 },
766 { "1252", 1 },
767 { "iso88591", 2 },
768 { "iso885915", 3 },
769 { "cp850", 4 },
770 { "850", 4 },
771 { "eucjp", 5 },
772 { "eucjpms", 6 },
773 { "cp932", 7 },
774 { "932", 7 },
775 { "shiftjis", 8 },
776 { "pck", 8 },
777 { "sjis", 8 },
778 { "gb18030", 9 },
779 { "gbk", 10 },
780 { "cp936", 10 },
781 { "936", 10 },
782 { "euccn", 11 },
783 { "euckr", 12 },
784 { "unifiedhangul", 13 },
785 { "cp949", 13 },
786 { "949", 13 },
787 { "big5", 14 },
788 { "cp950", 14 },
789 { "950", 14 },
790 { "big5hkscs", 15 },
791 { "euctw", 16 },
792 { "cp950hkscs", 17 },
793 { "cp1250", 18 },
794 { "1250", 18 },
795 { "iso88592", 19 },
796 { "cp852", 20 },
797 { "852", 20 },
798 { "cp1251", 21 },
799 { "1251", 21 },
800 { "iso88595", 22 },
801 { "koi8r", 23 },
802 { "cp866", 24 },
803 { "866", 24 },
804 { "cp1253", 25 },
805 { "1253", 25 },
806 { "iso88597", 26 },
807 { "cp737", 27 },
808 { "737", 27 },
809 { "cp1254", 28 },
810 { "1254", 28 },
811 { "iso88599", 29 },
812 { "cp857", 30 },
813 { "857", 30 },
814 { "cp1256", 31 },
815 { "1256", 31 },
816 { "iso88596", 32 },
817 { "cp720", 33 },
818 { "720", 33 },
819 { "cp1255", 34 },
820 { "1255", 34 },
821 { "iso88598", 35 },
822 { "cp862", 36 },
823 { "862", 36 },
824 { "cp1257", 37 },
825 { "1257", 37 },
826 { "iso885913", 38 },
827 { "iso885910", 39 },
828 { "iso885911", 40 },
829 { "tis620", 40 },
830 { "iso88593", 41 },
831 { "iso88594", 42 },
835 * The list of code conversions supported are grouped together per
836 * module which will be loaded as needed.
838 #define KICONV_MAX_CONVERSIONS 84
840 static kiconv_conv_list_t conv_list[KICONV_MAX_CONVERSIONS] = {
841 /* Embedded code conversions: */
843 1, 0, KICONV_EMBEDDED,
844 open_to_1252, kiconv_to_sb, close_to_sb, kiconvstr_to_1252
847 0, 1, KICONV_EMBEDDED,
848 open_fr_1252, kiconv_fr_sb, close_fr_sb, kiconvstr_fr_1252
851 2, 0, KICONV_EMBEDDED,
852 open_to_88591, kiconv_to_sb, close_to_sb, kiconvstr_to_1
855 0, 2, KICONV_EMBEDDED,
856 open_fr_88591, kiconv_fr_sb, close_fr_sb, kiconvstr_fr_1
859 3, 0, KICONV_EMBEDDED,
860 open_to_885915, kiconv_to_sb, close_to_sb, kiconvstr_to_15
863 0, 3, KICONV_EMBEDDED,
864 open_fr_885915, kiconv_fr_sb, close_fr_sb, kiconvstr_fr_15
867 4, 0, KICONV_EMBEDDED,
868 open_to_850, kiconv_to_sb, close_to_sb, kiconvstr_to_850
871 0, 4, KICONV_EMBEDDED,
872 open_fr_850, kiconv_fr_sb, close_fr_sb, kiconvstr_fr_850
875 /* kiconv_ja module conversions: */
876 { 0, 5, KICONV_MODULE_ID_JA, NULL, NULL, NULL, NULL },
877 { 5, 0, KICONV_MODULE_ID_JA, NULL, NULL, NULL, NULL },
878 { 0, 6, KICONV_MODULE_ID_JA, NULL, NULL, NULL, NULL },
879 { 6, 0, KICONV_MODULE_ID_JA, NULL, NULL, NULL, NULL },
880 { 0, 7, KICONV_MODULE_ID_JA, NULL, NULL, NULL, NULL },
881 { 7, 0, KICONV_MODULE_ID_JA, NULL, NULL, NULL, NULL },
882 { 0, 8, KICONV_MODULE_ID_JA, NULL, NULL, NULL, NULL },
883 { 8, 0, KICONV_MODULE_ID_JA, NULL, NULL, NULL, NULL },
885 /* kiconv_sc module conversions: */
886 { 0, 9, KICONV_MODULE_ID_SC, NULL, NULL, NULL, NULL },
887 { 9, 0, KICONV_MODULE_ID_SC, NULL, NULL, NULL, NULL },
888 { 0, 10, KICONV_MODULE_ID_SC, NULL, NULL, NULL, NULL },
889 { 10, 0, KICONV_MODULE_ID_SC, NULL, NULL, NULL, NULL },
890 { 0, 11, KICONV_MODULE_ID_SC, NULL, NULL, NULL, NULL },
891 { 11, 0, KICONV_MODULE_ID_SC, NULL, NULL, NULL, NULL },
893 /* kiconv_ko module conversions: */
894 { 0, 12, KICONV_MODULE_ID_KO, NULL, NULL, NULL, NULL },
895 { 12, 0, KICONV_MODULE_ID_KO, NULL, NULL, NULL, NULL },
896 { 0, 13, KICONV_MODULE_ID_KO, NULL, NULL, NULL, NULL },
897 { 13, 0, KICONV_MODULE_ID_KO, NULL, NULL, NULL, NULL },
899 /* kiconv_tc module conversions: */
900 { 0, 14, KICONV_MODULE_ID_TC, NULL, NULL, NULL, NULL },
901 { 14, 0, KICONV_MODULE_ID_TC, NULL, NULL, NULL, NULL },
902 { 0, 15, KICONV_MODULE_ID_TC, NULL, NULL, NULL, NULL },
903 { 15, 0, KICONV_MODULE_ID_TC, NULL, NULL, NULL, NULL },
904 { 0, 16, KICONV_MODULE_ID_TC, NULL, NULL, NULL, NULL },
905 { 16, 0, KICONV_MODULE_ID_TC, NULL, NULL, NULL, NULL },
906 { 0, 17, KICONV_MODULE_ID_TC, NULL, NULL, NULL, NULL },
907 { 17, 0, KICONV_MODULE_ID_TC, NULL, NULL, NULL, NULL },
909 /* kiconv_emea module conversions: */
910 { 0, 18, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
911 { 18, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
912 { 0, 19, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
913 { 19, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
914 { 0, 20, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
915 { 20, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
916 { 0, 21, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
917 { 21, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
918 { 0, 22, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
919 { 22, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
920 { 0, 23, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
921 { 23, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
922 { 0, 24, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
923 { 24, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
924 { 0, 25, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
925 { 25, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
926 { 0, 26, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
927 { 26, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
928 { 0, 27, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
929 { 27, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
930 { 0, 28, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
931 { 28, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
932 { 0, 29, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
933 { 29, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
934 { 0, 30, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
935 { 30, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
936 { 0, 31, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
937 { 31, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
938 { 0, 32, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
939 { 32, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
940 { 0, 33, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
941 { 33, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
942 { 0, 34, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
943 { 34, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
944 { 0, 35, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
945 { 35, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
946 { 0, 36, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
947 { 36, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
948 { 0, 37, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
949 { 37, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
950 { 0, 38, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
951 { 38, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
952 { 0, 39, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
953 { 39, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
954 { 0, 40, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
955 { 40, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
956 { 0, 41, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
957 { 41, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
958 { 0, 42, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
959 { 42, 0, KICONV_MODULE_ID_EMEA, NULL, NULL, NULL, NULL },
962 /* The list of implemeted and supported modules. */
963 static kiconv_mod_list_t module_list[KICONV_MAX_MODULE_ID + 1] = {
964 "kiconv_embedded", 0,
965 "kiconv_ja", 0,
966 "kiconv_sc", 0,
967 "kiconv_ko", 0,
968 "kiconv_tc", 0,
969 "kiconv_emea", 0,
973 * We use conv_list_lock to restrict data access of both conv_list[] and
974 * module_list[] as they are tightly coupled critical sections that need to be
975 * dealt together as a unit.
977 static kmutex_t conv_list_lock;
979 void
980 kiconv_init()
982 mutex_init(&conv_list_lock, NULL, MUTEX_DEFAULT, NULL);
986 * The following is used to check on whether a kiconv module is being
987 * used or not at the _fini() of the module.
989 size_t
990 kiconv_module_ref_count(size_t mid)
992 int count;
994 if (mid <= 0 || mid > KICONV_MAX_MODULE_ID)
995 return (0);
997 mutex_enter(&conv_list_lock);
999 count = module_list[mid].refcount;
1001 mutex_exit(&conv_list_lock);
1003 return (count);
1007 * This function "normalizes" a given code name, n, by not including skippable
1008 * characters and folding uppercase letters to corresponding lowercase letters.
1009 * We only fold 7-bit ASCII uppercase characters since the names should be in
1010 * Portable Character Set of 7-bit ASCII.
1012 * By doing this, we will be able to maximize the code name matches.
1014 static size_t
1015 normalize_codename(const char *n)
1017 char s[KICONV_MAX_CODENAME_LEN + 1];
1018 size_t i;
1020 if (n == NULL)
1021 return ((size_t)-1);
1023 for (i = 0; *n; n++) {
1024 if (KICONV_SKIPPABLE_CHAR(*n))
1025 continue;
1027 /* If unreasonably lengthy, we don't support such names. */
1028 if (i >= KICONV_MAX_CODENAME_LEN)
1029 return ((size_t)-1);
1031 s[i++] = (*n >= 'A' && *n <= 'Z') ? *n - 'A' + 'a' : *n;
1033 s[i] = '\0';
1035 /* With the normalized name, find the corresponding codeset id. */
1036 for (i = 0; i < KICONV_MAX_CODEID_ENTRY; i++)
1037 if (strcmp(s, code_list[i].name) == 0)
1038 return (code_list[i].id);
1041 * In future time, we will also have a few more lines of code at below
1042 * that will deal with other user-created modules' fromcodes and
1043 * tocodes including aliases in a different vector. For now, we don't
1044 * support that but only the known names to this project at this time.
1047 return ((size_t)-1);
1051 * This function called from mod_install() registers supplied code
1052 * conversions. At this point, it does not honor aliases and hence does not
1053 * use nowait data field from the kiconv module info data structure.
1056 kiconv_register_module(kiconv_module_info_t *info)
1058 size_t mid;
1059 size_t fid;
1060 size_t tid;
1061 size_t i;
1062 size_t j;
1063 kiconv_ops_t *op;
1065 /* Validate the given kiconv module info. */
1066 if (info == NULL || info->module_name == NULL ||
1067 info->kiconv_num_convs == 0 || info->kiconv_ops_tbl == NULL)
1068 return (EINVAL);
1071 * Check if this is one of the known modules. At this point,
1072 * we do not allow user-defined kiconv modules and that'd be for
1073 * a future project.
1075 for (mid = 1; mid <= KICONV_MAX_MODULE_ID; mid++)
1076 if (strcmp(module_list[mid].name, info->module_name) == 0)
1077 break;
1078 if (mid > KICONV_MAX_MODULE_ID)
1079 return (EINVAL);
1081 /* Let's register the conversions supplied. */
1082 mutex_enter(&conv_list_lock);
1085 * This is very unlikely situation but by any chance we don't want to
1086 * register a module that is already in.
1088 if (module_list[mid].refcount > 0) {
1089 mutex_exit(&conv_list_lock);
1090 return (EAGAIN);
1093 for (i = 0; i < info->kiconv_num_convs; i++) {
1094 op = &(info->kiconv_ops_tbl[i]);
1096 fid = normalize_codename(op->fromcode);
1097 tid = normalize_codename(op->tocode);
1100 * If we find anything wrong in this particular conversion,
1101 * we skip this one and continue to the next one. This include
1102 * a case where there is a conversion already being assigned
1103 * into the conv_list[] somehow, i.e., new one never kicks out
1104 * old one.
1106 if (op->kiconv_open == NULL || op->kiconv == NULL ||
1107 op->kiconv_close == NULL || op->kiconvstr == NULL)
1108 continue;
1110 for (j = 0; j < KICONV_MAX_CONVERSIONS; j++) {
1111 if (conv_list[j].mid == mid &&
1112 conv_list[j].fid == fid &&
1113 conv_list[j].tid == tid) {
1114 if (conv_list[j].open == NULL) {
1115 conv_list[j].open = op->kiconv_open;
1116 conv_list[j].kiconv = op->kiconv;
1117 conv_list[j].close = op->kiconv_close;
1118 conv_list[j].kiconvstr = op->kiconvstr;
1120 break;
1125 mutex_exit(&conv_list_lock);
1127 return (0);
1131 * The following function called during mod_remove() will try to unregister,
1132 * i.e., clear up conversion function pointers, from the conv_list[] if it
1133 * can. If there is any code conversions being used, then, the function will
1134 * just return EBUSY indicating that the module cannot be unloaded.
1137 kiconv_unregister_module(kiconv_module_info_t *info)
1139 size_t mid;
1140 size_t i;
1142 if (info == NULL || info->module_name == NULL ||
1143 info->kiconv_num_convs == 0 || info->kiconv_ops_tbl == NULL)
1144 return (EINVAL);
1146 for (mid = 1; mid <= KICONV_MAX_MODULE_ID; mid++)
1147 if (strcmp(module_list[mid].name, info->module_name) == 0)
1148 break;
1149 if (mid > KICONV_MAX_MODULE_ID)
1150 return (EINVAL);
1152 mutex_enter(&conv_list_lock);
1155 * If any of the conversions are used, then, this module canont be
1156 * unloaded.
1158 if (module_list[mid].refcount > 0) {
1159 mutex_exit(&conv_list_lock);
1160 return (EBUSY);
1164 * Otherwise, we unregister all conversions from this module
1165 * and be ready for the unloading. At this point, we only care about
1166 * the conversions we know about with the module.
1168 for (i = 0; i < KICONV_MAX_CONVERSIONS; i++) {
1169 if (conv_list[i].mid == mid) {
1170 conv_list[i].open = NULL;
1171 conv_list[i].kiconv = NULL;
1172 conv_list[i].close = NULL;
1173 conv_list[i].kiconvstr = NULL;
1177 mutex_exit(&conv_list_lock);
1179 return (0);
1183 * The following function check if asked code conversion is available
1184 * and if necessary, load the corresponding kiconv module that contains
1185 * the conversion (and others).
1187 static kiconv_t
1188 check_and_load_conversions(const char *tocode, const char *fromcode)
1190 kiconv_t kcd;
1191 size_t tid;
1192 size_t fid;
1193 size_t mid;
1194 size_t i;
1196 /* Normalize the given names and find the corresponding code ids. */
1197 tid = normalize_codename(tocode);
1198 if (tid == (size_t)-1)
1199 return ((kiconv_t)-1);
1201 fid = normalize_codename(fromcode);
1202 if (fid == (size_t)-1)
1203 return ((kiconv_t)-1);
1206 * Search the conversion.
1208 * If the conversion isn't supported, just return -1.
1209 * If the conversion is supported but there is no corresponding
1210 * module loaded, try to load it and if successful, return
1211 * a kiconv conversion descriptor memory block.
1213 * We maintain a reference counter of uint_t for each module.
1215 mutex_enter(&conv_list_lock);
1217 for (i = 0; i < KICONV_MAX_CONVERSIONS; i++)
1218 if (conv_list[i].tid == tid && conv_list[i].fid == fid)
1219 break;
1220 if (i >= KICONV_MAX_CONVERSIONS) {
1221 mutex_exit(&conv_list_lock);
1222 return ((kiconv_t)-1);
1225 mid = conv_list[i].mid;
1227 if (conv_list[i].open == NULL) {
1228 mutex_exit(&conv_list_lock);
1230 if (modload("kiconv", module_list[mid].name) < 0)
1231 return ((kiconv_t)-1);
1234 * Let's double check if something happened right after
1235 * the modload and/or if the module really has the conversion.
1237 mutex_enter(&conv_list_lock);
1239 if (conv_list[i].open == NULL) {
1240 mutex_exit(&conv_list_lock);
1241 return ((kiconv_t)-1);
1246 * If we got the conversion, we will use the conversion function
1247 * in the module and so let's increase the module's refcounter
1248 * so that the module won't be kicked out. (To be more exact and
1249 * specific, the "refcount" is thus the reference counter of
1250 * the module functions being used.)
1252 if (module_list[mid].refcount < UINT_MAX)
1253 module_list[mid].refcount++;
1255 mutex_exit(&conv_list_lock);
1257 kcd = (kiconv_t)kmem_alloc(sizeof (kiconv_data_t), KM_SLEEP);
1258 kcd->handle = (void *)-1;
1259 kcd->id = i;
1261 return (kcd);
1265 * The following are the four "Committed" interfaces.
1267 kiconv_t
1268 kiconv_open(const char *tocode, const char *fromcode)
1270 kiconv_t kcd;
1271 size_t mid;
1273 kcd = check_and_load_conversions(tocode, fromcode);
1274 if (kcd == (kiconv_t)-1)
1275 return ((kiconv_t)-1);
1277 kcd->handle = (conv_list[kcd->id].open)();
1278 if (kcd->handle == (void *)-1) {
1280 * If the conversion couldn't be opened for some reason,
1281 * then, we unallocate the kcd and, more importantly, before
1282 * that, we also decrease the module reference counter.
1284 mid = conv_list[kcd->id].mid;
1286 mutex_enter(&conv_list_lock);
1288 if (module_list[mid].refcount > 0)
1289 module_list[mid].refcount--;
1291 mutex_exit(&conv_list_lock);
1293 kmem_free((void *)kcd, sizeof (kiconv_data_t));
1295 return ((kiconv_t)-1);
1298 return (kcd);
1301 size_t
1302 kiconv(kiconv_t kcd, char **inbuf, size_t *inbytesleft,
1303 char **outbuf, size_t *outbytesleft, int *errno)
1305 /* Do some minimum checking on the kiconv conversion descriptor. */
1306 if (! kcd || kcd == (kiconv_t)-1 || conv_list[kcd->id].kiconv == NULL) {
1307 *errno = EBADF;
1308 return ((size_t)-1);
1311 return ((conv_list[kcd->id].kiconv)(kcd->handle, inbuf, inbytesleft,
1312 outbuf, outbytesleft, errno));
1316 kiconv_close(kiconv_t kcd)
1318 int ret;
1319 size_t mid;
1321 if (! kcd || kcd == (kiconv_t)-1 || conv_list[kcd->id].close == NULL)
1322 return (EBADF);
1324 mid = conv_list[kcd->id].mid;
1326 ret = (conv_list[kcd->id].close)(kcd->handle);
1328 kmem_free((void *)kcd, sizeof (kiconv_data_t));
1330 mutex_enter(&conv_list_lock);
1333 * While we maintain reference conter for each module, once loaded,
1334 * we don't modunload from kiconv functions even if the counter
1335 * reaches back to zero.
1337 if (module_list[mid].refcount > 0)
1338 module_list[mid].refcount--;
1340 mutex_exit(&conv_list_lock);
1342 return (ret);
1345 size_t
1346 kiconvstr(const char *tocode, const char *fromcode, char *inarray,
1347 size_t *inlen, char *outarray, size_t *outlen, int flag, int *errno)
1349 kiconv_t kcd;
1350 size_t ret;
1351 size_t mid;
1353 kcd = check_and_load_conversions(tocode, fromcode);
1354 if (kcd == (kiconv_t)-1 || conv_list[kcd->id].kiconvstr == NULL) {
1355 *errno = EBADF;
1356 return ((size_t)-1);
1359 mid = conv_list[kcd->id].mid;
1361 ret = (conv_list[kcd->id].kiconvstr)(inarray, inlen, outarray, outlen,
1362 flag, errno);
1364 kmem_free((void *)kcd, sizeof (kiconv_data_t));
1366 mutex_enter(&conv_list_lock);
1368 if (module_list[mid].refcount > 0)
1369 module_list[mid].refcount--;
1371 mutex_exit(&conv_list_lock);
1373 return (ret);