Merge branch 'master' of git://github.com/illumos/illumos-gate
[unleashed.git] / usr / src / man / man5 / iconv_unicode.5
blob3f59d2dbb6cf26a708883d186e5e7d6efe8d9c6c
1 '\" te
2 .\" Copyright (c) 1997, Sun Microsystems, Inc.  All Rights Reserved.
3 .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License").  You may not use this file except in compliance with the License.
4 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.  See the License for the specific language governing permissions and limitations under the License.
5 .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE.  If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
6 .TH ICONV_UNICODE 5 "Apr 18, 1997"
7 .SH NAME
8 iconv_unicode \- code set conversion tables for Unicode
9 .SH DESCRIPTION
10 .sp
11 .LP
12 The following code set conversions are supported:
13 .sp
14 .in +2
15 .nf
16                     CODE SET CONVERSIONS SUPPORTED
17                     ------------------------------
18   FROM Code Set                               TO Code Set
19       Code              FROM          Target Code            TO
20                         Filename                             Filename
21                         Element                              Element
23 ISO 8859-1 (Latin 1)    8859-1            UTF-8               UTF-8
24 ISO 8859-2 (Latin 2)    8859-2            UTF-8               UTF-8
25 ISO 8859-3 (Latin 3)    8859-3            UTF-8               UTF-8
26 ISO 8859-4 (Latin 4)    8859-4            UTF-8               UTF-8
27 ISO 8859-5 (Cyrillic)   8859-5            UTF-8               UTF-8
28 ISO 8859-6 (Arabic)     8859-6            UTF-8               UTF-8
29 ISO 8859-7 (Greek)      8859-7            UTF-8               UTF-8
30 ISO 8859-8 (Hebrew)     8859-8            UTF-8               UTF-8
31 ISO 8859-9 (Latin 5)    8859-9            UTF-8               UTF-8
32 ISO 8859-10 (Latin 6)   8859-10           UTF-8               UTF-8
33 Japanese EUC            eucJP             UTF-8               UTF-8
34 Chinese/PRC EUC
35 (GB 2312-1980)          gb2312            UTF-8               UTF-8
36 ISO-2022                iso2022           UTF-8               UTF-8
37 Korean EUC              ko_KR-euc         Korean UTF-8        ko_KR-UTF-8
38 ISO-2022-KR             ko_KR-iso2022-7   Korean UTF-8        ko_KR-UTF-8
39 Korean Johap
40 (KS C 5601-1987)        ko_KR-johap       Korean UTF-8        ko_KR-UTF-8
41 Korean Johap
42 (KS C 5601-1992)        ko_KR-johap92     Korean UTF-8        ko_KR-UTF-8
43 Korean UTF-8            ko_KR-UTF-8       Korean EUC          ko_KR-euc
44 Korean UTF-8            ko_KR-UTF-8       Korean Johap        ko_KR-johap
45                                           (KS C 5601-1987)
46 Korean UTF-8            ko_KR-UTF-8       Korean Johap        ko_KR-johap92
47                                           (KS C 5601-1992)
48 KOI8-R (Cyrillic)       KOI8-R            UCS-2               UCS-2
49 KOI8-R (Cyrillic)       KOI8-R            UTF-8               UTF-8
50 PC Kanji (SJIS)         PCK               UTF-8               UTF-8
51 PC Kanji (SJIS)         SJIS              UTF-8               UTF-8
52 UCS-2                   UCS-2             KOI8-R (Cyrillic)   KOI8-R
53 UCS-2                   UCS-2             UCS-4               UCS-4
54 .fi
55 .in -2
56 .sp
58 .sp
59 .in +2
60 .nf
61                     CODE SET CONVERSIONS SUPPORTED
62                     ------------------------------
63   FROM Code Set                               TO Code Set
64       Code              FROM          Target Code            TO
65                         Filename                             Filename
66                         Element                              Element
68 UCS-2              UCS-2           UTF-7                   UTF-7
69 UCS-2              UCS-2           UTF-8                   UTF-8
70 UCS-4              UCS-4           UCS-2                   UCS-2
71 UCS-4              UCS-4           UTF-16                  UTF-16
72 UCS-4              UCS-4           UTF-7                   UTF-7
73 UCS-4              UCS-4           UTF-8                   UTF-8
74 UTF-16             UTF-16          UCS-4                   UCS-4
75 UTF-16             UTF-16          UTF-8                   UTF-8
76 UTF-7              UTF-7           UCS-2                   UCS-2
77 UTF-7              UTF-7           UCS-4                   UCS-4
78 UTF-7              UTF-7           UTF-8                   UTF-8
79 UTF-8              UTF-8           ISO 8859-1 (Latin 1)    8859-1
80 UTF-8              UTF-8           ISO 8859-2 (Latin 2)    8859-2
81 UTF-8              UTF-8           ISO 8859-3 (Latin 3)    8859-3
82 UTF-8              UTF-8           ISO 8859-4 (Latin 4)    8859-4
83 UTF-8              UTF-8           ISO 8859-5 (Cyrillic)   8859-5
84 UTF-8              UTF-8           ISO 8859-6 (Arabic)     8859-6
85 UTF-8              UTF-8           ISO 8859-7 (Greek)      8859-7
86 UTF-8              UTF-8           ISO 8859-8 (Hebrew)     8859-8
87 UTF-8              UTF-8           ISO 8859-9 (Latin 5)    8859-9
88 UTF-8              UTF-8           ISO 8859-10 (Latin 6)   8859-10
89 UTF-8              UTF-8           Japanese EUC            eucJP
90 UTF-8              UTF-8           Chinese/PRC EUC         gb2312
91                                    (GB 2312-1980)
92 UTF-8              UTF-8           ISO-2022                iso2022
93 UTF-8              UTF-8           KOI8-R (Cyrillic)       KOI8-R
94 UTF-8              UTF-8           PC Kanji (SJIS)         PCK
95 UTF-8              UTF-8           PC Kanji (SJIS)         SJIS
96 UTF-8              UTF-8           UCS-2                   UCS-2
97 UTF-8              UTF-8           UCS-4                   UCS-4
98 UTF-8              UTF-8           UTF-16                  UTF-16
99 UTF-8              UTF-8           UTF-7                   UTF-7
100 UTF-8              UTF-8           Chinese/PRC EUC         zh_CN.euc
101                                    (GB 2312-1980)
102 .fi
103 .in -2
107 .in +2
108 .nf
109                     CODE SET CONVERSIONS SUPPORTED
110                     ------------------------------
111   FROM Code Set                               TO Code Set
112       Code              FROM          Target Code            TO
113                         Filename                             Filename
114                         Element                              Element
116 UTF-8                 UTF-8             ISO 2022-CN           zh_CN.iso2022-7
117 UTF-8                 UTF-8             Chinese/Taiwan Big5   zh_TW-big5
118 UTF-8                 UTF-8             Chinese/Taiwan  EUC   zh_TW-euc
119                                         (CNS 11643-1992)
120 UTF-8                 UTF-8             ISO 2022-TW           zh_TW-iso2022-7
121 Chinese/PRC EUC       zh_CN.euc         UTF-8                 UTF-8
122 (GB 2312-1980)
123 ISO 2022-CN           zh_CN.iso2022-7   UTF-8                 UTF-8
124 Chinese/Taiwan Big5   zh_TW-big5        UTF-8                 UTF-8
125 Chinese/Taiwan  EUC   zh_TW-euc         UTF-8                 UTF-8
126 (CNS 11643-1992)
127 ISO 2022-TW           zh_TW-iso2022-7   UTF-8                 UTF-8
128 .fi
129 .in -2
132 .SH EXAMPLES
134 \fBExample 1 \fRThe library module filename
137 In the conversion library, \fB/usr/lib/iconv\fR (see \fBiconv\fR(3C)), the
138 library module filename is composed of two symbolic elements separated by the
139 percent sign (\fB%\fR). The first symbol specifies the code set that is being
140 converted; the second symbol specifies the \fItarget code\fR, that is, the code
141 set to which the first one is being converted.
145 In the conversion tables above, the first symbol is termed the "FROM Filename
146 Element". The second symbol, representing the target code set, is the "TO
147 Filename Element".
151 For example, the library module filename to convert from the \fIKorean\fR
152 \fIEUC\fR code set to the \fIKorean\fR \fIUTF-8\fR code set is
156 \fBko_KR-euc%ko_KR-UTF-8\fR
158 .SH FILES
160 .ne 2
162 \fB\fB/usr/lib/iconv/*.so\fR\fR
164 .RS 23n
165 conversion modules
168 .SH SEE ALSO
171 \fBiconv\fR(1), \fBiconv\fR(3C), \fBiconv\fR(5)
174 Chernov, A., \fIRegistration of a Cyrillic Character Set\fR, RFC 1489, RELCOM
175 Development Team, July 1993.
178 Chon, K., H. Je Park, and U. Choi, \fIKorean Character Encoding for Internet
179 Messages\fR, RFC 1557, Solvit Chosun Media, December 1993.
182 Goldsmith, D., and M. Davis, \fIUTF-7 - A Mail-Safe Transformation Format of
183 Unicode\fR, RFC 1642, Taligent, Inc., July 1994.
186 Lee, F., \fIHZ - A Data Format for Exchanging Files of\fR \fIArbitrarily Mixed
187 Chinese and ASCII characters\fR, RFC 1843, Stanford University, August 1995.
190 Murai, J., M. Crispin, and E. van der Poel, \fIJapanese Character Encoding for
191 Internet Messages\fR, RFC 1468, Keio University, Panda Programming, June 1993.
194 Nussbacher, H., and Y. Bourvine, \fIHebrew Character Encoding for Internet
195 Messages\fR, RFC 1555, Israeli Inter-University, Hebrew University, December
196 1993.
199 Ohta, M., \fICharacter Sets ISO-10646 and ISO-10646-J-1\fR, RFC 1815, Tokyo
200 Institute of Technology, July 1995.
203 Ohta, M., and K. Handa, \fIISO-2022-JP-2: Multilingual Extension of
204 ISO-2022-JP\fR, RFC 1554, Tokyo Institute of Technology, December 1993.
207 Reynolds, J., and J. Postel, \fIASSIGNED NUMBERS\fR, RFC 1700, University of
208 Southern California/Information Sciences Institute, October 1994.
211 Simonsen, K., \fICharacter Mnemonics & Character Sets\fR, RFC 1345, Rationel
212 Almen Planlaegning, June 1992.
215 Spinellis, D., \fIGreek Character Encoding for Electronic Mail Messages\fR, RFC
216 1947, SENA S.A., May 1996.
219 The Unicode Consortium, \fIThe Unicode Standard\fR, Version 2.0, Addison-Wesley
220 Developers Press, July 1996.
223 Wei, Y., Y. Zhang, J. Li, J. Ding, and Y. Jiang, \fIASCII Printable
224 Characters-Based Chinese Character Encoding\fR \fIfor Internet Messages\fR, RFC
225 1842, AsiaInfo Services Inc., Harvard University, Rice University, University
226 of Maryland, August 1995.
229 Yergeau, F., \fIUTF-8, a transformation format of Unicode and ISO 10646\fR, RFC
230 2044, Alis Technologies, October 1996.
233 Zhu, H., D. Hu, Z. Wang, T. Kao, W. Chang, and M. Crispin, \fIChinese Character
234 Encoding for Internet Messages\fR, RFC 1922, Tsinghua University, China
235 Information Technology Standardization Technical Committee (CITS), Institute
236 for Information Industry (III), University of Washington, March 1996.
237 .SH NOTES
240 ISO 8859 character sets using Latin alphabetic characters are distinguished as
241 follows:
243 .ne 2
245 \fB\fBISO\fR \fB8859-1\fR \fB(Latin\fR \fB1)\fR\fR
247 .RS 25n
248 For most West European languages, including:
253 l l l
254 l l l .
255 Albanian        Finnish Italian
256 Catalan French  Norwegian
257 Danish  German  Portuguese
258 Dutch   Galician        Spanish
259 English Irish   Swedish
260 Faeroese        Icelandic       
266 .ne 2
268 \fB\fBISO\fR \fB8859-2\fR \fB(Latin\fR \fB2)\fR\fR
270 .RS 25n
271 For most Latin-written Slavic and Central European languages:
276 l l l
277 l l l .
278 Czech   Polish  Slovak
279 German  Rumanian        Slovene
280 Hungarian       Croatian        
286 .ne 2
288 \fB\fBISO\fR \fB8859-3\fR \fB(Latin\fR \fB3)\fR\fR
290 .RS 25n
291 Popularly used for Esperanto, Galician, Maltese, and Turkish.
295 .ne 2
297 \fB\fBISO\fR \fB8859-4\fR \fB(Latin\fR \fB4)\fR\fR
299 .RS 25n
300 Introduces letters for Estonian, Latvian, and Lithuanian. It is an incomplete
301 predecessor of ISO 8859-10 (Latin 6).
305 .ne 2
307 \fB\fBISO\fR \fB8859-9\fR \fB(Latin\fR \fB5)\fR\fR
309 .RS 25n
310 Replaces the rarely needed Icelandic letters in ISO 8859-1 (Latin 1) with the
311 Turkish ones.
315 .ne 2
317 \fB\fBISO\fR \fB8859-10\fR \fB(Latin\fR \fB6)\fR\fR
319 .RS 25n
320 Adds the last Inuit (Greenlandic) and Sami (Lappish) letters that were not
321 included in ISO 8859-4 (Latin 4) to complete coverage of the Nordic area.