usr/src/man/man5/iconv_unicode.5

   1 '\" te
   2 .\" Copyright (c) 1997, Sun Microsystems, Inc.  All Rights Reserved.
   3 .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License").  You may not use this file except in compliance with the License.
   4 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.  See the License for the specific language governing permissions and limitations under the License.
   5 .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE.  If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
   6 .TH ICONV_UNICODE 5 "Apr 18, 1997"
   7 .SH NAME
   8 iconv_unicode \- code set conversion tables for Unicode
   9 .SH DESCRIPTION
  10 .sp
  11 .LP
  12 The following code set conversions are supported:
  13 .sp
  14 .in +2
  15 .nf
  16                     CODE SET CONVERSIONS SUPPORTED
  17                     ------------------------------
  18   FROM Code Set                               TO Code Set
  19       Code              FROM          Target Code            TO
  20                         Filename                             Filename
  21                         Element                              Element
  22
  23 ISO 8859-1 (Latin 1)    8859-1            UTF-8               UTF-8
  24 ISO 8859-2 (Latin 2)    8859-2            UTF-8               UTF-8
  25 ISO 8859-3 (Latin 3)    8859-3            UTF-8               UTF-8
  26 ISO 8859-4 (Latin 4)    8859-4            UTF-8               UTF-8
  27 ISO 8859-5 (Cyrillic)   8859-5            UTF-8               UTF-8
  28 ISO 8859-6 (Arabic)     8859-6            UTF-8               UTF-8
  29 ISO 8859-7 (Greek)      8859-7            UTF-8               UTF-8
  30 ISO 8859-8 (Hebrew)     8859-8            UTF-8               UTF-8
  31 ISO 8859-9 (Latin 5)    8859-9            UTF-8               UTF-8
  32 ISO 8859-10 (Latin 6)   8859-10           UTF-8               UTF-8
  33 Japanese EUC            eucJP             UTF-8               UTF-8
  34 Chinese/PRC EUC
  35 (GB 2312-1980)          gb2312            UTF-8               UTF-8
  36 ISO-2022                iso2022           UTF-8               UTF-8
  37 Korean EUC              ko_KR-euc         Korean UTF-8        ko_KR-UTF-8
   38 ISO-2022-KR             ko_KR-iso2022-7   Korean UTF-8        ko_KR-UTF-8
  39 Korean Johap
  40 (KS C 5601-1987)        ko_KR-johap       Korean UTF-8        ko_KR-UTF-8
  41 Korean Johap
  42 (KS C 5601-1992)        ko_KR-johap92     Korean UTF-8        ko_KR-UTF-8
  43 Korean UTF-8            ko_KR-UTF-8       Korean EUC          ko_KR-euc
  44 Korean UTF-8            ko_KR-UTF-8       Korean Johap        ko_KR-johap
  45                                           (KS C 5601-1987)
  46 Korean UTF-8            ko_KR-UTF-8       Korean Johap        ko_KR-johap92
  47                                           (KS C 5601-1992)
  48 KOI8-R (Cyrillic)       KOI8-R            UCS-2               UCS-2
  49 KOI8-R (Cyrillic)       KOI8-R            UTF-8               UTF-8
  50 PC Kanji (SJIS)         PCK               UTF-8               UTF-8
  51 PC Kanji (SJIS)         SJIS              UTF-8               UTF-8
  52 UCS-2                   UCS-2             KOI8-R (Cyrillic)   KOI8-R
  53 UCS-2                   UCS-2             UCS-4               UCS-4
  54 .fi
  55 .in -2
  56 .sp
  57
  58 .sp
  59 .in +2
  60 .nf
  61                     CODE SET CONVERSIONS SUPPORTED
  62                     ------------------------------
  63   FROM Code Set                               TO Code Set
  64       Code              FROM          Target Code            TO
  65                         Filename                             Filename
  66                         Element                              Element
  67
  68 UCS-2              UCS-2           UTF-7                   UTF-7
  69 UCS-2              UCS-2           UTF-8                   UTF-8
  70 UCS-4              UCS-4           UCS-2                   UCS-2
  71 UCS-4              UCS-4           UTF-16                  UTF-16
  72 UCS-4              UCS-4           UTF-7                   UTF-7
  73 UCS-4              UCS-4           UTF-8                   UTF-8
  74 UTF-16             UTF-16          UCS-4                   UCS-4
  75 UTF-16             UTF-16          UTF-8                   UTF-8
  76 UTF-7              UTF-7           UCS-2                   UCS-2
  77 UTF-7              UTF-7           UCS-4                   UCS-4
  78 UTF-7              UTF-7           UTF-8                   UTF-8
  79 UTF-8              UTF-8           ISO 8859-1 (Latin 1)    8859-1
  80 UTF-8              UTF-8           ISO 8859-2 (Latin 2)    8859-2
  81 UTF-8              UTF-8           ISO 8859-3 (Latin 3)    8859-3
  82 UTF-8              UTF-8           ISO 8859-4 (Latin 4)    8859-4
  83 UTF-8              UTF-8           ISO 8859-5 (Cyrillic)   8859-5
  84 UTF-8              UTF-8           ISO 8859-6 (Arabic)     8859-6
  85 UTF-8              UTF-8           ISO 8859-7 (Greek)      8859-7
  86 UTF-8              UTF-8           ISO 8859-8 (Hebrew)     8859-8
  87 UTF-8              UTF-8           ISO 8859-9 (Latin 5)    8859-9
  88 UTF-8              UTF-8           ISO 8859-10 (Latin 6)   8859-10
  89 UTF-8              UTF-8           Japanese EUC            eucJP
  90 UTF-8              UTF-8           Chinese/PRC EUC         gb2312
  91                                    (GB 2312-1980)
  92 UTF-8              UTF-8           ISO-2022                iso2022
  93 UTF-8              UTF-8           KOI8-R (Cyrillic)       KOI8-R
  94 UTF-8              UTF-8           PC Kanji (SJIS)         PCK
  95 UTF-8              UTF-8           PC Kanji (SJIS)         SJIS
  96 UTF-8              UTF-8           UCS-2                   UCS-2
  97 UTF-8              UTF-8           UCS-4                   UCS-4
  98 UTF-8              UTF-8           UTF-16                  UTF-16
  99 UTF-8              UTF-8           UTF-7                   UTF-7
 100 UTF-8              UTF-8           Chinese/PRC EUC         zh_CN.euc
 101                                    (GB 2312-1980)
 102 .fi
 103 .in -2
 104 .sp
 105
 106 .sp
 107 .in +2
 108 .nf
 109                     CODE SET CONVERSIONS SUPPORTED
 110                     ------------------------------
 111   FROM Code Set                               TO Code Set
 112       Code              FROM          Target Code            TO
 113                         Filename                             Filename
 114                         Element                              Element
 115
 116 UTF-8                 UTF-8             ISO 2022-CN           zh_CN.iso2022-7
 117 UTF-8                 UTF-8             Chinese/Taiwan Big5   zh_TW-big5
  118 UTF-8                 UTF-8             Chinese/Taiwan EUC    zh_TW-euc
 119                                         (CNS 11643-1992)
 120 UTF-8                 UTF-8             ISO 2022-TW           zh_TW-iso2022-7
 121 Chinese/PRC EUC       zh_CN.euc         UTF-8                 UTF-8
 122 (GB 2312-1980)
 123 ISO 2022-CN           zh_CN.iso2022-7   UTF-8                 UTF-8
 124 Chinese/Taiwan Big5   zh_TW-big5        UTF-8                 UTF-8
  125 Chinese/Taiwan EUC    zh_TW-euc         UTF-8                 UTF-8
 126 (CNS 11643-1992)
 127 ISO 2022-TW           zh_TW-iso2022-7   UTF-8                 UTF-8
 128 .fi
 129 .in -2
 130 .sp
 131
 132 .SH EXAMPLES
 133 .LP
 134 \fBExample 1 \fRThe library module filename
 135 .sp
 136 .LP
 137 In the conversion library, \fB/usr/lib/iconv\fR (see \fBiconv\fR(3C)), the
 138 library module filename is composed of two symbolic elements separated by the
 139 percent sign (\fB%\fR). The first symbol specifies the code set that is being
 140 converted; the second symbol specifies the \fItarget code\fR, that is, the code
 141 set to which the first one is being converted.
 142
 143 .sp
 144 .LP
  145 In the conversion tables above, the first symbol is termed the "FROM Filename
 146 Element". The second symbol, representing the target code set, is the "TO
 147 Filename Element".
 148
 149 .sp
 150 .LP
 151 For example, the library module filename to convert from the \fIKorean\fR
  152 \fIEUC\fR code set is:
 153
 154 .sp
 155 .LP
 156 \fBko_KR-euc%ko_KR-UTF-8\fR
 157
 158 .SH FILES
 159 .sp
 160 .ne 2
 161 .na
 162 \fB\fB/usr/lib/iconv/*.so\fR\fR
 163 .ad
 164 .RS 23n
 165 conversion modules
 166 .RE
 167
 168 .SH SEE ALSO
 169 .sp
 170 .LP
 171 \fBiconv\fR(1), \fBiconv\fR(3C), \fBiconv\fR(5)
 172 .sp
 173 .LP
 174 Chernov, A., \fIRegistration of a Cyrillic Character Set\fR, RFC 1489, RELCOM
 175 Development Team, July 1993.
 176 .sp
 177 .LP
 178 Chon, K., H. Je Park, and U. Choi, \fIKorean Character Encoding for Internet
 179 Messages\fR, RFC 1557, Solvit Chosun Media, December 1993.
 180 .sp
 181 .LP
 182 Goldsmith, D., and M. Davis, \fIUTF-7 - A Mail-Safe Transformation Format of
 183 Unicode\fR, RFC 1642, Taligent, Inc., July 1994.
 184 .sp
 185 .LP
 186 Lee, F., \fIHZ - A Data Format for Exchanging Files of\fR \fIArbitrarily Mixed
 187 Chinese and ASCII characters\fR, RFC 1843, Stanford University, August 1995.
 188 .sp
 189 .LP
 190 Murai, J., M. Crispin, and E. van der Poel, \fIJapanese Character Encoding for
 191 Internet Messages\fR, RFC 1468, Keio University, Panda Programming, June 1993.
 192 .sp
 193 .LP
 194 Nussbacher, H., and Y. Bourvine, \fIHebrew Character Encoding for Internet
 195 Messages\fR, RFC 1555, Israeli Inter-University, Hebrew University, December
 196 1993.
 197 .sp
 198 .LP
 199 Ohta, M., \fICharacter Sets ISO-10646 and ISO-10646-J-1\fR, RFC 1815, Tokyo
 200 Institute of Technology, July 1995.
 201 .sp
 202 .LP
 203 Ohta, M., and K. Handa, \fIISO-2022-JP-2: Multilingual Extension of
 204 ISO-2022-JP\fR, RFC 1554, Tokyo Institute of Technology, December 1993.
 205 .sp
 206 .LP
 207 Reynolds, J., and J. Postel, \fIASSIGNED NUMBERS\fR, RFC 1700, University of
 208 Southern California/Information Sciences Institute, October 1994.
 209 .sp
 210 .LP
  211 Simonsen, K., \fICharacter Mnemonics & Character Sets\fR, RFC 1345, Rationel
 212 Almen Planlaegning, June 1992.
 213 .sp
 214 .LP
 215 Spinellis, D., \fIGreek Character Encoding for Electronic Mail Messages\fR, RFC
 216 1947, SENA S.A., May 1996.
 217 .sp
 218 .LP
 219 The Unicode Consortium, \fIThe Unicode Standard\fR, Version 2.0, Addison Wesley
 220 Developers Press, July 1996.
 221 .sp
 222 .LP
 223 Wei, Y., Y. Zhang, J. Li, J. Ding, and Y. Jiang, \fIASCII Printable
 224 Characters-Based Chinese Character Encoding\fR \fIfor Internet Messages\fR, RFC
 225 1842, AsiaInfo Services Inc., Harvard University, Rice University, University
 226 of Maryland, August 1995.
 227 .sp
 228 .LP
 229 Yergeau, F., \fIUTF-8, a transformation format of Unicode and ISO 10646\fR, RFC
 230 2044, Alis Technologies, October 1996.
 231 .sp
 232 .LP
 233 Zhu, H., D. Hu, Z. Wang, T. Kao, W. Chang, and M. Crispin, \fIChinese Character
 234 Encoding for Internet Messages\fR, RFC 1922, Tsinghua University, China
 235 Information Technology Standardization Technical Committee (CITS), Institute
 236 for Information Industry (III), University of Washington, March 1996.
 237 .SH NOTES
 238 .sp
 239 .LP
 240 ISO 8859 character sets using Latin alphabetic characters are distinguished as
 241 follows:
 242 .sp
 243 .ne 2
 244 .na
 245 \fB\fBISO\fR \fB8859-1\fR \fB(Latin\fR \fB1)\fR\fR
 246 .ad
 247 .RS 25n
 248 For most West European languages, including:
 249 .sp
 250
 251 .sp
 252 .TS
 253 l l l
 254 l l l .
 255 Albanian        Finnish Italian
 256 Catalan French  Norwegian
 257 Danish  German  Portuguese
 258 Dutch   Galician        Spanish
 259 English Irish   Swedish
 260 Faeroese        Icelandic
 261 .TE
 262
 263 .RE
 264
 265 .sp
 266 .ne 2
 267 .na
 268 \fB\fBISO\fR \fB8859-2\fR \fB(Latin\fR \fB2)\fR\fR
 269 .ad
 270 .RS 25n
 271 For most Latin-written Slavic and Central European languages:
 272 .sp
 273
 274 .sp
 275 .TS
 276 l l l
 277 l l l .
 278 Czech   Polish  Slovak
 279 German  Rumanian        Slovene
 280 Hungarian       Croatian
 281 .TE
 282
 283 .RE
 284
 285 .sp
 286 .ne 2
 287 .na
 288 \fB\fBISO\fR \fB8859-3\fR \fB(Latin\fR \fB3)\fR\fR
 289 .ad
 290 .RS 25n
 291 Popularly used for Esperanto, Galician, Maltese, and Turkish.
 292 .RE
 293
 294 .sp
 295 .ne 2
 296 .na
 297 \fB\fBISO\fR \fB8859-4\fR \fB(Latin\fR \fB4)\fR\fR
 298 .ad
 299 .RS 25n
 300 Introduces letters for Estonian, Latvian, and Lithuanian. It is an incomplete
 301 predecessor of ISO 8859-10 (Latin 6).
 302 .RE
 303
 304 .sp
 305 .ne 2
 306 .na
 307 \fB\fBISO\fR \fB8859-9\fR \fB(Latin\fR \fB5)\fR\fR
 308 .ad
 309 .RS 25n
 310 Replaces the rarely needed Icelandic letters in ISO 8859-1 (Latin 1) with the
 311 Turkish ones.
 312 .RE
 313
 314 .sp
 315 .ne 2
 316 .na
 317 \fB\fBISO\fR \fB8859-10\fR \fB(Latin\fR \fB6)\fR\fR
 318 .ad
 319 .RS 25n
 320 Adds the last Inuit (Greenlandic) and Sami (Lappish) letters that were not
 321 included in ISO 8859-4 (Latin 4) to complete coverage of the Nordic area.
 322 .RE
 323