(* Copyright 2005 b8_bavard, INRIA *)
(*
    This file is part of mldonkey.

    mldonkey is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    mldonkey is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with mldonkey; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*)
(*
 * This part has been inspired by the Debian document "Introduction to i18n":
 * http://www.debian.org/doc/manuals/intro-i18n/
 *)
26 (**********************************************************************************)
30 (**********************************************************************************)
35 | ANSI_X3_4_1968
| ANSI_X3_4_1986
| ASCII
| CP367
| IBM367
| ISO_IR_6
| ISO646_US
| ISO_646_IRV_1991
| US
| US_ASCII
| CSASCII
37 | ISO_10646_UCS_2
| UCS_2
| CSUNICODE
38 | UCS_2BE
| UNICODE_1_1
| UNICODEBIG
| CSUNICODE11
39 | UCS_2LE
| UNICODELITTLE
40 | ISO_10646_UCS_4
| UCS_4
| CSUCS4
49 | UNICODE_1_1_UTF_7
| UTF_7
| CSUNICODE11UTF7
56 | CP819
| IBM819
| ISO_8859_1
| ISO_IR_100
| ISO8859_1
| ISO_8859_1_1987
| L1
| LATIN1
| CSISOLATIN1
57 | ISO_8859_2
| ISO_IR_101
| ISO8859_2
| ISO_8859_2_1987
| L2
| LATIN2
| CSISOLATIN2
58 | ISO_8859_3
| ISO_IR_109
| ISO8859_3
| ISO_8859_3_1988
| L3
| LATIN3
| CSISOLATIN3
59 | ISO_8859_4
| ISO_IR_110
| ISO8859_4
| ISO_8859_4_1988
| L4
| LATIN4
| CSISOLATIN4
60 | CYRILLIC
| ISO_8859_5
| ISO_IR_144
| ISO8859_5
| ISO_8859_5_1988
| CSISOLATINCYRILLIC
61 | ARABIC
| ASMO_708
| ECMA_114
| ISO_8859_6
| ISO_IR_127
| ISO8859_6
| ISO_8859_6_1987
| CSISOLATINARABIC
62 | ECMA_118
| ELOT_928
| GREEK
| GREEK8
| ISO_8859_7
| ISO_IR_126
| ISO8859_7
| ISO_8859_7_1987
| CSISOLATINGREEK
63 | HEBREW
| ISO_8859_8
| ISO_IR_138
| ISO8859_8
| ISO_8859_8_1988
| CSISOLATINHEBREW
64 | ISO_8859_9
| ISO_IR_148
| ISO8859_9
| ISO_8859_9_1989
| L5
| LATIN5
| CSISOLATIN5
65 | ISO_8859_10
| ISO_IR_157
| ISO8859_10
| ISO_8859_10_1992
| L6
| LATIN6
| CSISOLATIN6
66 | ISO_8859_13
| ISO_IR_179
| ISO8859_13
| L7
| LATIN7
67 | ISO_8859_14
| ISO_CELTIC
| ISO8859_14
| ISO_IR_199
| ISO_8859_14_1998
| L8
| LATIN8
68 | ISO_8859_15
| ISO_IR_203
| ISO8859_15
| ISO_8859_15_1998
69 | ISO_8859_16
| ISO_IR_226
| ISO8859_16
| ISO_8859_16_2000
73 | CP1250
| MS_EE
| WINDOWS_1250
74 | CP1251
| MS_CYRL
| WINDOWS_1251
75 | CP1252
| MS_ANSI
| WINDOWS_1252
76 | CP1253
| MS_GREEK
| WINDOWS_1253
77 | CP1254
| MS_TURK
| WINDOWS_1254
78 | CP1255
| MS_HEBR
| WINDOWS_1255
79 | CP1256
| MS_ARAB
| WINDOWS_1256
80 | CP1257
| WINBALTRIM
| WINDOWS_1257
81 | CP1258
| WINDOWS_1258
82 | I_850
| CP850
| IBM850
| CSPC850MULTILINGUAL
83 | I_862
| CP862
| IBM862
| CSPC862LATINHEBREW
84 | I_866
| CP866
| IBM866
| CSIBM866
85 | MAC
| MACINTOSH
| MACROMAN
| CSMACINTOSH
97 | HP_ROMAN8
| R8
| ROMAN8
| CSHPROMAN8
104 | CP1133
| IBM_CP1133
105 | ISO_IR_166
| TIS_620
| TIS620
| TIS620_0
| TIS620_2529_1
| TIS620_2533_0
| TIS620_2533_1
106 | CP874
| WINDOWS_874
107 | VISCII
| VISCII1_1_1
| CSVISCII
108 | TCVN
| TCVN_5712
| TCVN5712_1
| TCVN5712_1_1993
109 | ISO_IR_14
| ISO646_JP
| JIS_C6220_1969_RO
| JP
| CSISO14JISC6220RO
110 | JISX0201_1976
| JIS_X0201
| X0201
| CSHALFWIDTHKATAKANA
111 | ISO_IR_87
| JIS0208
| JIS_C6226_1983
| JIS_X0208
| JIS_X0208_1983
| JIS_X0208_1990
| X0208
| CSISO87JISX0208
112 | ISO_IR_159
| JIS_X0212
| JIS_X0212_1990
| JIS_X0212_1990_0
| X0212
| CSISO159JISX02121990
113 | CN
| GB_1988_80
| ISO_IR_57
| ISO646_CN
| CSISO57GB1988
114 | CHINESE
| GB_2312_80
| ISO_IR_58
| CSISO58GB231280
115 | CN_GB_ISOIR165
| ISO_IR_165
116 | ISO_IR_149
| KOREAN
| KSC_5601
| KS_C_5601_1987
| KS_C_5601_1989
| CSKSC56011987
117 | EUC_JP
| EUCJP
| EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE
| CSEUCPKDFMTJAPANESE
118 | MS_KANJI
| SHIFT_JIS
| SJIS
| CSSHIFTJIS
120 | ISO_2022_JP
| CSISO2022JP
122 | ISO_2022_JP_2
| CSISO2022JP2
123 | CN_GB
| EUC_CN
| EUCCN
| GB2312
| CSGB2312
126 | ISO_2022_CN
| CSISO2022CN
129 | EUC_TW
| EUCTW
| CSEUCTW
130 | BIG_5
| BIG_FIVE
| BIG5
| BIGFIVE
| CN_BIG5
| CSBIG5
132 | BIG5_HKSCS
| BIG5HKSCS
133 | EUC_KR
| EUCKR
| CSEUCKR
136 | ISO_2022_KR
| CSISO2022KR
137 | I_437
| CP437
| IBM437
| CSPC8CODEPAGE437
139 | CP775
| IBM775
| CSPC775BALTIC
140 | I_852
| CP852
| IBM852
| CSPCP852
142 | I_855
| CP855
| IBM855
| CSIBM855
143 | I_857
| CP857
| IBM857
| CSIBM857
145 | I_860
| CP860
| IBM860
| CSIBM860
146 | I_861
| CP_IS
| CP861
| IBM861
| CSIBM861
147 | I_863
| CP863
| IBM863
| CSIBM863
148 | CP864
| IBM864
| CSIBM864
149 | I_865
| CP865
| IBM865
| CSIBM865
150 | I_869
| CP_GR
| CP869
| IBM869
| CSIBM869
153 (**********************************************************************************)
157 (**********************************************************************************)
(* Exception used by the charset layer.  It is registered with the OCaml
   runtime under the name "charset_error" so that native code can retrieve
   it by that name (see [Callback.register_exception]).
   NOTE(review): presumably raised from the C conversion stubs -- confirm
   against the stub sources. *)
exception CharsetError

let () =
  let registered_name = "charset_error" in
  Callback.register_exception registered_name CharsetError
(* Binding to the C primitive "ml_locale_charset": takes [unit] and returns
   a string.
   NOTE(review): presumably the name of the current locale's charset --
   confirm against the C stub. *)
external get_charset : unit -> string = "ml_locale_charset"
(* Binding to the C primitive "ml_get_default_language": takes [unit] and
   returns a string.
   NOTE(review): presumably a language/locale identifier for the running
   environment -- confirm the exact format against the C stub. *)
external get_default_language : unit -> string = "ml_get_default_language"
(* Binding to the C primitive "ml_convert_string": takes three strings and
   returns a string.
   NOTE(review): presumably the text to convert plus source and target
   charset names; the order and roles of the arguments are defined by the
   C stub -- confirm there before calling. *)
external convert_string
  : string -> string -> string -> string
  = "ml_convert_string"
(* Binding to the C primitive "ml_utf8_validate": takes a string and returns
   a boolean.
   NOTE(review): presumably [true] when the string is well-formed UTF-8 --
   confirm against the C stub. *)
external is_utf8 : string -> bool = "ml_utf8_validate"
168 (**********************************************************************************)
172 (**********************************************************************************)
174 (* taken from camomile *)
176 (* Copyright 2002, 2003 Yamagata Yoriyuki. distributed with LGPL *)
180 let n = Char.code s
.[i
] in
181 if n < 0x80 then n else
183 (n - 0xc0) lsl 6 lor (0x7f land (Char.code s
.[i
+ 1]))
184 else if n <= 0xef then
186 let m0 = Char.code s
.[i
+ 2] in
187 let m = Char.code
(String.unsafe_get s
(i
+ 1)) in
188 let n'
= n'
lsl 6 lor (0x7f land m) in
189 n'
lsl 6 lor (0x7f land m0)
190 else if n <= 0xf7 then
192 let m0 = Char.code s
.[i
+ 3] in
193 let m = Char.code
(String.unsafe_get s
(i
+ 1)) in
194 let n'
= n'
lsl 6 lor (0x7f land m) in
195 let m = Char.code
(String.unsafe_get s
(i
+ 2)) in
196 let n'
= n'
lsl 6 lor (0x7f land m) in
197 n'
lsl 6 lor (0x7f land m0)
198 else if n <= 0xfb then
200 let m0 = Char.code s
.[i
+ 4] in
201 let m = Char.code
(String.unsafe_get s
(i
+ 1)) in
202 let n'
= n'
lsl 6 lor (0x7f land m) in
203 let m = Char.code
(String.unsafe_get s
(i
+ 2)) in
204 let n'
= n'
lsl 6 lor (0x7f land m) in
205 let m = Char.code
(String.unsafe_get s
(i
+ 3)) in
206 let n'
= n'
lsl 6 lor (0x7f land m) in
207 n'
lsl 6 lor (0x7f land m0)
208 else if n <= 0xfd then
210 let m0 = Char.code s
.[i
+ 5] in
211 let m = Char.code
(String.unsafe_get s
(i
+ 1)) in
212 let n'
= n'
lsl 6 lor (0x7f land m) in
213 let m = Char.code
(String.unsafe_get s
(i
+ 2)) in
214 let n'
= n'
lsl 6 lor (0x7f land m) in
215 let m = Char.code
(String.unsafe_get s
(i
+ 3)) in
216 let n'
= n'
lsl 6 lor (0x7f land m) in
217 let m = Char.code
(String.unsafe_get s
(i
+ 4)) in
218 let n'
= n'
lsl 6 lor (0x7f land m) in
219 n'
lsl 6 lor (0x7f land m0)
220 else invalid_arg
"utf8_look"
222 if n'
lsr 31 = 0 then n'
else
223 invalid_arg
"utf8_look char_of_uint"
(* [search_head s i] scans [s] forward from byte offset [i] and returns the
   first offset whose byte is either below 0x80 or at least 0xc2, i.e. a
   byte that can start a UTF-8 sequence.  Bytes in the range 0x80..0xc1 are
   skipped.  If no such byte is found (or [i] is already past the end of
   [s]) the offset at which scanning stopped is returned, which is then
   [>= String.length s].
   The byte is read with [String.unsafe_get], so [i] must not be negative. *)
let search_head s i =
  let len = String.length s in
  let pos = ref i in
  (* A byte in 0x80..0xc1 cannot begin a character here and is stepped
     over. *)
  let skippable byte = byte >= 0x80 && byte < 0xc2 in
  while !pos < len && skippable (Char.code (String.unsafe_get s !pos)) do
    incr pos
  done;
  !pos
232 let n = Char.code s
.[i
] in
233 if n < 0x80 then i
+ 1 else
234 if n < 0xc0 then search_head s
(i
+ 1) else
235 if n <= 0xdf then i
+ 2
236 else if n <= 0xef then i
+ 3
237 else if n <= 0xf7 then i
+ 4
238 else if n <= 0xfb then i
+ 5
239 else if n <= 0xfd then i
+ 6
240 else invalid_arg
"utf8_next"
242 let rec nth_aux s i
n =
244 nth_aux s
(utf8_next s i
) (n - 1)
(* [nth s n] runs [nth_aux] on [s] starting from byte offset 0 with count
   [n].
   NOTE(review): presumably yields the byte offset of the [n]-th (0-based)
   UTF-8 character of [s] -- confirm against [nth_aux]. *)
let nth s n =
  let start_offset = 0 in
  nth_aux s start_offset n
(* [utf8_get s n] locates a position in [s] with [nth s n] and decodes the
   value found there with [utf8_look].
   NOTE(review): presumably returns the [n]-th (0-based) character of the
   UTF-8 string [s]; any exception raised by [nth] or [utf8_look]
   propagates to the caller. *)
let utf8_get s n =
  let offset = nth s n in
  utf8_look s offset
250 (**********************************************************************************)
254 (**********************************************************************************)
256 (* taken from camomile *)
258 (* Copyright 2002, 2003 Yamagata Yoriyuki. distributed with LGPL *)
260 let rec length_aux s c i
=
261 if i
>= String.length s
then c
else
262 let n = Char.code
(String.unsafe_get s i
) in
264 if n < 0x80 then 1 else
265 if n < 0xc0 then invalid_arg
"UTF8.length" else
266 if n < 0xe0 then 2 else
267 if n < 0xf0 then 3 else
268 if n < 0xf8 then 4 else
269 if n < 0xfc then 5 else
270 if n < 0xfe then 6 else
271 invalid_arg
"UTF8.length" in
272 length_aux s
(c
+ 1) (i
+ k)
(* [utf8_length s] runs [length_aux] over [s] with a zero counter, starting
   at byte offset 0.
   NOTE(review): presumably the number of UTF-8 encoded characters in [s];
   [length_aux] calls [invalid_arg "UTF8.length"] on a malformed lead
   byte -- confirm against [length_aux]. *)
let utf8_length s =
  let initial_count = 0 and start_offset = 0 in
  length_aux s initial_count start_offset
276 (**********************************************************************************)
278 (* add_uchar (internal) *)
280 (**********************************************************************************)
283 (* taken from camomile *)
285 (* Copyright 2002, 2003 Yamagata Yoriyuki. distributed with LGPL *)
(* View a [uchar] as an [int].  Bound to the compiler primitive
   "%identity", so the argument is returned unchanged and only its static
   type differs. *)
external uint_code : uchar -> int = "%identity"
289 let add_uchar buf u
=
290 let masq = 0b111111 in
291 let k = uint_code u
in
292 if k < 0 || k >= 0x4000000 then begin
293 Buffer.add_char buf
(Char.chr
(0xfc + (k lsr 30)));
294 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor ((k lsr 24) land masq)));
295 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor ((k lsr 18) land masq)));
296 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor ((k lsr 12) land masq)));
297 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor ((k lsr 6) land masq)));
298 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor (k land masq)));
299 end else if k <= 0x7f then
300 Buffer.add_char buf
(Char.unsafe_chr
k)
301 else if k <= 0x7ff then begin
302 Buffer.add_char buf
(Char.unsafe_chr
(0xc0 lor (k lsr 6)));
303 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor (k land masq)))
304 end else if k <= 0xffff then begin
305 Buffer.add_char buf
(Char.unsafe_chr
(0xe0 lor (k lsr 12)));
306 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor ((k lsr 6) land masq)));
307 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor (k land masq)));
308 end else if k <= 0x1fffff then begin
309 Buffer.add_char buf
(Char.unsafe_chr
(0xf0 + (k lsr 18)));
310 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor ((k lsr 12) land masq)));
311 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor ((k lsr 6) land masq)));
312 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor (k land masq)));
314 Buffer.add_char buf
(Char.unsafe_chr
(0xf8 + (k lsr 24)));
315 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor ((k lsr 18) land masq)));
316 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor ((k lsr 12) land masq)));
317 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor ((k lsr 6) land masq)));
318 Buffer.add_char buf
(Char.unsafe_chr
(0x80 lor (k land masq)));
321 (**********************************************************************************)
323 (* charset_to_string *)
325 (**********************************************************************************)
327 let charset_to_string charset
=
329 | ANSI_X3_4_1968
-> "ANSI_X3.4-1968"
330 | ANSI_X3_4_1986
-> "ANSI_X3.4-1986"
334 | ISO_IR_6
-> "ISO-IR-6"
335 | ISO646_US
-> "ISO646-US"
336 | ISO_646_IRV_1991
-> "ISO_646.IRV:1991"
338 | US_ASCII
-> "US-ASCII"
339 | CSASCII
-> "CSASCII"
343 | ISO_10646_UCS_2
-> "ISO-10646-UCS-2"
345 | CSUNICODE
-> "CSUNICODE"
347 | UCS_2BE
-> "UCS-2BE"
348 | UNICODE_1_1
-> "UNICODE-1-1"
349 | UNICODEBIG
-> "UNICODEBIG"
350 | CSUNICODE11
-> "CSUNICODE11"
352 | UCS_2LE
-> "UCS-2LE"
353 | UNICODELITTLE
-> "UNICODELITTLE"
355 | ISO_10646_UCS_4
-> "ISO-10646-UCS-4"
359 | UCS_4BE
-> "UCS-4BE"
361 | UCS_4LE
-> "UCS-4LE"
365 | UTF_16BE
-> "UTF-16BE"
367 | UTF_16LE
-> "UTF-16LE"
371 | UTF_32BE
-> "UTF-32BE"
373 | UTF_32LE
-> "UTF-32LE"
375 | UNICODE_1_1_UTF_7
-> "UNICODE-1-1-UTF-7"
377 | CSUNICODE11UTF7
-> "CSUNICODE11UTF7"
379 | UCS_2_INTERNAL
-> "UCS-2-INTERNAL"
381 | UCS_2_SWAPPED
-> "UCS-2-SWAPPED"
383 | UCS_4_INTERNAL
-> "UCS-4-INTERNAL"
385 | UCS_4_SWAPPED
-> "UCS-4-SWAPPED"
393 | ISO_8859_1
-> "ISO-8859-1"
394 | ISO_IR_100
-> "ISO-IR-100"
395 | ISO8859_1
-> "ISO8859-1"
396 | ISO_8859_1_1987
-> "ISO_8859-1:1987"
399 | CSISOLATIN1
-> "CSISOLATIN1"
401 | ISO_8859_2
-> "ISO-8859-2"
402 | ISO_IR_101
-> "ISO-IR-101"
403 | ISO8859_2
-> "ISO8859-2"
404 | ISO_8859_2_1987
-> "ISO_8859-2:1987"
407 | CSISOLATIN2
-> "CSISOLATIN2"
409 | ISO_8859_3
-> "ISO-8859-3"
410 | ISO_IR_109
-> "ISO-IR-109"
411 | ISO8859_3
-> "ISO8859-3"
412 | ISO_8859_3_1988
-> "ISO_8859-3:1988"
415 | CSISOLATIN3
-> "CSISOLATIN3"
417 | ISO_8859_4
-> "ISO-8859-4"
418 | ISO_IR_110
-> "ISO-IR-110"
419 | ISO8859_4
-> "ISO8859-4"
420 | ISO_8859_4_1988
-> "ISO_8859-4:1988"
423 | CSISOLATIN4
-> "CSISOLATIN4"
425 | CYRILLIC
-> "CYRILLIC"
426 | ISO_8859_5
-> "ISO-8859-5"
427 | ISO_IR_144
-> "ISO-IR-144"
428 | ISO8859_5
-> "ISO8859-5"
429 | ISO_8859_5_1988
-> "ISO_8859-5:1988"
430 | CSISOLATINCYRILLIC
-> "CSISOLATINCYRILLIC"
433 | ASMO_708
-> "ASMO-708"
434 | ECMA_114
-> "ECMA-114"
435 | ISO_8859_6
-> "ISO-8859-6"
436 | ISO_IR_127
-> "ISO-IR-127"
437 | ISO8859_6
-> "ISO8859-6"
438 | ISO_8859_6_1987
-> "ISO_8859-6:1987"
439 | CSISOLATINARABIC
-> "CSISOLATINARABIC"
441 | ECMA_118
-> "ECMA-118"
442 | ELOT_928
-> "ELOT_928"
445 | ISO_8859_7
-> "ISO-8859-7"
446 | ISO_IR_126
-> "ISO-IR-126"
447 | ISO8859_7
-> "ISO8859-7"
448 | ISO_8859_7_1987
-> "ISO_8859-7:1987"
449 | CSISOLATINGREEK
-> "CSISOLATINGREEK"
452 | ISO_8859_8
-> "ISO-8859-8"
453 | ISO_IR_138
-> "ISO-IR-138"
454 | ISO8859_8
-> "ISO8859-8"
455 | ISO_8859_8_1988
-> "ISO_8859-8:1988"
456 | CSISOLATINHEBREW
-> "CSISOLATINHEBREW"
458 | ISO_8859_9
-> "ISO-8859-9"
459 | ISO_IR_148
-> "ISO-IR-148"
460 | ISO8859_9
-> "ISO8859-9"
461 | ISO_8859_9_1989
-> "ISO_8859-9:1989"
464 | CSISOLATIN5
-> "CSISOLATIN5"
466 | ISO_8859_10
-> "ISO-8859-10"
467 | ISO_IR_157
-> "ISO-IR-157"
468 | ISO8859_10
-> "ISO8859-10"
469 | ISO_8859_10_1992
-> "ISO_8859-10:1992"
472 | CSISOLATIN6
-> "CSISOLATIN6"
474 | ISO_8859_13
-> "ISO-8859-13"
475 | ISO_IR_179
-> "ISO-IR-179"
476 | ISO8859_13
-> "ISO8859-13"
480 | ISO_8859_14
-> "ISO-8859-14"
481 | ISO_CELTIC
-> "ISO-CELTIC"
482 | ISO_IR_199
-> "ISO-IR-199"
483 | ISO8859_14
-> "ISO8859-14"
484 | ISO_8859_14_1998
-> "ISO_8859-14:1998"
488 | ISO_8859_15
-> "ISO-8859-15"
489 | ISO_IR_203
-> "ISO-IR-203"
490 | ISO8859_15
-> "ISO8859-15"
491 | ISO_8859_15_1998
-> "ISO_8859-15:1998"
493 | ISO_8859_16
-> "ISO-8859-16"
494 | ISO_IR_226
-> "ISO-IR-226"
495 | ISO8859_16
-> "ISO8859-16"
496 | ISO_8859_16_2000
-> "ISO_8859-16:2000"
499 | CSKOI8R
-> "CSKOI8R"
503 | KOI8_RU
-> "KOI8-RU"
507 | WINDOWS_1250
-> "WINDOWS-1250"
510 | MS_CYRL
-> "MS-CYRL"
511 | WINDOWS_1251
-> "WINDOWS-1251"
514 | MS_ANSI
-> "MS-ANSI"
515 | WINDOWS_1252
-> "WINDOWS-1252"
518 | MS_GREEK
-> "MS-GREEK"
519 | WINDOWS_1253
-> "WINDOWS-1253"
522 | MS_TURK
-> "MS-TURK"
523 | WINDOWS_1254
-> "WINDOWS-1254"
526 | MS_HEBR
-> "MS-HEBR"
527 | WINDOWS_1255
-> "WINDOWS-1255"
530 | MS_ARAB
-> "MS-ARAB"
531 | WINDOWS_1256
-> "WINDOWS-1256"
534 | WINBALTRIM
-> "WINBALTRIM"
535 | WINDOWS_1257
-> "WINDOWS-1257"
538 | WINDOWS_1258
-> "WINDOWS-1258"
543 | CSPC850MULTILINGUAL
-> "CSPC850MULTILINGUAL"
548 | CSPC862LATINHEBREW
-> "CSPC862LATINHEBREW"
553 | CSIBM866
-> "CSIBM866"
556 | MACINTOSH
-> "MACINTOSH"
557 | MACROMAN
-> "MACROMAN"
558 | CSMACINTOSH
-> "CSMACINTOSH"
560 | MACCENTRALEUROPE
-> "MACCENTRALEUROPE"
562 | MACICELAND
-> "MACICELAND"
564 | MACCROATIAN
-> "MACCROATIAN"
566 | MACROMANIA
-> "MACROMANIA"
568 | MACCYRILLIC
-> "MACCYRILLIC"
570 | MACUKRAINE
-> "MACUKRAINE"
572 | MACGREEK
-> "MACGREEK"
574 | MACTURKISH
-> "MACTURKISH"
576 | MACHEBREW
-> "MACHEBREW"
578 | MACARABIC
-> "MACARABIC"
580 | MACTHAI
-> "MACTHAI"
582 | HP_ROMAN8
-> "HP-ROMAN8"
585 | CSHPROMAN8
-> "CSHPROMAN8"
587 | NEXTSTEP
-> "NEXTSTEP"
589 | ARMSCII_8
-> "ARMSCII-8"
591 | GEORGIAN_ACADEMY
-> "GEORGIAN-ACADEMY"
593 | GEORGIAN_PS
-> "GEORGIAN-PS"
597 | MULELAO_1
-> "MULELAO-1"
600 | IBM_CP1133
-> "IBM-CP1133"
602 | ISO_IR_166
-> "ISO-IR-166"
603 | TIS_620
-> "TIS-620"
605 | TIS620_0
-> "TIS620-0"
606 | TIS620_2529_1
-> "TIS620.2529-1"
607 | TIS620_2533_0
-> "TIS620.2533-0"
608 | TIS620_2533_1
-> "TIS620.2533-1"
611 | WINDOWS_874
-> "WINDOWS-874"
614 | VISCII1_1_1
-> "VISCII1.1-1"
615 | CSVISCII
-> "CSVISCII"
618 | TCVN_5712
-> "TCVN-5712"
619 | TCVN5712_1
-> "TCVN5712-1"
620 | TCVN5712_1_1993
-> "TCVN5712-1:1993"
622 | ISO_IR_14
-> "ISO-IR-14"
623 | ISO646_JP
-> "ISO646-JP"
624 | JIS_C6220_1969_RO
-> "JIS_C6220-1969-RO"
626 | CSISO14JISC6220RO
-> "CSISO14JISC6220RO"
628 | JISX0201_1976
-> "JISX0201-1976"
629 | JIS_X0201
-> "JIS_X0201"
631 | CSHALFWIDTHKATAKANA
-> "CSHALFWIDTHKATAKANA"
633 | ISO_IR_87
-> "ISO-IR-87"
634 | JIS0208
-> "JIS0208"
635 | JIS_C6226_1983
-> "JIS_C6226-1983"
636 | JIS_X0208
-> "JIS_X0208"
637 | JIS_X0208_1983
-> "JIS_X0208-1983"
638 | JIS_X0208_1990
-> "JIS_X0208-1990"
640 | CSISO87JISX0208
-> "CSISO87JISX0208"
642 | ISO_IR_159
-> "ISO-IR-159"
643 | JIS_X0212
-> "JIS_X0212"
644 | JIS_X0212_1990
-> "JIS_X0212-1990"
645 | JIS_X0212_1990_0
-> "JIS_X0212.1990-0"
647 | CSISO159JISX02121990
-> "CSISO159JISX02121990"
650 | GB_1988_80
-> "GB_1988-80"
651 | ISO_IR_57
-> "ISO-IR-57"
652 | ISO646_CN
-> "ISO646-CN"
653 | CSISO57GB1988
-> "CSISO57GB1988"
655 | CHINESE
-> "CHINESE"
656 | GB_2312_80
-> "GB_2312-80"
657 | ISO_IR_58
-> "ISO-IR-58"
658 | CSISO58GB231280
-> "CSISO58GB231280"
660 | CN_GB_ISOIR165
-> "CN-GB-ISOIR165"
661 | ISO_IR_165
-> "ISO-IR-165"
663 | ISO_IR_149
-> "ISO-IR-149"
665 | KSC_5601
-> "KSC_5601"
666 | KS_C_5601_1987
-> "KS_C_5601-1987"
667 | KS_C_5601_1989
-> "KS_C_5601-1989"
668 | CSKSC56011987
-> "CSKSC56011987"
672 | EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE
-> "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE"
673 | CSEUCPKDFMTJAPANESE
-> "CSEUCPKDFMTJAPANESE"
675 | MS_KANJI
-> "MS_KANJI"
676 | SHIFT_JIS
-> "SHIFT-JIS"
678 | CSSHIFTJIS
-> "CSSHIFTJIS"
682 | ISO_2022_JP
-> "ISO-2022-JP"
683 | CSISO2022JP
-> "CSISO2022JP"
685 | ISO_2022_JP_1
-> "ISO-2022-JP-1"
687 | ISO_2022_JP_2
-> "ISO-2022-JP-2"
688 | CSISO2022JP2
-> "CSISO2022JP2"
694 | CSGB2312
-> "CSGB2312"
699 | GB18030
-> "GB18030"
701 | ISO_2022_CN
-> "ISO-2022-CN"
702 | CSISO2022CN
-> "CSISO2022CN"
704 | ISO_2022_CN_EXT
-> "ISO-2022-CN-EXT"
707 | HZ_GB_2312
-> "HZ-GB-2312"
711 | CSEUCTW
-> "CSEUCTW"
714 | BIG_FIVE
-> "BIG-FIVE"
716 | BIGFIVE
-> "BIGFIVE"
717 | CN_BIG5
-> "CN-BIG5"
722 | BIG5_HKSCS
-> "BIG5-HKSCS"
723 | BIG5HKSCS
-> "BIG5HKSCS"
727 | CSEUCKR
-> "CSEUCKR"
735 | ISO_2022_KR
-> "ISO-2022-KR"
736 | CSISO2022KR
-> "CSISO2022KR"
741 | CSPC8CODEPAGE437
-> "CSPC8CODEPAGE437"
747 | CSPC775BALTIC
-> "CSPC775BALTIC"
752 | CSPCP852
-> "CSPCP852"
759 | CSIBM855
-> "CSIBM855"
764 | CSIBM857
-> "CSIBM857"
771 | CSIBM860
-> "CSIBM860"
777 | CSIBM861
-> "CSIBM861"
782 | CSIBM863
-> "CSIBM863"
786 | CSIBM864
-> "CSIBM864"
791 | CSIBM865
-> "CSIBM865"
797 | CSIBM869
-> "CSIBM869"
801 (**********************************************************************************)
803 (* charset_from_string *)
805 (**********************************************************************************)
807 let charset_from_string s
=
809 | "ANSI_X3.4-1968" -> ANSI_X3_4_1968
810 | "ANSI_X3.4-1986" -> ANSI_X3_4_1986
814 | "ISO-IR-6" -> ISO_IR_6
815 | "ISO646-US" -> ISO646_US
816 | "ISO_646.IRV:1991" -> ISO_646_IRV_1991
818 | "US-ASCII" -> US_ASCII
819 | "CSASCII" -> CSASCII
823 | "ISO-10646-UCS-2" -> ISO_10646_UCS_2
825 | "CSUNICODE" -> CSUNICODE
827 | "UCS-2BE" -> UCS_2BE
828 | "UNICODE-1-1" -> UNICODE_1_1
829 | "UNICODEBIG" -> UNICODEBIG
830 | "CSUNICODE11" -> CSUNICODE11
832 | "UCS-2LE" -> UCS_2LE
833 | "UNICODELITTLE" -> UNICODELITTLE
835 | "ISO-10646-UCS-4" -> ISO_10646_UCS_4
839 | "UCS-4BE" -> UCS_4BE
841 | "UCS-4LE" -> UCS_4LE
845 | "UTF-16BE" -> UTF_16BE
847 | "UTF-16LE" -> UTF_16LE
851 | "UTF-32BE" -> UTF_32BE
853 | "UTF-32LE" -> UTF_32LE
855 | "UNICODE-1-1-UTF-7" -> UNICODE_1_1_UTF_7
857 | "CSUNICODE11UTF7" -> CSUNICODE11UTF7
859 | "UCS-2-INTERNAL" -> UCS_2_INTERNAL
861 | "UCS-2-SWAPPED" -> UCS_2_SWAPPED
863 | "UCS-4-INTERNAL" -> UCS_4_INTERNAL
865 | "UCS-4-SWAPPED" -> UCS_4_SWAPPED
873 | "ISO-8859-1" -> ISO_8859_1
874 | "ISO-IR-100" -> ISO_IR_100
875 | "ISO8859-1" -> ISO8859_1
876 | "ISO_8859-1" -> ISO_8859_1
877 | "ISO_8859-1:1987" -> ISO_8859_1_1987
880 | "CSISOLATIN1" -> CSISOLATIN1
882 | "ISO-8859-2" -> ISO_8859_2
883 | "ISO-IR-101" -> ISO_IR_101
884 | "ISO8859-2" -> ISO8859_2
885 | "ISO_8859-2" -> ISO_8859_2
886 | "ISO_8859-2:1987" -> ISO_8859_2_1987
889 | "CSISOLATIN2" -> CSISOLATIN2
891 | "ISO-8859-3" -> ISO_8859_3
892 | "ISO-IR-109" -> ISO_IR_109
893 | "ISO8859-3" -> ISO8859_3
894 | "ISO_8859-3" -> ISO_8859_3
895 | "ISO_8859-3:1988" -> ISO_8859_3_1988
898 | "CSISOLATIN3" -> CSISOLATIN3
900 | "ISO-8859-4" -> ISO_8859_4
901 | "ISO-IR-110" -> ISO_IR_110
902 | "ISO8859-4" -> ISO8859_4
903 | "ISO_8859-4" -> ISO_8859_4
904 | "ISO_8859-4:1988" -> ISO_8859_4_1988
907 | "CSISOLATIN4" -> CSISOLATIN4
909 | "CYRILLIC" -> CYRILLIC
910 | "ISO-8859-5" -> ISO_8859_5
911 | "ISO-IR-144" -> ISO_IR_144
912 | "ISO8859-5" -> ISO8859_5
913 | "ISO_8859-5" -> ISO_8859_5
914 | "ISO_8859-5:1988" -> ISO_8859_5_1988
915 | "CSISOLATINCYRILLIC" -> CSISOLATINCYRILLIC
918 | "ASMO-708" -> ASMO_708
919 | "ECMA-114" -> ECMA_114
920 | "ISO-8859-6" -> ISO_8859_6
921 | "ISO-IR-127" -> ISO_IR_127
922 | "ISO8859-6" -> ISO8859_6
923 | "ISO_8859-6" -> ISO_8859_6
924 | "ISO_8859-6:1987" -> ISO_8859_6_1987
925 | "CSISOLATINARABIC" -> CSISOLATINARABIC
927 | "ECMA-118" -> ECMA_118
928 | "ELOT_928" -> ELOT_928
931 | "ISO-8859-7" -> ISO_8859_7
932 | "ISO-IR-126" -> ISO_IR_126
933 | "ISO8859-7" -> ISO8859_7
934 | "ISO_8859-7" -> ISO_8859_7
935 | "ISO_8859-7:1987" -> ISO_8859_7_1987
936 | "CSISOLATINGREEK" -> CSISOLATINGREEK
939 | "ISO-8859-8" -> ISO_8859_8
940 | "ISO-IR-138" -> ISO_IR_138
941 | "ISO8859-8" -> ISO8859_8
942 | "ISO_8859-8" -> ISO_8859_8
943 | "ISO_8859-8:1988" -> ISO_8859_8_1988
944 | "CSISOLATINHEBREW" -> CSISOLATINHEBREW
946 | "ISO-8859-9" -> ISO_8859_9
947 | "ISO-IR-148" -> ISO_IR_148
948 | "ISO8859-9" -> ISO8859_9
949 | "ISO_8859-9" -> ISO_8859_9
950 | "ISO_8859-9:1989" -> ISO_8859_9_1989
953 | "CSISOLATIN5" -> CSISOLATIN5
955 | "ISO-8859-10" -> ISO_8859_10
956 | "ISO-IR-157" -> ISO_IR_157
957 | "ISO8859-10" -> ISO8859_10
958 | "ISO_8859-10" -> ISO_8859_10
959 | "ISO_8859-10:1992" -> ISO_8859_10_1992
962 | "CSISOLATIN6" -> CSISOLATIN6
964 | "ISO-8859-13" -> ISO_8859_13
965 | "ISO-IR-179" -> ISO_IR_179
966 | "ISO8859-13" -> ISO8859_13
967 | "ISO_8859-13" -> ISO_8859_13
971 | "ISO-8859-14" -> ISO_8859_14
972 | "ISO-CELTIC" -> ISO_CELTIC
973 | "ISO-IR-199" -> ISO_IR_199
974 | "ISO8859-14" -> ISO8859_14
975 | "ISO_8859-14" -> ISO_8859_14
976 | "ISO_8859-14:1998" -> ISO_8859_14_1998
980 | "ISO-8859-15" -> ISO_8859_15
981 | "ISO-IR-203" -> ISO_IR_203
982 | "ISO8859-15" -> ISO8859_15
983 | "ISO_8859-15" -> ISO_8859_15
984 | "ISO_8859-15:1998" -> ISO_8859_15_1998
986 | "ISO-8859-16" -> ISO_8859_16
987 | "ISO-IR-226" -> ISO_IR_226
988 | "ISO8859-16" -> ISO8859_16
989 | "ISO_8859-16" -> ISO_8859_16
990 | "ISO_8859-16:2000" -> ISO_8859_16_2000
993 | "CSKOI8R" -> CSKOI8R
997 | "KOI8-RU" -> KOI8_RU
1001 | "WINDOWS-1250" -> WINDOWS_1250
1003 | "CP1251" -> CP1251
1004 | "MS-CYRL" -> MS_CYRL
1005 | "WINDOWS-1251" -> WINDOWS_1251
1007 | "CP1252" -> CP1252
1008 | "MS-ANSI" -> MS_ANSI
1009 | "WINDOWS-1252" -> WINDOWS_1252
1011 | "CP1253" -> CP1253
1012 | "MS-GREEK" -> MS_GREEK
1013 | "WINDOWS-1253" -> WINDOWS_1253
1015 | "CP1254" -> CP1254
1016 | "MS-TURK" -> MS_TURK
1017 | "WINDOWS-1254" -> WINDOWS_1254
1019 | "CP1255" -> CP1255
1020 | "MS-HEBR" -> MS_HEBR
1021 | "WINDOWS-1255" -> WINDOWS_1255
1023 | "CP1256" -> CP1256
1024 | "MS-ARAB" -> MS_ARAB
1025 | "WINDOWS-1256" -> WINDOWS_1256
1027 | "CP1257" -> CP1257
1028 | "WINBALTRIM" -> WINBALTRIM
1029 | "WINDOWS-1257" -> WINDOWS_1257
1031 | "CP1258" -> CP1258
1032 | "WINDOWS-1258" -> WINDOWS_1258
1036 | "IBM850" -> IBM850
1037 | "CSPC850MULTILINGUAL" -> CSPC850MULTILINGUAL
1041 | "IBM862" -> IBM862
1042 | "CSPC862LATINHEBREW" -> CSPC862LATINHEBREW
1046 | "IBM866" -> IBM866
1047 | "CSIBM866" -> CSIBM866
1050 | "MACINTOSH" -> MACINTOSH
1051 | "MACROMAN" -> MACROMAN
1052 | "CSMACINTOSH" -> CSMACINTOSH
1054 | "MACCENTRALEUROPE" -> MACCENTRALEUROPE
1056 | "MACICELAND" -> MACICELAND
1058 | "MACCROATIAN" -> MACCROATIAN
1060 | "MACROMANIA" -> MACROMANIA
1062 | "MACCYRILLIC" -> MACCYRILLIC
1064 | "MACUKRAINE" -> MACUKRAINE
1066 | "MACGREEK" -> MACGREEK
1068 | "MACTURKISH" -> MACTURKISH
1070 | "MACHEBREW" -> MACHEBREW
1072 | "MACARABIC" -> MACARABIC
1074 | "MACTHAI" -> MACTHAI
1076 | "HP-ROMAN8" -> HP_ROMAN8
1078 | "ROMAN8" -> ROMAN8
1079 | "CSHPROMAN8" -> CSHPROMAN8
1081 | "NEXTSTEP" -> NEXTSTEP
1083 | "ARMSCII-8" -> ARMSCII_8
1085 | "GEORGIAN-ACADEMY" -> GEORGIAN_ACADEMY
1087 | "GEORGIAN-PS" -> GEORGIAN_PS
1089 | "KOI8-T" -> KOI8_T
1091 | "MULELAO-1" -> MULELAO_1
1093 | "CP1133" -> CP1133
1094 | "IBM-CP1133" -> IBM_CP1133
1096 | "ISO-IR-166" -> ISO_IR_166
1097 | "TIS-620" -> TIS_620
1098 | "TIS620" -> TIS620
1099 | "TIS620-0" -> TIS620_0
1100 | "TIS620.2529-1" -> TIS620_2529_1
1101 | "TIS620.2533-0" -> TIS620_2533_0
1102 | "TIS620.2533-1" -> TIS620_2533_1
1105 | "WINDOWS-874" -> WINDOWS_874
1107 | "VISCII" -> VISCII
1108 | "VISCII1.1-1" -> VISCII1_1_1
1109 | "CSVISCII" -> CSVISCII
1112 | "TCVN-5712" -> TCVN_5712
1113 | "TCVN5712-1" -> TCVN5712_1
1114 | "TCVN5712-1:1993" -> TCVN5712_1_1993
1116 | "ISO-IR-14" -> ISO_IR_14
1117 | "ISO646-JP" -> ISO646_JP
1118 | "JIS_C6220-1969-RO" -> JIS_C6220_1969_RO
1120 | "CSISO14JISC6220RO" -> CSISO14JISC6220RO
1122 | "JISX0201-1976" -> JISX0201_1976
1123 | "JIS_X0201" -> JIS_X0201
1125 | "CSHALFWIDTHKATAKANA" -> CSHALFWIDTHKATAKANA
1127 | "ISO-IR-87" -> ISO_IR_87
1128 | "JIS0208" -> JIS0208
1129 | "JIS_C6226-1983" -> JIS_C6226_1983
1130 | "JIS_X0208" -> JIS_X0208
1131 | "JIS_X0208-1983" -> JIS_X0208_1983
1132 | "JIS_X0208-1990" -> JIS_X0208_1990
1134 | "CSISO87JISX0208" -> CSISO87JISX0208
1136 | "ISO-IR-159" -> ISO_IR_159
1137 | "JIS_X0212" -> JIS_X0212
1138 | "JIS_X0212-1990" -> JIS_X0212_1990
1139 | "JIS_X0212.1990-0" -> JIS_X0212_1990_0
1141 | "CSISO159JISX02121990" -> CSISO159JISX02121990
1144 | "GB_1988-80" -> GB_1988_80
1145 | "ISO-IR-57" -> ISO_IR_57
1146 | "ISO646-CN" -> ISO646_CN
1147 | "CSISO57GB1988" -> CSISO57GB1988
1149 | "CHINESE" -> CHINESE
1150 | "GB_2312-80" -> GB_2312_80
1151 | "ISO-IR-58" -> ISO_IR_58
1152 | "CSISO58GB231280" -> CSISO58GB231280
1154 | "CN-GB-ISOIR165" -> CN_GB_ISOIR165
1155 | "ISO-IR-165" -> ISO_IR_165
1157 | "ISO-IR-149" -> ISO_IR_149
1158 | "KOREAN" -> KOREAN
1159 | "KSC_5601" -> KSC_5601
1160 | "KS_C_5601-1987" -> KS_C_5601_1987
1161 | "KS_C_5601-1989" -> KS_C_5601_1989
1162 | "CSKSC56011987" -> CSKSC56011987
1164 | "EUC-JP" -> EUC_JP
1166 | "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE" -> EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE
1167 | "CSEUCPKDFMTJAPANESE" -> CSEUCPKDFMTJAPANESE
1169 | "MS_KANJI" -> MS_KANJI
1170 | "SHIFT-JIS" -> SHIFT_JIS
1171 | "SHIFT_JIS" -> SHIFT_JIS
1173 | "CSSHIFTJIS" -> CSSHIFTJIS
1177 | "ISO-2022-JP" -> ISO_2022_JP
1178 | "CSISO2022JP" -> CSISO2022JP
1180 | "ISO-2022-JP-1" -> ISO_2022_JP_1
1182 | "ISO-2022-JP-2" -> ISO_2022_JP_2
1183 | "CSISO2022JP2" -> CSISO2022JP2
1186 | "EUC-CN" -> EUC_CN
1188 | "GB2312" -> GB2312
1189 | "CSGB2312" -> CSGB2312
1194 | "GB18030" -> GB18030
1196 | "ISO-2022-CN" -> ISO_2022_CN
1197 | "CSISO2022CN" -> CSISO2022CN
1199 | "ISO-2022-CN-EXT" -> ISO_2022_CN_EXT
1202 | "HZ-GB-2312" -> HZ_GB_2312
1204 | "EUC-TW" -> EUC_TW
1206 | "CSEUCTW" -> CSEUCTW
1209 | "BIG-FIVE" -> BIG_FIVE
1211 | "BIGFIVE" -> BIGFIVE
1212 | "CN-BIG5" -> CN_BIG5
1213 | "CSBIG5" -> CSBIG5
1217 | "BIG5-HKSCS" -> BIG5_HKSCS
1218 | "BIG5HKSCS" -> BIG5HKSCS
1220 | "EUC-KR" -> EUC_KR
1222 | "CSEUCKR" -> CSEUCKR
1227 | "CP1361" -> CP1361
1230 | "ISO-2022-KR" -> ISO_2022_KR
1231 | "CSISO2022KR" -> CSISO2022KR
1235 | "IBM437" -> IBM437
1236 | "CSPC8CODEPAGE437" -> CSPC8CODEPAGE437
1241 | "IBM775" -> IBM775
1242 | "CSPC775BALTIC" -> CSPC775BALTIC
1246 | "IBM852" -> IBM852
1247 | "CSPCP852" -> CSPCP852
1253 | "IBM855" -> IBM855
1254 | "CSIBM855" -> CSIBM855
1258 | "IBM857" -> IBM857
1259 | "CSIBM857" -> CSIBM857
1265 | "IBM860" -> IBM860
1266 | "CSIBM860" -> CSIBM860
1271 | "IBM861" -> IBM861
1272 | "CSIBM861" -> CSIBM861
1276 | "IBM863" -> IBM863
1277 | "CSIBM863" -> CSIBM863
1280 | "IBM864" -> IBM864
1281 | "CSIBM864" -> CSIBM864
1285 | "IBM865" -> IBM865
1286 | "CSIBM865" -> CSIBM865
1291 | "IBM869" -> IBM869
1292 | "CSIBM869" -> CSIBM869
1294 | "CP1125" -> CP1125
1299 (**********************************************************************************)
1301 (* normalize_language *)
1303 (**********************************************************************************)
1305 let normalize_language s
=
1306 let s = String.uppercase
s in
1307 if String.length
s > 1
1309 (* We have to distinguish between ZH_tw and ZH_cn here
1310 ZH_tw = BIG5/Chinese traditional -- ZH_cn = GBK/Chinese simplified *)
1311 if String.sub
s 0 2 = "ZH" && String.length
s > 4 then
1317 (**********************************************************************************)
1321 (**********************************************************************************)
1324 let charset_aliases =
1326 [ANSI_X3_4_1968; ANSI_X3_4_1986; ASCII; CP367; IBM367; ISO_IR_6; ISO646_US; ISO_646_IRV_1991; US; US_ASCII; CSASCII];
1328 [ISO_10646_UCS_2; UCS_2; CSUNICODE];
1329 [UCS_2BE; UNICODE_1_1; UNICODEBIG; CSUNICODE11];
1330 [UCS_2LE; UNICODELITTLE];
1331 [ISO_10646_UCS_4; UCS_4; CSUCS4];
1340 [UNICODE_1_1_UTF_7; UTF_7; CSUNICODE11UTF7];
1347 [CP819; IBM819; ISO_8859_1; ISO_IR_100; ISO8859_1; ISO_8859_1_1987; L1; LATIN1; CSISOLATIN1];
1348 [ISO_8859_2; ISO_IR_101; ISO8859_2; ISO_8859_2_1987; L2; LATIN2; CSISOLATIN2];
1349 [ISO_8859_3; ISO_IR_109; ISO8859_3; ISO_8859_3_1988; L3; LATIN3; CSISOLATIN3];
1350 [ISO_8859_4; ISO_IR_110; ISO8859_4; ISO_8859_4_1988; L4; LATIN4; CSISOLATIN4];
1351 [CYRILLIC; ISO_8859_5; ISO_IR_144; ISO8859_5; ISO_8859_5_1988; CSISOLATINCYRILLIC];
1352 [ARABIC; ASMO_708; ECMA_114; ISO_8859_6; ISO_IR_127; ISO8859_6; ISO_8859_6_1987; CSISOLATINARABIC];
1353 [ECMA_118; ELOT_928; GREEK; GREEK8; ISO_8859_7; ISO_IR_126; ISO8859_7; ISO_8859_7_1987; CSISOLATINGREEK];
1354 [HEBREW; ISO_8859_8; ISO_IR_138; ISO8859_8; ISO_8859_8_1988; CSISOLATINHEBREW];
1355 [ISO_8859_9; ISO_IR_148; ISO8859_9; ISO_8859_9_1989; L5; LATIN5; CSISOLATIN5];
1356 [ISO_8859_10; ISO_IR_157; ISO8859_10; ISO_8859_10_1992; L6; LATIN6; CSISOLATIN6];
1357 [ISO_8859_13; ISO_IR_179; ISO8859_13; L7; LATIN7];
1358 [ISO_8859_14; ISO_CELTIC; ISO_IR_199; ISO_8859_14; ISO_8859_14_1998; L8; LATIN8];
1359 [ISO_8859_15; ISO_IR_203; ISO8859_15; ISO_8859_15_1998];
1360 [ISO_8859_16; ISO_IR_226; ISO8859_16; ISO_8859_16_2000];
1364 [CP1250; MS_EE; WINDOWS_1250];
1365 [CP1251; MS_CYRL; WINDOWS_1251];
1366 [CP1252; MS_ANSI; WINDOWS_1252];
1367 [CP1253; MS_GREEK; WINDOWS_1253];
1368 [CP1254; MS_TURK; WINDOWS_1254];
1369 [CP1255; MS_HEBR; WINDOWS_1255];
1370 [CP1256; MS_ARAB; WINDOWS_1256];
1371 [CP1257; WINBALTRIM; WINDOWS_1257];
1372 [CP1258; WINDOWS_1258];
1373 [I_850; CP850; IBM850; CSPC850MULTILINGUAL];
1374 [I_862; CP862; IBM862; CSPC862LATINHEBREW];
1375 [I_866; CP866; IBM866; CSIBM866];
1376 [MAC; MACINTOSH; MACROMAN; CSMACINTOSH];
1388 [HP_ROMAN8; R8; ROMAN8; CSHPROMAN8]; (* no region *)
1389 [NEXTSTEP
]; (* no region *)
1395 [CP1133
; IBM_CP1133
];
1396 [ISO_IR_166
; TIS_620
; TIS620
; TIS620_0
; TIS620_2529_1
; TIS620_2533_0
; TIS620_2533_1
];
1397 [CP874
; WINDOWS_874
];
1398 [VISCII
; VISCII1_1_1
; CSVISCII
];
1399 [TCVN
; TCVN_5712
; TCVN5712_1
; TCVN5712_1_1993
];
1400 [ISO_IR_14
; ISO646_JP
; JIS_C6220_1969_RO
; JP
; CSISO14JISC6220RO
];
1401 [JISX0201_1976
; JIS_X0201
; X0201
; CSHALFWIDTHKATAKANA
];
1402 [ISO_IR_87
; JIS0208
; JIS_C6226_1983
; JIS_X0208
; JIS_X0208_1983
; JIS_X0208_1990
; X0208
; CSISO87JISX0208
];
1403 [ISO_IR_159
; JIS_X0212
; JIS_X0212_1990
; JIS_X0212_1990_0
; X0212
; CSISO159JISX02121990
];
1404 [CN
; GB_1988_80
; ISO_IR_57
; ISO646_CN
; CSISO57GB1988
];
1405 [CHINESE
; GB_2312_80
; ISO_IR_58
; CSISO58GB231280
];
1406 [CN_GB_ISOIR165
; ISO_IR_165
];
1407 [ISO_IR_149
; KOREAN
; KSC_5601
; KS_C_5601_1987
; KS_C_5601_1989
; CSKSC56011987
];
1408 [EUC_JP
; EUCJP
; EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE
; CSEUCPKDFMTJAPANESE
];
1409 [MS_KANJI
; SHIFT_JIS
; SJIS
; CSSHIFTJIS
];
1411 [ISO_2022_JP
; CSISO2022JP
];
1413 [ISO_2022_JP_2
; CSISO2022JP2
];
1414 [CN_GB
; EUC_CN
; EUCCN
; GB2312
; CSGB2312
];
1417 [ISO_2022_CN
; CSISO2022CN
];
1420 [EUC_TW
; EUCTW
; CSEUCTW
];
1421 [BIG_5
; BIG_FIVE
; BIG5
; BIGFIVE
; CN_BIG5
; CSBIG5
];
1423 [BIG5_HKSCS
; BIG5HKSCS
];
1424 [EUC_KR
; EUCKR
; CSEUCKR
];
1427 [ISO_2022_KR
; CSISO2022KR
];
1428 [I_437
; CP437
; IBM437
; CSPC8CODEPAGE437
];
1430 [CP775
; IBM775
; CSPC775BALTIC
];
1431 [I_852
; CP852
; IBM852
; CSPCP852
];
1432 [CP853
]; (* no region *)
1433 [I_855
; CP855
; IBM855
; CSIBM855
];
1434 [I_857
; CP857
; IBM857
; CSIBM857
];
1435 [CP858
]; (* no region *)
1436 [I_860
; CP860
; IBM860
; CSIBM860
];
1437 [I_861
; CP_IS
; CP861
; IBM861
; CSIBM861
];
1438 [I_863
; CP863
; IBM863
; CSIBM863
];
1439 [CP864
; IBM864
; CSIBM864
];
1440 [I_865
; CP865
; IBM865
; CSIBM865
];
1441 [I_869
; CP_GR
; CP869
; IBM869
; CSIBM869
];
1448 [ANSI_X3_4_1968
; ANSI_X3_4_1986
; ASCII
; CP367
; IBM367
; ISO_IR_6
; ISO646_US
; ISO_646_IRV_1991
; US
; US_ASCII
; CSASCII
];
1454 [CP1256
; MS_ARAB
; WINDOWS_1256
];
1455 [ARABIC
; ASMO_708
; ECMA_114
; ISO_8859_6
; ISO_IR_127
; ISO8859_6
; ISO_8859_6_1987
; CSISOLATINARABIC
];
1456 [CP864
; IBM864
; CSIBM864
];
1467 [CP1257
; WINBALTRIM
; WINDOWS_1257
];
1468 [ISO_8859_13
; ISO_IR_179
; ISO8859_13
; L7
; LATIN7
];
1469 [ISO_8859_4
; ISO_IR_110
; ISO8859_4
; ISO_8859_4_1988
; L4
; LATIN4
; CSISOLATIN4
];
1470 [CP775
; IBM775
; CSPC775BALTIC
];
1477 [ISO_8859_14
; ISO_CELTIC
; ISO_IR_199
; ISO8859_14
; ISO_8859_14_1998
; L8
; LATIN8
];
(* Charset groups offered for Central European languages (selected for
   language codes such as "SL", "SR" and "RO" by the language dispatch below).
   Each inner list appears to hold the alternative names of one encoding,
   with the preferred name first -- presumably; confirm against the
   [charset] type declaration. *)
1480 let central_european =
1482 [CP1250
; MS_EE
; WINDOWS_1250
];
1483 [ISO_8859_2
; ISO_IR_101
; ISO8859_2
; ISO_8859_2_1987
; L2
; LATIN2
; CSISOLATIN2
];
1484 [I_852
; CP852
; IBM852
; CSPCP852
];
(* Charset groups offered for Simplified Chinese (selected for "ZH_CN" and,
   after the traditional set, for plain "ZH" by the language dispatch below).
   Each inner list appears to hold the alternative names of one encoding,
   with the preferred name first -- presumably; confirm against the
   [charset] type declaration. *)
1488 let chinese_simplified =
1492 [CN_GB
; EUC_CN
; EUCCN
; GB2312
; CSGB2312
];
1493 [CHINESE
; GB_2312_80
; ISO_IR_58
; CSISO58GB231280
];
1494 [CN
; GB_1988_80
; ISO_IR_57
; ISO646_CN
; CSISO57GB1988
];
1495 [CN_GB_ISOIR165
; ISO_IR_165
];
1496 [ISO_2022_CN
; CSISO2022CN
];
(* Charset groups offered for Traditional Chinese (selected for "ZH_TW" and
   "ZH" by the language dispatch below): Big5, Big5-HKSCS and EUC-TW.
   Each inner list appears to hold the alternative names of one encoding,
   with the preferred name first -- presumably; confirm against the
   [charset] type declaration. *)
1501 let chinese_traditional =
1503 [BIG_5
; BIG_FIVE
; BIG5
; BIGFIVE
; CN_BIG5
; CSBIG5
];
1504 [BIG5_HKSCS
; BIG5HKSCS
];
1505 [EUC_TW
; EUCTW
; CSEUCTW
];
1511 [CP1251
; MS_CYRL
; WINDOWS_1251
];
1512 [I_866
; CP866
; IBM866
; CSIBM866
];
1514 [CYRILLIC
; ISO_8859_5
; ISO_IR_144
; ISO8859_5
; ISO_8859_5_1988
; CSISOLATINCYRILLIC
];
1515 [I_855
; CP855
; IBM855
; CSIBM855
];
1530 [CP1253
; MS_GREEK
; WINDOWS_1253
];
1531 [ECMA_118
; ELOT_928
; GREEK
; GREEK8
; ISO_8859_7
; ISO_IR_126
; ISO8859_7
; ISO_8859_7_1987
; CSISOLATINGREEK
];
1534 [I_869
; CP_GR
; CP869
; IBM869
; CSIBM869
];
1539 [CP1255
; MS_HEBR
; WINDOWS_1255
];
1540 [HEBREW
; ISO_8859_8
; ISO_IR_138
; ISO8859_8
; ISO_8859_8_1988
; CSISOLATINHEBREW
];
1541 [I_862
; CP862
; IBM862
; CSPC862LATINHEBREW
];
1547 [EUC_JP
; EUCJP
; EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE
; CSEUCPKDFMTJAPANESE
];
1548 [ISO_2022_JP
; CSISO2022JP
];
1550 [ISO_2022_JP_2
; CSISO2022JP2
];
1551 [MS_KANJI
; SHIFT_JIS
; SJIS
; CSSHIFTJIS
];
1552 [ISO_IR_14
; ISO646_JP
; JIS_C6220_1969_RO
; JP
; CSISO14JISC6220RO
];
1553 [JISX0201_1976
; JIS_X0201
; X0201
; CSHALFWIDTHKATAKANA
];
1554 [ISO_IR_87
; JIS0208
; JIS_C6226_1983
; JIS_X0208
; JIS_X0208_1983
; JIS_X0208_1990
; X0208
; CSISO87JISX0208
];
1555 [ISO_IR_159
; JIS_X0212
; JIS_X0212_1990
; JIS_X0212_1990_0
; X0212
; CSISO159JISX02121990
];
1563 [EUC_KR
; EUCKR
; CSEUCKR
];
1564 [ISO_2022_KR
; CSISO2022KR
];
1565 [ISO_IR_149
; KOREAN
; KSC_5601
; KS_C_5601_1987
; KS_C_5601_1989
; CSKSC56011987
];
1570 [ISO_8859_10
; ISO_IR_157
; ISO8859_10
; ISO_8859_10_1992
; L6
; LATIN6
; CSISOLATIN6
];
1571 [I_861
; CP_IS
; CP861
; IBM861
; CSIBM861
];
1573 [I_865
; CP865
; IBM865
; CSIBM865
];
1578 [ISO_8859_16
; ISO_IR_226
; ISO8859_16
; ISO_8859_16_2000
];
(* Charset group offered for South European languages (selected for "MT" by
   the language dispatch below); only the ISO-8859-3 / Latin-3 names are
   visible here. *)
1582 let south_european =
1584 [ISO_8859_3
; ISO_IR_109
; ISO8859_3
; ISO_8859_3_1988
; L3
; LATIN3
; CSISOLATIN3
];
1594 [CP874
; WINDOWS_874
];
1595 [ISO_IR_166
; TIS_620
; TIS620
; TIS620_0
; TIS620_2529_1
; TIS620_2533_0
; TIS620_2533_1
];
1597 [CP1133
; IBM_CP1133
];
1603 [CP1254
; MS_TURK
; WINDOWS_1254
];
1604 [ISO_8859_9
; ISO_IR_148
; ISO8859_9
; ISO_8859_9_1989
; L5
; LATIN5
; CSISOLATIN5
];
1605 [I_857
; CP857
; IBM857
; CSIBM857
];
1611 [ISO_10646_UCS_2
; UCS_2
; CSUNICODE
];
1612 [UCS_2BE
; UNICODE_1_1
; UNICODEBIG
; CSUNICODE11
];
1613 [UCS_2LE
; UNICODELITTLE
];
1614 [ISO_10646_UCS_4
; UCS_4
; CSUCS4
];
1623 [UNICODE_1_1_UTF_7
; UTF_7
; CSUNICODE11UTF7
];
1632 [CP1258
; WINDOWS_1258
];
1633 [VISCII
; VISCII1_1_1
; CSVISCII
];
1634 [TCVN
; TCVN_5712
; TCVN5712_1
; TCVN5712_1_1993
];
(* Charset groups offered for Western European languages (selected for
   language codes such as "CY" and "WA" by the language dispatch below).
   Each inner list appears to hold the alternative names of one encoding,
   with the preferred name first -- presumably; confirm against the
   [charset] type declaration. Every name is listed once per group. *)
1637 let western_european =
1639 [CP1252
; MS_ANSI
; WINDOWS_1252
];
1640 [ISO_8859_15
; ISO_IR_203
; ISO8859_15
; ISO_8859_15_1998
];
1641 [I_850
; CP850
; IBM850
; CSPC850MULTILINGUAL
];
1642 [CP819
; IBM819
; ISO_8859_1
; ISO_IR_100
; ISO8859_1
; ISO_8859_1_1987
; L1
; LATIN1
; CSISOLATIN1
];
1643 [MAC
; MACINTOSH
; MACROMAN
; CSMACINTOSH
];
1644 [I_437
; CP437
; IBM437
; CSPC8CODEPAGE437
];
1645 [I_860
; CP860
; IBM860
; CSIBM860
];
1646 [I_863
; CP863
; IBM863
; CSIBM863
];
1649 let convert ~from_charset ~to_charset
s =
1650 if s <> "" then begin
1651 let t = charset_to_string to_charset
in
1652 let f = charset_to_string from_charset
in
1653 convert_string
s t f
1656 let safe_convert enc
s =
1662 ~from_charset
: (charset_from_string enc
)
1667 (* Locale specific conversions *)
1668 module Locale
= struct
1670 (* FIXME move away! *)
1672 (* block signals until core started correctly *)
1673 (MlUnix.set_signal
Sys.sigint
1674 (Sys.Signal_handle
(fun _
-> ())));
1675 (MlUnix.set_signal
Sys.sigterm
1676 (Sys.Signal_handle
(fun _
-> ())))
1680 let cs = get_charset
() in
1681 charset_from_string cs
1684 let locale_string = charset_to_string locale
1686 let (enc_list
: string list
ref) = ref []
1688 let char_const = "_"
1690 let default_language =
1691 let s = get_default_language
() in
1692 let s = normalize_language s in
1705 chinese_traditional;
1723 (* See http://www.gnu.org/software/gettext/manual/html_chapter/gettext_15.html#SEC221
1724 * The strategy is not perfect. Any comment that helps improve it is highly appreciated.
1725 * The charset list shall be improved according to the language detected on the
1729 let charset_list_from_language lang
=
1731 li := ascii :: unicode :: !li;
1734 "AR" -> li := arabic :: !li
1735 | "HY" -> li := armenian :: !li
1738 | "MI" -> li := baltic :: !li
1739 | "CY" -> li := celtic :: western_european :: !li
1746 | "SL" -> li := central_european :: !li
1748 | "SR" -> li := central_european :: cyrillic ::!li
1749 | "ZH_CN" -> li := chinese_simplified :: !li
1750 | "ZH_TW" -> li := chinese_traditional :: !li
1751 | "ZH" -> li := chinese_traditional :: chinese_simplified :: !li
1756 | "UK" -> li := cyrillic :: !li
1757 | "KA" -> li := georgian :: !li
1758 | "EL" -> li := greek :: !li
1761 | "IW" -> li := hebrew :: !li
1762 | "JA" -> li := japanese :: !li
1763 | "KO" -> li := korean :: !li
1764 | "RO" -> li := romanian :: central_european :: !li
1765 | "MT" -> li := south_european :: !li
1766 | "TG" -> li := tajik :: !li
1767 | "TH" -> li := thai :: !li
1768 | "TR" -> li := turkish :: !li
1769 | "VI" -> li := vietnamese :: !li
1801 | "WA" -> li := western_european :: !li
1806 let set_default_charset_list (lang
: string) =
1807 (* Let's get rid of charset aliases *)
1808 let l = List.map
(fun li -> List.hd
li) (charset_list_from_language lang
) in
1809 enc_list
:= List.map
(fun c
-> charset_to_string c
) l;
1810 (* Printf2.lprintf "List of charmap used to convert the strings:\n";
1811 List.iter (fun enc ->
1812 Printf2.lprintf " Use encoding %s\n" enc;
1814 nenc := List.length
!enc_list
1816 let conversion_enabled = ref true
1818 let slow_encode_from_utf8 s to_codeset
=
1820 let slen = utf8_length s in
1821 let buf = Buffer.create
10 in
1822 for i
= 0 to (slen - 1) do
1824 let uchar = utf8_get s i
in
1825 add_uchar buf uchar;
1826 let s'
= Buffer.contents
buf in
1828 let s'
= convert_string
s' to_codeset
"UTF-8" in
1831 us := !us ^
char_const
1835 let slow_encode s to_codeset
=
1837 then slow_encode_from_utf8 s to_codeset
1840 let slen = String.length
s in
1841 for i
= 0 to (slen - 1) do
1843 us := !us ^
(convert_string
(String.sub
s i
1) to_codeset
locale_string)
1845 us := !us ^
char_const
1850 let fast_encode s to_codeset
=
1853 then slow_encode s to_codeset
1856 let from_codeset = List.nth !enc_list i
in
1857 convert_string
s to_codeset
from_codeset
1858 with _ -> iter (i
+ 1) !nenc
1866 else fast_encode s "UTF-8"
1869 if s = "" || not
!conversion_enabled
1872 let s = to_utf8 s in
1878 convert_string
s locale_string "UTF-8"
1880 slow_encode_from_utf8 s locale_string
1885 set_default_charset_list default_language