patch #7201
[mldonkey.git] / src / utils / lib / charset.ml
blobb76651334027538ade5817805d1280aa93cf432f
1 (* Copyright 2005 b8_bavard, INRIA *)
(*
    This file is part of mldonkey.

    mldonkey is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    mldonkey is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with mldonkey; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*)

(*
 * This part has been inspired by the debian document "Introduction to i18n":
 * http://www.debian.org/doc/manuals/intro-i18n/
 *)
26 (**********************************************************************************)
27 (* *)
28 (* types *)
29 (* *)
30 (**********************************************************************************)
(* A character code point, represented as a plain OCaml [int]. *)
type uchar = int
(* Enumeration of the charset names known to this module. Every constructor
   has exactly one textual name, given by [charset_to_string].
   NOTE(review): constructors written on the same line look like aliases of
   one and the same encoding -- confirm before relying on that grouping. *)
type charset =
| ANSI_X3_4_1968 | ANSI_X3_4_1986 | ASCII | CP367 | IBM367 | ISO_IR_6 | ISO646_US | ISO_646_IRV_1991 | US | US_ASCII | CSASCII
| UTF_8
| ISO_10646_UCS_2 | UCS_2 | CSUNICODE
| UCS_2BE | UNICODE_1_1 | UNICODEBIG | CSUNICODE11
| UCS_2LE | UNICODELITTLE
| ISO_10646_UCS_4 | UCS_4 | CSUCS4
| UCS_4BE
| UCS_4LE
| UTF_16
| UTF_16BE
| UTF_16LE
| UTF_32
| UTF_32BE
| UTF_32LE
| UNICODE_1_1_UTF_7 | UTF_7 | CSUNICODE11UTF7
| UCS_2_INTERNAL
| UCS_2_SWAPPED
| UCS_4_INTERNAL
| UCS_4_SWAPPED
| C99
| JAVA
| CP819 | IBM819 | ISO_8859_1 | ISO_IR_100 | ISO8859_1 | ISO_8859_1_1987 | L1 | LATIN1 | CSISOLATIN1
| ISO_8859_2 | ISO_IR_101 | ISO8859_2 | ISO_8859_2_1987 | L2 | LATIN2 | CSISOLATIN2
| ISO_8859_3 | ISO_IR_109 | ISO8859_3 | ISO_8859_3_1988 | L3 | LATIN3 | CSISOLATIN3
| ISO_8859_4 | ISO_IR_110 | ISO8859_4 | ISO_8859_4_1988 | L4 | LATIN4 | CSISOLATIN4
| CYRILLIC | ISO_8859_5 | ISO_IR_144 | ISO8859_5 | ISO_8859_5_1988 | CSISOLATINCYRILLIC
| ARABIC | ASMO_708 | ECMA_114 | ISO_8859_6 | ISO_IR_127 | ISO8859_6 | ISO_8859_6_1987 | CSISOLATINARABIC
| ECMA_118 | ELOT_928 | GREEK | GREEK8 | ISO_8859_7 | ISO_IR_126 | ISO8859_7 | ISO_8859_7_1987 | CSISOLATINGREEK
| HEBREW | ISO_8859_8 | ISO_IR_138 | ISO8859_8 | ISO_8859_8_1988 | CSISOLATINHEBREW
| ISO_8859_9 | ISO_IR_148 | ISO8859_9 | ISO_8859_9_1989 | L5 | LATIN5 | CSISOLATIN5
| ISO_8859_10 | ISO_IR_157 | ISO8859_10 | ISO_8859_10_1992 | L6 | LATIN6 | CSISOLATIN6
| ISO_8859_13 | ISO_IR_179 | ISO8859_13 | L7 | LATIN7
| ISO_8859_14 | ISO_CELTIC | ISO8859_14 | ISO_IR_199 | ISO_8859_14_1998 | L8 | LATIN8
| ISO_8859_15 | ISO_IR_203 | ISO8859_15 | ISO_8859_15_1998
| ISO_8859_16 | ISO_IR_226 | ISO8859_16 | ISO_8859_16_2000
| KOI8_R | CSKOI8R
| KOI8_U
| KOI8_RU
| CP1250 | MS_EE | WINDOWS_1250
| CP1251 | MS_CYRL | WINDOWS_1251
| CP1252 | MS_ANSI | WINDOWS_1252
| CP1253 | MS_GREEK | WINDOWS_1253
| CP1254 | MS_TURK | WINDOWS_1254
| CP1255 | MS_HEBR | WINDOWS_1255
| CP1256 | MS_ARAB | WINDOWS_1256
| CP1257 | WINBALTRIM | WINDOWS_1257
| CP1258 | WINDOWS_1258
| I_850 | CP850 | IBM850 | CSPC850MULTILINGUAL
| I_862 | CP862 | IBM862 | CSPC862LATINHEBREW
| I_866 | CP866 | IBM866 | CSIBM866
| MAC | MACINTOSH | MACROMAN | CSMACINTOSH
| MACCENTRALEUROPE
| MACICELAND
| MACCROATIAN
| MACROMANIA
| MACCYRILLIC
| MACUKRAINE
| MACGREEK
| MACTURKISH
| MACHEBREW
| MACARABIC
| MACTHAI
| HP_ROMAN8 | R8 | ROMAN8 | CSHPROMAN8
| NEXTSTEP
| ARMSCII_8
| GEORGIAN_ACADEMY
| GEORGIAN_PS
| KOI8_T
| MULELAO_1
| CP1133 | IBM_CP1133
| ISO_IR_166 | TIS_620 | TIS620 | TIS620_0 | TIS620_2529_1 | TIS620_2533_0 | TIS620_2533_1
| CP874 | WINDOWS_874
| VISCII | VISCII1_1_1 | CSVISCII
| TCVN | TCVN_5712 | TCVN5712_1 | TCVN5712_1_1993
| ISO_IR_14 | ISO646_JP | JIS_C6220_1969_RO | JP | CSISO14JISC6220RO
| JISX0201_1976 | JIS_X0201 | X0201 | CSHALFWIDTHKATAKANA
| ISO_IR_87 | JIS0208 | JIS_C6226_1983 | JIS_X0208 | JIS_X0208_1983 | JIS_X0208_1990 | X0208 | CSISO87JISX0208
| ISO_IR_159 | JIS_X0212 | JIS_X0212_1990 | JIS_X0212_1990_0 | X0212 | CSISO159JISX02121990
| CN | GB_1988_80 | ISO_IR_57 | ISO646_CN | CSISO57GB1988
| CHINESE | GB_2312_80 | ISO_IR_58 | CSISO58GB231280
| CN_GB_ISOIR165 | ISO_IR_165
| ISO_IR_149 | KOREAN | KSC_5601 | KS_C_5601_1987 | KS_C_5601_1989 | CSKSC56011987
| EUC_JP | EUCJP | EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE | CSEUCPKDFMTJAPANESE
| MS_KANJI | SHIFT_JIS | SJIS | CSSHIFTJIS
| CP932
| ISO_2022_JP | CSISO2022JP
| ISO_2022_JP_1
| ISO_2022_JP_2 | CSISO2022JP2
| CN_GB | EUC_CN | EUCCN | GB2312 | CSGB2312
| CP936 | GBK
| GB18030
| ISO_2022_CN | CSISO2022CN
| ISO_2022_CN_EXT
| HZ | HZ_GB_2312
| EUC_TW | EUCTW | CSEUCTW
| BIG_5 | BIG_FIVE | BIG5 | BIGFIVE | CN_BIG5 | CSBIG5
| CP950
| BIG5_HKSCS | BIG5HKSCS
| EUC_KR | EUCKR | CSEUCKR
| CP949 | UHC
| CP1361 | JOHAB
| ISO_2022_KR | CSISO2022KR
| I_437 | CP437 | IBM437 | CSPC8CODEPAGE437
| CP737
| CP775 | IBM775 | CSPC775BALTIC
| I_852 | CP852 | IBM852 | CSPCP852
| CP853
| I_855 | CP855 | IBM855 | CSIBM855
| I_857 | CP857 | IBM857 | CSIBM857
| CP858
| I_860 | CP860 | IBM860 | CSIBM860
| I_861 | CP_IS | CP861 | IBM861 | CSIBM861
| I_863 | CP863 | IBM863 | CSIBM863
| CP864 | IBM864 | CSIBM864
| I_865 | CP865 | IBM865 | CSIBM865
| I_869 | CP_GR | CP869 | IBM869 | CSIBM869
| CP1125
153 (**********************************************************************************)
154 (* *)
155 (* charsetstubs *)
156 (* *)
157 (**********************************************************************************)
(* Error signalled by the charset layer.
   NOTE(review): presumably raised from the C stubs through the callback
   name registered just below -- confirm against the C side. *)
exception CharsetError

(* Publish [CharsetError] under the name "charset_error" so that C code can
   look the exception up by that name. *)
let () = Callback.register_exception "charset_error" CharsetError

(* Primitives implemented in C; the quoted string is the C symbol name.
   NOTE(review): the meaning and order of the three string arguments of
   [convert_string] are not visible from this file -- check the stub. *)
external get_charset : unit -> string = "ml_locale_charset"
external get_default_language : unit -> string = "ml_get_default_language"
external convert_string : string -> string -> string -> string = "ml_convert_string"
external is_utf8 : string -> bool = "ml_utf8_validate"
168 (**********************************************************************************)
169 (* *)
170 (* utf8_get *)
171 (* *)
172 (**********************************************************************************)
174 (* taken from camomile *)
175 (* $Id$ *)
176 (* Copyright 2002, 2003 Yamagata Yoriyuki. distributed with LGPL *)
(** [utf8_look s i] decodes the code point whose UTF-8 sequence starts at
    byte offset [i] of [s].

    Sequences of 1 to 6 bytes are accepted (lead bytes up to [0xfd]).
    Only the lead byte and the last byte of the sequence are read with
    bounds checking; the bytes in between are read unchecked, which is safe
    because the last byte has already been proven to be in range.
    Continuation bytes are not validated: their low 7 bits are merged in
    as they are.

    @raise Invalid_argument with ["utf8_look"] when the lead byte is
      [0xfe] or [0xff], with ["utf8_look char_of_uint"] when the decoded
      value has bits set at position 31 or above, and with the standard
      out-of-bounds message when [i] or the end of the sequence lies
      outside [s]. *)
let utf8_look s i =
  (* Unchecked read of the byte at offset [j]. *)
  let byte j = Char.code (String.unsafe_get s j) in
  (* Shift the accumulator by 6 bits and merge in one trailing byte. *)
  let push acc b = (acc lsl 6) lor (0x7f land b) in
  let n' =
    let n = Char.code s.[i] in
    if n < 0x80 then n
    else if n <= 0xdf then
      push (n - 0xc0) (Char.code s.[i + 1])
    else if n <= 0xef then
      (* The checked read of the last byte comes first: it guards the
         unchecked reads that follow. *)
      let last = Char.code s.[i + 2] in
      let acc = push (n - 0xe0) (byte (i + 1)) in
      push acc last
    else if n <= 0xf7 then
      let last = Char.code s.[i + 3] in
      let acc = push (n - 0xf0) (byte (i + 1)) in
      let acc = push acc (byte (i + 2)) in
      push acc last
    else if n <= 0xfb then
      let last = Char.code s.[i + 4] in
      let acc = push (n - 0xf8) (byte (i + 1)) in
      let acc = push acc (byte (i + 2)) in
      let acc = push acc (byte (i + 3)) in
      push acc last
    else if n <= 0xfd then
      let last = Char.code s.[i + 5] in
      let acc = push (n - 0xfc) (byte (i + 1)) in
      let acc = push acc (byte (i + 2)) in
      let acc = push acc (byte (i + 3)) in
      let acc = push acc (byte (i + 4)) in
      push acc last
    else invalid_arg "utf8_look"
  in
  if n' lsr 31 = 0 then n' else
    invalid_arg "utf8_look char_of_uint"
(** [search_head s i] scans forward from byte offset [i] and returns the
    first offset whose byte is below [0x80] or at least [0xc2], i.e. is not
    in the range [0x80 .. 0xc1]. Returns the offset reached (at least
    [String.length s], or [i] itself when [i] is already past the end)
    when no such byte exists. *)
let search_head s i =
  let len = String.length s in
  let pos = ref i in
  let skippable p =
    let b = Char.code (String.unsafe_get s p) in
    b >= 0x80 && b < 0xc2
  in
  while !pos < len && skippable !pos do
    incr pos
  done;
  !pos
(** [utf8_next s i] returns the byte offset following the character that
    starts at offset [i], derived from the lead byte alone. When [i] points
    at a byte in [0x80 .. 0xbf], the result is whatever [search_head]
    finds starting from [i + 1].

    @raise Invalid_argument with ["utf8_next"] when the byte at [i] is
      [0xfe] or [0xff], or with the standard out-of-bounds message when
      [i] is not a valid index of [s]. *)
let utf8_next s i =
  match s.[i] with
  | '\x00' .. '\x7f' -> i + 1
  | '\x80' .. '\xbf' -> search_head s (i + 1)
  | '\xc0' .. '\xdf' -> i + 2
  | '\xe0' .. '\xef' -> i + 3
  | '\xf0' .. '\xf7' -> i + 4
  | '\xf8' .. '\xfb' -> i + 5
  | '\xfc' | '\xfd' -> i + 6
  | '\xfe' | '\xff' -> invalid_arg "utf8_next"
(** [nth_aux s i n] advances [n] characters from byte offset [i] using
    [utf8_next] and returns the resulting byte offset. The countdown stops
    only when [n] reaches exactly 0, so a negative [n] keeps advancing
    until [utf8_next] raises. *)
let rec nth_aux s i n =
  match n with
  | 0 -> i
  | _ -> nth_aux s (utf8_next s i) (n - 1)
(** [utf8_nth s n] is the byte offset of the [n]-th character of [s],
    counting characters from 0 and starting at the beginning of the string. *)
let utf8_nth s n =
  let start = 0 in
  nth_aux s start n
(** [utf8_get s n] is the code point of the [n]-th character of [s]
    (0-based): the character is located with [utf8_nth], then decoded with
    [utf8_look]. *)
let utf8_get s n =
  let offset = utf8_nth s n in
  utf8_look s offset
250 (**********************************************************************************)
251 (* *)
252 (* utf8_length *)
253 (* *)
254 (**********************************************************************************)
256 (* taken from camomile *)
257 (* $Id$ *)
258 (* Copyright 2002, 2003 Yamagata Yoriyuki. distributed with LGPL *)
(** [length_aux s c i] counts the characters of [s] from byte offset [i]
    onwards and adds them to [c]. Each step looks at the lead byte only and
    jumps over the sequence width it announces; the trailing bytes are not
    inspected.

    @raise Invalid_argument with ["UTF8.length"] when a byte in lead
      position is in [0x80 .. 0xbf] or is [0xfe] / [0xff]. *)
let rec length_aux s c i =
  if i >= String.length s then c
  else begin
    let width =
      match String.unsafe_get s i with
      | '\x00' .. '\x7f' -> 1
      | '\x80' .. '\xbf' -> invalid_arg "UTF8.length"
      | '\xc0' .. '\xdf' -> 2
      | '\xe0' .. '\xef' -> 3
      | '\xf0' .. '\xf7' -> 4
      | '\xf8' .. '\xfb' -> 5
      | '\xfc' | '\xfd' -> 6
      | '\xfe' | '\xff' -> invalid_arg "UTF8.length"
    in
    length_aux s (c + 1) (i + width)
  end
(** [utf8_length s] is the number of characters in [s], obtained by running
    [length_aux] from offset 0 with a zero count. *)
let utf8_length s =
  let initial_count = 0 and first_offset = 0 in
  length_aux s initial_count first_offset
276 (**********************************************************************************)
277 (* *)
278 (* add_uchar (internal) *)
279 (* *)
280 (**********************************************************************************)
283 (* taken from camomile *)
284 (* $Id$ *)
285 (* Copyright 2002, 2003 Yamagata Yoriyuki. distributed with LGPL *)
(* Identity coercion that views a [uchar] as an [int]. *)
external uint_code : uchar -> int = "%identity"

(** [add_uchar buf u] appends the UTF-8 encoding of the code point [u] to
    [buf]. The encoding uses 1 to 6 bytes depending on the magnitude of
    [u]:
    - up to [0x7f]: 1 byte;
    - up to [0x7ff]: 2 bytes;
    - up to [0xffff]: 3 bytes;
    - up to [0x1fffff]: 4 bytes;
    - up to [0x3ffffff]: 5 bytes;
    - negative, or [0x4000000] and above: 6 bytes.

    In the 6-byte case the lead byte is built with the checked [Char.chr],
    so [Invalid_argument] is raised when [0xfc + (u lsr 30)] exceeds 255. *)
let add_uchar buf u =
  let masq = 0b111111 in
  let k = uint_code u in
  (* Emit the continuation byte carrying the 6 bits of [k] found at
     bit offset [shift]. *)
  let add_cont shift =
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr shift) land masq)))
  in
  if k < 0 || k >= 0x4000000 then begin
    Buffer.add_char buf (Char.chr (0xfc + (k lsr 30)));
    add_cont 24;
    add_cont 18;
    add_cont 12;
    add_cont 6;
    add_cont 0
  end else if k <= 0x7f then
    Buffer.add_char buf (Char.unsafe_chr k)
  else if k <= 0x7ff then begin
    Buffer.add_char buf (Char.unsafe_chr (0xc0 lor (k lsr 6)));
    add_cont 0
  end else if k <= 0xffff then begin
    Buffer.add_char buf (Char.unsafe_chr (0xe0 lor (k lsr 12)));
    add_cont 6;
    add_cont 0
  end else if k <= 0x1fffff then begin
    Buffer.add_char buf (Char.unsafe_chr (0xf0 + (k lsr 18)));
    add_cont 12;
    add_cont 6;
    add_cont 0
  end else begin
    Buffer.add_char buf (Char.unsafe_chr (0xf8 + (k lsr 24)));
    add_cont 18;
    add_cont 12;
    add_cont 6;
    add_cont 0
  end
321 (**********************************************************************************)
322 (* *)
323 (* charset_to_string *)
324 (* *)
325 (**********************************************************************************)
(** [charset_to_string charset] is the textual name of [charset]. Every
    constructor maps to its own distinct string; the match is exhaustive
    so the compiler flags any constructor added without a name. *)
let charset_to_string = function
  | ANSI_X3_4_1968 -> "ANSI_X3.4-1968"
  | ANSI_X3_4_1986 -> "ANSI_X3.4-1986"
  | ASCII -> "ASCII"
  | CP367 -> "CP367"
  | IBM367 -> "IBM367"
  | ISO_IR_6 -> "ISO-IR-6"
  | ISO646_US -> "ISO646-US"
  | ISO_646_IRV_1991 -> "ISO_646.IRV:1991"
  | US -> "US"
  | US_ASCII -> "US-ASCII"
  | CSASCII -> "CSASCII"

  | UTF_8 -> "UTF-8"

  | ISO_10646_UCS_2 -> "ISO-10646-UCS-2"
  | UCS_2 -> "UCS-2"
  | CSUNICODE -> "CSUNICODE"

  | UCS_2BE -> "UCS-2BE"
  | UNICODE_1_1 -> "UNICODE-1-1"
  | UNICODEBIG -> "UNICODEBIG"
  | CSUNICODE11 -> "CSUNICODE11"

  | UCS_2LE -> "UCS-2LE"
  | UNICODELITTLE -> "UNICODELITTLE"

  | ISO_10646_UCS_4 -> "ISO-10646-UCS-4"
  | UCS_4 -> "UCS-4"
  | CSUCS4 -> "CSUCS4"

  | UCS_4BE -> "UCS-4BE"

  | UCS_4LE -> "UCS-4LE"

  | UTF_16 -> "UTF-16"

  | UTF_16BE -> "UTF-16BE"

  | UTF_16LE -> "UTF-16LE"

  | UTF_32 -> "UTF-32"

  | UTF_32BE -> "UTF-32BE"

  | UTF_32LE -> "UTF-32LE"

  | UNICODE_1_1_UTF_7 -> "UNICODE-1-1-UTF-7"
  | UTF_7 -> "UTF-7"
  | CSUNICODE11UTF7 -> "CSUNICODE11UTF7"

  | UCS_2_INTERNAL -> "UCS-2-INTERNAL"

  | UCS_2_SWAPPED -> "UCS-2-SWAPPED"

  | UCS_4_INTERNAL -> "UCS-4-INTERNAL"

  | UCS_4_SWAPPED -> "UCS-4-SWAPPED"

  | C99 -> "C99"

  | JAVA -> "JAVA"

  | CP819 -> "CP819"
  | IBM819 -> "IBM819"
  | ISO_8859_1 -> "ISO-8859-1"
  | ISO_IR_100 -> "ISO-IR-100"
  | ISO8859_1 -> "ISO8859-1"
  | ISO_8859_1_1987 -> "ISO_8859-1:1987"
  | L1 -> "L1"
  | LATIN1 -> "LATIN1"
  | CSISOLATIN1 -> "CSISOLATIN1"

  | ISO_8859_2 -> "ISO-8859-2"
  | ISO_IR_101 -> "ISO-IR-101"
  | ISO8859_2 -> "ISO8859-2"
  | ISO_8859_2_1987 -> "ISO_8859-2:1987"
  | L2 -> "L2"
  | LATIN2 -> "LATIN2"
  | CSISOLATIN2 -> "CSISOLATIN2"

  | ISO_8859_3 -> "ISO-8859-3"
  | ISO_IR_109 -> "ISO-IR-109"
  | ISO8859_3 -> "ISO8859-3"
  | ISO_8859_3_1988 -> "ISO_8859-3:1988"
  | L3 -> "L3"
  | LATIN3 -> "LATIN3"
  | CSISOLATIN3 -> "CSISOLATIN3"

  | ISO_8859_4 -> "ISO-8859-4"
  | ISO_IR_110 -> "ISO-IR-110"
  | ISO8859_4 -> "ISO8859-4"
  | ISO_8859_4_1988 -> "ISO_8859-4:1988"
  | L4 -> "L4"
  | LATIN4 -> "LATIN4"
  | CSISOLATIN4 -> "CSISOLATIN4"

  | CYRILLIC -> "CYRILLIC"
  | ISO_8859_5 -> "ISO-8859-5"
  | ISO_IR_144 -> "ISO-IR-144"
  | ISO8859_5 -> "ISO8859-5"
  | ISO_8859_5_1988 -> "ISO_8859-5:1988"
  | CSISOLATINCYRILLIC -> "CSISOLATINCYRILLIC"

  | ARABIC -> "ARABIC"
  | ASMO_708 -> "ASMO-708"
  | ECMA_114 -> "ECMA-114"
  | ISO_8859_6 -> "ISO-8859-6"
  | ISO_IR_127 -> "ISO-IR-127"
  | ISO8859_6 -> "ISO8859-6"
  | ISO_8859_6_1987 -> "ISO_8859-6:1987"
  | CSISOLATINARABIC -> "CSISOLATINARABIC"

  | ECMA_118 -> "ECMA-118"
  | ELOT_928 -> "ELOT_928"
  | GREEK -> "GREEK"
  | GREEK8 -> "GREEK8"
  | ISO_8859_7 -> "ISO-8859-7"
  | ISO_IR_126 -> "ISO-IR-126"
  | ISO8859_7 -> "ISO8859-7"
  | ISO_8859_7_1987 -> "ISO_8859-7:1987"
  | CSISOLATINGREEK -> "CSISOLATINGREEK"

  | HEBREW -> "HEBREW"
  | ISO_8859_8 -> "ISO-8859-8"
  | ISO_IR_138 -> "ISO-IR-138"
  | ISO8859_8 -> "ISO8859-8"
  | ISO_8859_8_1988 -> "ISO_8859-8:1988"
  | CSISOLATINHEBREW -> "CSISOLATINHEBREW"

  | ISO_8859_9 -> "ISO-8859-9"
  | ISO_IR_148 -> "ISO-IR-148"
  | ISO8859_9 -> "ISO8859-9"
  | ISO_8859_9_1989 -> "ISO_8859-9:1989"
  | L5 -> "L5"
  | LATIN5 -> "LATIN5"
  | CSISOLATIN5 -> "CSISOLATIN5"

  | ISO_8859_10 -> "ISO-8859-10"
  | ISO_IR_157 -> "ISO-IR-157"
  | ISO8859_10 -> "ISO8859-10"
  | ISO_8859_10_1992 -> "ISO_8859-10:1992"
  | L6 -> "L6"
  | LATIN6 -> "LATIN6"
  | CSISOLATIN6 -> "CSISOLATIN6"

  | ISO_8859_13 -> "ISO-8859-13"
  | ISO_IR_179 -> "ISO-IR-179"
  | ISO8859_13 -> "ISO8859-13"
  | L7 -> "L7"
  | LATIN7 -> "LATIN7"

  | ISO_8859_14 -> "ISO-8859-14"
  | ISO_CELTIC -> "ISO-CELTIC"
  | ISO_IR_199 -> "ISO-IR-199"
  | ISO8859_14 -> "ISO8859-14"
  | ISO_8859_14_1998 -> "ISO_8859-14:1998"
  | L8 -> "L8"
  | LATIN8 -> "LATIN8"

  | ISO_8859_15 -> "ISO-8859-15"
  | ISO_IR_203 -> "ISO-IR-203"
  | ISO8859_15 -> "ISO8859-15"
  | ISO_8859_15_1998 -> "ISO_8859-15:1998"

  | ISO_8859_16 -> "ISO-8859-16"
  | ISO_IR_226 -> "ISO-IR-226"
  | ISO8859_16 -> "ISO8859-16"
  | ISO_8859_16_2000 -> "ISO_8859-16:2000"

  | KOI8_R -> "KOI8-R"
  | CSKOI8R -> "CSKOI8R"

  | KOI8_U -> "KOI8-U"

  | KOI8_RU -> "KOI8-RU"

  | CP1250 -> "CP1250"
  | MS_EE -> "MS-EE"
  | WINDOWS_1250 -> "WINDOWS-1250"

  | CP1251 -> "CP1251"
  | MS_CYRL -> "MS-CYRL"
  | WINDOWS_1251 -> "WINDOWS-1251"

  | CP1252 -> "CP1252"
  | MS_ANSI -> "MS-ANSI"
  | WINDOWS_1252 -> "WINDOWS-1252"

  | CP1253 -> "CP1253"
  | MS_GREEK -> "MS-GREEK"
  | WINDOWS_1253 -> "WINDOWS-1253"

  | CP1254 -> "CP1254"
  | MS_TURK -> "MS-TURK"
  | WINDOWS_1254 -> "WINDOWS-1254"

  | CP1255 -> "CP1255"
  | MS_HEBR -> "MS-HEBR"
  | WINDOWS_1255 -> "WINDOWS-1255"

  | CP1256 -> "CP1256"
  | MS_ARAB -> "MS-ARAB"
  | WINDOWS_1256 -> "WINDOWS-1256"

  | CP1257 -> "CP1257"
  | WINBALTRIM -> "WINBALTRIM"
  | WINDOWS_1257 -> "WINDOWS-1257"

  | CP1258 -> "CP1258"
  | WINDOWS_1258 -> "WINDOWS-1258"

  | I_850 -> "850"
  | CP850 -> "CP850"
  | IBM850 -> "IBM850"
  | CSPC850MULTILINGUAL -> "CSPC850MULTILINGUAL"

  | I_862 -> "862"
  | CP862 -> "CP862"
  | IBM862 -> "IBM862"
  | CSPC862LATINHEBREW -> "CSPC862LATINHEBREW"

  | I_866 -> "866"
  | CP866 -> "CP866"
  | IBM866 -> "IBM866"
  | CSIBM866 -> "CSIBM866"

  | MAC -> "MAC"
  | MACINTOSH -> "MACINTOSH"
  | MACROMAN -> "MACROMAN"
  | CSMACINTOSH -> "CSMACINTOSH"

  | MACCENTRALEUROPE -> "MACCENTRALEUROPE"

  | MACICELAND -> "MACICELAND"

  | MACCROATIAN -> "MACCROATIAN"

  | MACROMANIA -> "MACROMANIA"

  | MACCYRILLIC -> "MACCYRILLIC"

  | MACUKRAINE -> "MACUKRAINE"

  | MACGREEK -> "MACGREEK"

  | MACTURKISH -> "MACTURKISH"

  | MACHEBREW -> "MACHEBREW"

  | MACARABIC -> "MACARABIC"

  | MACTHAI -> "MACTHAI"

  | HP_ROMAN8 -> "HP-ROMAN8"
  | R8 -> "R8"
  | ROMAN8 -> "ROMAN8"
  | CSHPROMAN8 -> "CSHPROMAN8"

  | NEXTSTEP -> "NEXTSTEP"

  | ARMSCII_8 -> "ARMSCII-8"

  | GEORGIAN_ACADEMY -> "GEORGIAN-ACADEMY"

  | GEORGIAN_PS -> "GEORGIAN-PS"

  | KOI8_T -> "KOI8-T"

  | MULELAO_1 -> "MULELAO-1"

  | CP1133 -> "CP1133"
  | IBM_CP1133 -> "IBM-CP1133"

  | ISO_IR_166 -> "ISO-IR-166"
  | TIS_620 -> "TIS-620"
  | TIS620 -> "TIS620"
  | TIS620_0 -> "TIS620-0"
  | TIS620_2529_1 -> "TIS620.2529-1"
  | TIS620_2533_0 -> "TIS620.2533-0"
  | TIS620_2533_1 -> "TIS620.2533-1"

  | CP874 -> "CP874"
  | WINDOWS_874 -> "WINDOWS-874"

  | VISCII -> "VISCII"
  | VISCII1_1_1 -> "VISCII1.1-1"
  | CSVISCII -> "CSVISCII"

  | TCVN -> "TCVN"
  | TCVN_5712 -> "TCVN-5712"
  | TCVN5712_1 -> "TCVN5712-1"
  | TCVN5712_1_1993 -> "TCVN5712-1:1993"

  | ISO_IR_14 -> "ISO-IR-14"
  | ISO646_JP -> "ISO646-JP"
  | JIS_C6220_1969_RO -> "JIS_C6220-1969-RO"
  | JP -> "JP"
  | CSISO14JISC6220RO -> "CSISO14JISC6220RO"

  | JISX0201_1976 -> "JISX0201-1976"
  | JIS_X0201 -> "JIS_X0201"
  | X0201 -> "X0201"
  | CSHALFWIDTHKATAKANA -> "CSHALFWIDTHKATAKANA"

  | ISO_IR_87 -> "ISO-IR-87"
  | JIS0208 -> "JIS0208"
  | JIS_C6226_1983 -> "JIS_C6226-1983"
  | JIS_X0208 -> "JIS_X0208"
  | JIS_X0208_1983 -> "JIS_X0208-1983"
  | JIS_X0208_1990 -> "JIS_X0208-1990"
  | X0208 -> "X0208"
  | CSISO87JISX0208 -> "CSISO87JISX0208"

  | ISO_IR_159 -> "ISO-IR-159"
  | JIS_X0212 -> "JIS_X0212"
  | JIS_X0212_1990 -> "JIS_X0212-1990"
  | JIS_X0212_1990_0 -> "JIS_X0212.1990-0"
  | X0212 -> "X0212"
  | CSISO159JISX02121990 -> "CSISO159JISX02121990"

  | CN -> "CN"
  | GB_1988_80 -> "GB_1988-80"
  | ISO_IR_57 -> "ISO-IR-57"
  | ISO646_CN -> "ISO646-CN"
  | CSISO57GB1988 -> "CSISO57GB1988"

  | CHINESE -> "CHINESE"
  | GB_2312_80 -> "GB_2312-80"
  | ISO_IR_58 -> "ISO-IR-58"
  | CSISO58GB231280 -> "CSISO58GB231280"

  | CN_GB_ISOIR165 -> "CN-GB-ISOIR165"
  | ISO_IR_165 -> "ISO-IR-165"

  | ISO_IR_149 -> "ISO-IR-149"
  | KOREAN -> "KOREAN"
  | KSC_5601 -> "KSC_5601"
  | KS_C_5601_1987 -> "KS_C_5601-1987"
  | KS_C_5601_1989 -> "KS_C_5601-1989"
  | CSKSC56011987 -> "CSKSC56011987"

  | EUC_JP -> "EUC-JP"
  | EUCJP -> "EUCJP"
  | EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE -> "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE"
  | CSEUCPKDFMTJAPANESE -> "CSEUCPKDFMTJAPANESE"

  | MS_KANJI -> "MS_KANJI"
  | SHIFT_JIS -> "SHIFT-JIS"
  | SJIS -> "SJIS"
  | CSSHIFTJIS -> "CSSHIFTJIS"

  | CP932 -> "CP932"

  | ISO_2022_JP -> "ISO-2022-JP"
  | CSISO2022JP -> "CSISO2022JP"

  | ISO_2022_JP_1 -> "ISO-2022-JP-1"

  | ISO_2022_JP_2 -> "ISO-2022-JP-2"
  | CSISO2022JP2 -> "CSISO2022JP2"

  | CN_GB -> "CN-GB"
  | EUC_CN -> "EUC-CN"
  | EUCCN -> "EUCCN"
  | GB2312 -> "GB2312"
  | CSGB2312 -> "CSGB2312"

  | CP936 -> "CP936"
  | GBK -> "GBK"

  | GB18030 -> "GB18030"

  | ISO_2022_CN -> "ISO-2022-CN"
  | CSISO2022CN -> "CSISO2022CN"

  | ISO_2022_CN_EXT -> "ISO-2022-CN-EXT"

  | HZ -> "HZ"
  | HZ_GB_2312 -> "HZ-GB-2312"

  | EUC_TW -> "EUC-TW"
  | EUCTW -> "EUCTW"
  | CSEUCTW -> "CSEUCTW"

  | BIG_5 -> "BIG-5"
  | BIG_FIVE -> "BIG-FIVE"
  | BIG5 -> "BIG5"
  | BIGFIVE -> "BIGFIVE"
  | CN_BIG5 -> "CN-BIG5"
  | CSBIG5 -> "CSBIG5"

  | CP950 -> "CP950"

  | BIG5_HKSCS -> "BIG5-HKSCS"
  | BIG5HKSCS -> "BIG5HKSCS"

  | EUC_KR -> "EUC-KR"
  | EUCKR -> "EUCKR"
  | CSEUCKR -> "CSEUCKR"

  | CP949 -> "CP949"
  | UHC -> "UHC"

  | CP1361 -> "CP1361"
  | JOHAB -> "JOHAB"

  | ISO_2022_KR -> "ISO-2022-KR"
  | CSISO2022KR -> "CSISO2022KR"

  | I_437 -> "437"
  | CP437 -> "CP437"
  | IBM437 -> "IBM437"
  | CSPC8CODEPAGE437 -> "CSPC8CODEPAGE437"

  | CP737 -> "CP737"

  | CP775 -> "CP775"
  | IBM775 -> "IBM775"
  | CSPC775BALTIC -> "CSPC775BALTIC"

  | I_852 -> "852"
  | CP852 -> "CP852"
  | IBM852 -> "IBM852"
  | CSPCP852 -> "CSPCP852"

  | CP853 -> "CP853"

  | I_855 -> "855"
  | CP855 -> "CP855"
  | IBM855 -> "IBM855"
  | CSIBM855 -> "CSIBM855"

  | I_857 -> "857"
  | CP857 -> "CP857"
  | IBM857 -> "IBM857"
  | CSIBM857 -> "CSIBM857"

  | CP858 -> "CP858"

  | I_860 -> "860"
  | CP860 -> "CP860"
  | IBM860 -> "IBM860"
  | CSIBM860 -> "CSIBM860"

  | I_861 -> "861"
  | CP_IS -> "CP-IS"
  | CP861 -> "CP861"
  | IBM861 -> "IBM861"
  | CSIBM861 -> "CSIBM861"

  | I_863 -> "863"
  | CP863 -> "CP863"
  | IBM863 -> "IBM863"
  | CSIBM863 -> "CSIBM863"

  | CP864 -> "CP864"
  | IBM864 -> "IBM864"
  | CSIBM864 -> "CSIBM864"

  | I_865 -> "865"
  | CP865 -> "CP865"
  | IBM865 -> "IBM865"
  | CSIBM865 -> "CSIBM865"

  | I_869 -> "869"
  | CP_GR -> "CP-GR"
  | CP869 -> "CP869"
  | IBM869 -> "IBM869"
  | CSIBM869 -> "CSIBM869"

  | CP1125 -> "CP1125"
801 (**********************************************************************************)
802 (* *)
803 (* charset_from_string *)
804 (* *)
805 (**********************************************************************************)
807 let charset_from_string s =
808 match s with
809 | "ANSI_X3.4-1968" -> ANSI_X3_4_1968
810 | "ANSI_X3.4-1986" -> ANSI_X3_4_1986
811 | "ASCII" -> ASCII
812 | "CP367" -> CP367
813 | "IBM367" -> IBM367
814 | "ISO-IR-6" -> ISO_IR_6
815 | "ISO646-US" -> ISO646_US
816 | "ISO_646.IRV:1991" -> ISO_646_IRV_1991
817 | "US" -> US
818 | "US-ASCII" -> US_ASCII
819 | "CSASCII" -> CSASCII
821 | "UTF-8" -> UTF_8
823 | "ISO-10646-UCS-2" -> ISO_10646_UCS_2
824 | "UCS-2" -> UCS_2
825 | "CSUNICODE" -> CSUNICODE
827 | "UCS-2BE" -> UCS_2BE
828 | "UNICODE-1-1" -> UNICODE_1_1
829 | "UNICODEBIG" -> UNICODEBIG
830 | "CSUNICODE11" -> CSUNICODE11
832 | "UCS-2LE" -> UCS_2LE
833 | "UNICODELITTLE" -> UNICODELITTLE
835 | "ISO-10646-UCS-4" -> ISO_10646_UCS_4
836 | "UCS-4" -> UCS_4
837 | "CSUCS4" -> CSUCS4
839 | "UCS-4BE" -> UCS_4BE
841 | "UCS-4LE" -> UCS_4LE
843 | "UTF-16" -> UTF_16
845 | "UTF-16BE" -> UTF_16BE
847 | "UTF-16LE" -> UTF_16LE
849 | "UTF-32" -> UTF_32
851 | "UTF-32BE" -> UTF_32BE
853 | "UTF-32LE" -> UTF_32LE
855 | "UNICODE-1-1-UTF-7" -> UNICODE_1_1_UTF_7
856 | "UTF-7" -> UTF_7
857 | "CSUNICODE11UTF7" -> CSUNICODE11UTF7
859 | "UCS-2-INTERNAL" -> UCS_2_INTERNAL
861 | "UCS-2-SWAPPED" -> UCS_2_SWAPPED
863 | "UCS-4-INTERNAL" -> UCS_4_INTERNAL
865 | "UCS-4-SWAPPED" -> UCS_4_SWAPPED
867 | "C99" -> C99
869 | "JAVA" -> JAVA
871 | "CP819" -> CP819
872 | "IBM819" -> IBM819
873 | "ISO-8859-1" -> ISO_8859_1
874 | "ISO-IR-100" -> ISO_IR_100
875 | "ISO8859-1" -> ISO8859_1
876 | "ISO_8859-1" -> ISO_8859_1
877 | "ISO_8859-1:1987" -> ISO_8859_1_1987
878 | "L1" -> L1
879 | "LATIN1" -> LATIN1
880 | "CSISOLATIN1" -> CSISOLATIN1
882 | "ISO-8859-2" -> ISO_8859_2
883 | "ISO-IR-101" -> ISO_IR_101
884 | "ISO8859-2" -> ISO8859_2
885 | "ISO_8859-2" -> ISO_8859_2
886 | "ISO_8859-2:1987" -> ISO_8859_2_1987
887 | "L2" -> L2
888 | "LATIN2" -> LATIN2
889 | "CSISOLATIN2" -> CSISOLATIN2
891 | "ISO-8859-3" -> ISO_8859_3
892 | "ISO-IR-109" -> ISO_IR_109
893 | "ISO8859-3" -> ISO8859_3
894 | "ISO_8859-3" -> ISO_8859_3
895 | "ISO_8859-3:1988" -> ISO_8859_3_1988
896 | "L3" -> L3
897 | "LATIN3" -> LATIN3
898 | "CSISOLATIN3" -> CSISOLATIN3
900 | "ISO-8859-4" -> ISO_8859_4
901 | "ISO-IR-110" -> ISO_IR_110
902 | "ISO8859-4" -> ISO8859_4
903 | "ISO_8859-4" -> ISO_8859_4
904 | "ISO_8859-4:1988" -> ISO_8859_4_1988
905 | "L4" -> L4
906 | "LATIN4" -> LATIN4
907 | "CSISOLATIN4" -> CSISOLATIN4
909 | "CYRILLIC" -> CYRILLIC
910 | "ISO-8859-5" -> ISO_8859_5
911 | "ISO-IR-144" -> ISO_IR_144
912 | "ISO8859-5" -> ISO8859_5
913 | "ISO_8859-5" -> ISO_8859_5
914 | "ISO_8859-5:1988" -> ISO_8859_5_1988
915 | "CSISOLATINCYRILLIC" -> CSISOLATINCYRILLIC
917 | "ARABIC" -> ARABIC
918 | "ASMO-708" -> ASMO_708
919 | "ECMA-114" -> ECMA_114
920 | "ISO-8859-6" -> ISO_8859_6
921 | "ISO-IR-127" -> ISO_IR_127
922 | "ISO8859-6" -> ISO8859_6
923 | "ISO_8859-6" -> ISO_8859_6
924 | "ISO_8859-6:1987" -> ISO_8859_6_1987
925 | "CSISOLATINARABIC" -> CSISOLATINARABIC
927 | "ECMA-118" -> ECMA_118
928 | "ELOT_928" -> ELOT_928
929 | "GREEK" -> GREEK
930 | "GREEK8" -> GREEK8
931 | "ISO-8859-7" -> ISO_8859_7
932 | "ISO-IR-126" -> ISO_IR_126
933 | "ISO8859-7" -> ISO8859_7
934 | "ISO_8859-7" -> ISO_8859_7
935 | "ISO_8859-7:1987" -> ISO_8859_7_1987
936 | "CSISOLATINGREEK" -> CSISOLATINGREEK
938 | "HEBREW" -> HEBREW
939 | "ISO-8859-8" -> ISO_8859_8
940 | "ISO-IR-138" -> ISO_IR_138
941 | "ISO8859-8" -> ISO8859_8
942 | "ISO_8859-8" -> ISO_8859_8
943 | "ISO_8859-8:1988" -> ISO_8859_8_1988
944 | "CSISOLATINHEBREW" -> CSISOLATINHEBREW
946 | "ISO-8859-9" -> ISO_8859_9
947 | "ISO-IR-148" -> ISO_IR_148
948 | "ISO8859-9" -> ISO8859_9
949 | "ISO_8859-9" -> ISO_8859_9
950 | "ISO_8859-9:1989" -> ISO_8859_9_1989
951 | "L5" -> L5
952 | "LATIN5" -> LATIN5
953 | "CSISOLATIN5" -> CSISOLATIN5
955 | "ISO-8859-10" -> ISO_8859_10
956 | "ISO-IR-157" -> ISO_IR_157
957 | "ISO8859-10" -> ISO8859_10
958 | "ISO_8859-10" -> ISO_8859_10
959 | "ISO_8859-10:1992" -> ISO_8859_10_1992
960 | "L6" -> L6
961 | "LATIN6" -> LATIN6
962 | "CSISOLATIN6" -> CSISOLATIN6
964 | "ISO-8859-13" -> ISO_8859_13
965 | "ISO-IR-179" -> ISO_IR_179
966 | "ISO8859-13" -> ISO8859_13
967 | "ISO_8859-13" -> ISO_8859_13
968 | "L7" -> L7
969 | "LATIN7" -> LATIN7
971 | "ISO-8859-14" -> ISO_8859_14
972 | "ISO-CELTIC" -> ISO_CELTIC
973 | "ISO-IR-199" -> ISO_IR_199
974 | "ISO8859-14" -> ISO8859_14
975 | "ISO_8859-14" -> ISO_8859_14
976 | "ISO_8859-14:1998" -> ISO_8859_14_1998
977 | "L8" -> L8
978 | "LATIN8" -> LATIN8
980 | "ISO-8859-15" -> ISO_8859_15
981 | "ISO-IR-203" -> ISO_IR_203
982 | "ISO8859-15" -> ISO8859_15
983 | "ISO_8859-15" -> ISO_8859_15
984 | "ISO_8859-15:1998" -> ISO_8859_15_1998
986 | "ISO-8859-16" -> ISO_8859_16
987 | "ISO-IR-226" -> ISO_IR_226
988 | "ISO8859-16" -> ISO8859_16
989 | "ISO_8859-16" -> ISO_8859_16
990 | "ISO_8859-16:2000" -> ISO_8859_16_2000
992 | "KOI8-R" -> KOI8_R
993 | "CSKOI8R" -> CSKOI8R
995 | "KOI8-U" -> KOI8_U
997 | "KOI8-RU" -> KOI8_RU
999 | "CP1250" -> CP1250
1000 | "MS-EE" -> MS_EE
1001 | "WINDOWS-1250" -> WINDOWS_1250
1003 | "CP1251" -> CP1251
1004 | "MS-CYRL" -> MS_CYRL
1005 | "WINDOWS-1251" -> WINDOWS_1251
1007 | "CP1252" -> CP1252
1008 | "MS-ANSI" -> MS_ANSI
1009 | "WINDOWS-1252" -> WINDOWS_1252
1011 | "CP1253" -> CP1253
1012 | "MS-GREEK" -> MS_GREEK
1013 | "WINDOWS-1253" -> WINDOWS_1253
1015 | "CP1254" -> CP1254
1016 | "MS-TURK" -> MS_TURK
1017 | "WINDOWS-1254" -> WINDOWS_1254
1019 | "CP1255" -> CP1255
1020 | "MS-HEBR" -> MS_HEBR
1021 | "WINDOWS-1255" -> WINDOWS_1255
1023 | "CP1256" -> CP1256
1024 | "MS-ARAB" -> MS_ARAB
1025 | "WINDOWS-1256" -> WINDOWS_1256
1027 | "CP1257" -> CP1257
1028 | "WINBALTRIM" -> WINBALTRIM
1029 | "WINDOWS-1257" -> WINDOWS_1257
1031 | "CP1258" -> CP1258
1032 | "WINDOWS-1258" -> WINDOWS_1258
1034 | "850" -> I_850
1035 | "CP850" -> CP850
1036 | "IBM850" -> IBM850
1037 | "CSPC850MULTILINGUAL" -> CSPC850MULTILINGUAL
1039 | "862" -> I_862
1040 | "CP862" -> CP862
1041 | "IBM862" -> IBM862
1042 | "CSPC862LATINHEBREW" -> CSPC862LATINHEBREW
1044 | "866" -> I_866
1045 | "CP866" -> CP866
1046 | "IBM866" -> IBM866
1047 | "CSIBM866" -> CSIBM866
1049 | "MAC" -> MAC
1050 | "MACINTOSH" -> MACINTOSH
1051 | "MACROMAN" -> MACROMAN
1052 | "CSMACINTOSH" -> CSMACINTOSH
1054 | "MACCENTRALEUROPE" -> MACCENTRALEUROPE
1056 | "MACICELAND" -> MACICELAND
1058 | "MACCROATIAN" -> MACCROATIAN
1060 | "MACROMANIA" -> MACROMANIA
1062 | "MACCYRILLIC" -> MACCYRILLIC
1064 | "MACUKRAINE" -> MACUKRAINE
1066 | "MACGREEK" -> MACGREEK
1068 | "MACTURKISH" -> MACTURKISH
1070 | "MACHEBREW" -> MACHEBREW
1072 | "MACARABIC" -> MACARABIC
1074 | "MACTHAI" -> MACTHAI
1076 | "HP-ROMAN8" -> HP_ROMAN8
1077 | "R8" -> R8
1078 | "ROMAN8" -> ROMAN8
1079 | "CSHPROMAN8" -> CSHPROMAN8
1081 | "NEXTSTEP" -> NEXTSTEP
1083 | "ARMSCII-8" -> ARMSCII_8
1085 | "GEORGIAN-ACADEMY" -> GEORGIAN_ACADEMY
1087 | "GEORGIAN-PS" -> GEORGIAN_PS
1089 | "KOI8-T" -> KOI8_T
1091 | "MULELAO-1" -> MULELAO_1
1093 | "CP1133" -> CP1133
1094 | "IBM-CP1133" -> IBM_CP1133
1096 | "ISO-IR-166" -> ISO_IR_166
1097 | "TIS-620" -> TIS_620
1098 | "TIS620" -> TIS620
1099 | "TIS620-0" -> TIS620_0
1100 | "TIS620.2529-1" -> TIS620_2529_1
1101 | "TIS620.2533-0" -> TIS620_2533_0
1102 | "TIS620.2533-1" -> TIS620_2533_1
1104 | "CP874" -> CP874
1105 | "WINDOWS-874" -> WINDOWS_874
1107 | "VISCII" -> VISCII
1108 | "VISCII1.1-1" -> VISCII1_1_1
1109 | "CSVISCII" -> CSVISCII
1111 | "TCVN" -> TCVN
1112 | "TCVN-5712" -> TCVN_5712
1113 | "TCVN5712-1" -> TCVN5712_1
1114 | "TCVN5712-1:1993" -> TCVN5712_1_1993
1116 | "ISO-IR-14" -> ISO_IR_14
1117 | "ISO646-JP" -> ISO646_JP
1118 | "JIS_C6220-1969-RO" -> JIS_C6220_1969_RO
1119 | "JP" -> JP
1120 | "CSISO14JISC6220RO" -> CSISO14JISC6220RO
1122 | "JISX0201-1976" -> JISX0201_1976
1123 | "JIS_X0201" -> JIS_X0201
1124 | "X0201" -> X0201
1125 | "CSHALFWIDTHKATAKANA" -> CSHALFWIDTHKATAKANA
1127 | "ISO-IR-87" -> ISO_IR_87
1128 | "JIS0208" -> JIS0208
1129 | "JIS_C6226-1983" -> JIS_C6226_1983
1130 | "JIS_X0208" -> JIS_X0208
1131 | "JIS_X0208-1983" -> JIS_X0208_1983
1132 | "JIS_X0208-1990" -> JIS_X0208_1990
1133 | "X0208" -> X0208
1134 | "CSISO87JISX0208" -> CSISO87JISX0208
1136 | "ISO-IR-159" -> ISO_IR_159
1137 | "JIS_X0212" -> JIS_X0212
1138 | "JIS_X0212-1990" -> JIS_X0212_1990
1139 | "JIS_X0212.1990-0" -> JIS_X0212_1990_0
1140 | "X0212" -> X0212
1141 | "CSISO159JISX02121990" -> CSISO159JISX02121990
1143 | "CN" -> CN
1144 | "GB_1988-80" -> GB_1988_80
1145 | "ISO-IR-57" -> ISO_IR_57
1146 | "ISO646-CN" -> ISO646_CN
1147 | "CSISO57GB1988" -> CSISO57GB1988
1149 | "CHINESE" -> CHINESE
1150 | "GB_2312-80" -> GB_2312_80
1151 | "ISO-IR-58" -> ISO_IR_58
1152 | "CSISO58GB231280" -> CSISO58GB231280
1154 | "CN-GB-ISOIR165" -> CN_GB_ISOIR165
1155 | "ISO-IR-165" -> ISO_IR_165
1157 | "ISO-IR-149" -> ISO_IR_149
1158 | "KOREAN" -> KOREAN
1159 | "KSC_5601" -> KSC_5601
1160 | "KS_C_5601-1987" -> KS_C_5601_1987
1161 | "KS_C_5601-1989" -> KS_C_5601_1989
1162 | "CSKSC56011987" -> CSKSC56011987
1164 | "EUC-JP" -> EUC_JP
1165 | "EUCJP" -> EUCJP
1166 | "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE" -> EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE
1167 | "CSEUCPKDFMTJAPANESE" -> CSEUCPKDFMTJAPANESE
1169 | "MS_KANJI" -> MS_KANJI
1170 | "SHIFT-JIS" -> SHIFT_JIS
1171 | "SHIFT_JIS" -> SHIFT_JIS
1172 | "SJIS" -> SJIS
1173 | "CSSHIFTJIS" -> CSSHIFTJIS
1175 | "CP932" -> CP932
1177 | "ISO-2022-JP" -> ISO_2022_JP
1178 | "CSISO2022JP" -> CSISO2022JP
1180 | "ISO-2022-JP-1" -> ISO_2022_JP_1
1182 | "ISO-2022-JP-2" -> ISO_2022_JP_2
1183 | "CSISO2022JP2" -> CSISO2022JP2
1185 | "CN-GB" -> CN_GB
1186 | "EUC-CN" -> EUC_CN
1187 | "EUCCN" -> EUCCN
1188 | "GB2312" -> GB2312
1189 | "CSGB2312" -> CSGB2312
1191 | "CP936" -> CP936
1192 | "GBK" -> GBK
1194 | "GB18030" -> GB18030
1196 | "ISO-2022-CN" -> ISO_2022_CN
1197 | "CSISO2022CN" -> CSISO2022CN
1199 | "ISO-2022-CN-EXT" -> ISO_2022_CN_EXT
1201 | "HZ" -> HZ
1202 | "HZ-GB-2312" -> HZ_GB_2312
1204 | "EUC-TW" -> EUC_TW
1205 | "EUCTW" -> EUCTW
1206 | "CSEUCTW" -> CSEUCTW
1208 | "BIG-5" -> BIG_5
1209 | "BIG-FIVE" -> BIG_FIVE
1210 | "BIG5" -> BIG5
1211 | "BIGFIVE" -> BIGFIVE
1212 | "CN-BIG5" -> CN_BIG5
1213 | "CSBIG5" -> CSBIG5
1215 | "CP950" -> CP950
1217 | "BIG5-HKSCS" -> BIG5_HKSCS
1218 | "BIG5HKSCS" -> BIG5HKSCS
1220 | "EUC-KR" -> EUC_KR
1221 | "EUCKR" -> EUCKR
1222 | "CSEUCKR" -> CSEUCKR
1224 | "CP949" -> CP949
1225 | "UHC" -> UHC
1227 | "CP1361" -> CP1361
1228 | "JOHAB" -> JOHAB
1230 | "ISO-2022-KR" -> ISO_2022_KR
1231 | "CSISO2022KR" -> CSISO2022KR
1233 | "437" -> I_437
1234 | "CP437" -> CP437
1235 | "IBM437" -> IBM437
1236 | "CSPC8CODEPAGE437" -> CSPC8CODEPAGE437
1238 | "CP737" -> CP737
1240 | "CP775" -> CP775
1241 | "IBM775" -> IBM775
1242 | "CSPC775BALTIC" -> CSPC775BALTIC
1244 | "852" -> I_852
1245 | "CP852" -> CP852
1246 | "IBM852" -> IBM852
1247 | "CSPCP852" -> CSPCP852
1249 | "CP853" -> CP853
1251 | "855" -> I_855
1252 | "CP855" -> CP855
1253 | "IBM855" -> IBM855
1254 | "CSIBM855" -> CSIBM855
1256 | "857" -> I_857
1257 | "CP857" -> CP857
1258 | "IBM857" -> IBM857
1259 | "CSIBM857" -> CSIBM857
1261 | "CP858" -> CP858
1263 | "860" -> I_860
1264 | "CP860" -> CP860
1265 | "IBM860" -> IBM860
1266 | "CSIBM860" -> CSIBM860
1268 | "861" -> I_861
1269 | "CP-IS" -> CP_IS
1270 | "CP861" -> CP861
1271 | "IBM861" -> IBM861
1272 | "CSIBM861" -> CSIBM861
1274 | "863" -> I_863
1275 | "CP863" -> CP863
1276 | "IBM863" -> IBM863
1277 | "CSIBM863" -> CSIBM863
1279 | "CP864" -> CP864
1280 | "IBM864" -> IBM864
1281 | "CSIBM864" -> CSIBM864
1283 | "865" -> I_865
1284 | "CP865" -> CP865
1285 | "IBM865" -> IBM865
1286 | "CSIBM865" -> CSIBM865
1288 | "869" -> I_869
1289 | "CP-GR" -> CP_GR
1290 | "CP869" -> CP869
1291 | "IBM869" -> IBM869
1292 | "CSIBM869" -> CSIBM869
1294 | "CP1125" -> CP1125
1296 | _ -> ASCII
1299 (**********************************************************************************)
1300 (* *)
1301 (* normalize_language *)
1302 (* *)
1303 (**********************************************************************************)
(* [normalize_language s] reduces a locale/language name to the upper-case
   key used to select charset regions.

   - Names shorter than two characters yield "EN".
   - Names starting with "ZH" and at least five characters long keep their
     first five characters, so that ZH_TW (BIG5 / traditional Chinese) and
     ZH_CN (GBK / simplified Chinese) stay distinguishable.
   - Anything else is cut down to its first two characters. *)
let normalize_language s =
  let upper = String.uppercase s in
  let len = String.length upper in
  if len < 2 then "EN"
  else
    let prefix = String.sub upper 0 2 in
    if prefix = "ZH" && len >= 5 then String.sub upper 0 5
    else prefix
1317 (**********************************************************************************)
1318 (* *)
1319 (* variables *)
1320 (* *)
1321 (**********************************************************************************)
(* All charsets of this module, arranged in alias groups: each inner list
   collects the constructors that name one and the same encoding.

   Fix: the ISO-8859-14 group used to list [ISO_8859_14] twice and omitted
   [ISO8859_14]; it now matches the group used in [celtic]. *)
let charset_aliases =
  [
    [ANSI_X3_4_1968; ANSI_X3_4_1986; ASCII; CP367; IBM367; ISO_IR_6; ISO646_US; ISO_646_IRV_1991; US; US_ASCII; CSASCII];
    [UTF_8];
    [ISO_10646_UCS_2; UCS_2; CSUNICODE];
    [UCS_2BE; UNICODE_1_1; UNICODEBIG; CSUNICODE11];
    [UCS_2LE; UNICODELITTLE];
    [ISO_10646_UCS_4; UCS_4; CSUCS4];
    [UCS_4BE];
    [UCS_4LE];
    [UTF_16];
    [UTF_16BE];
    [UTF_16LE];
    [UTF_32];
    [UTF_32BE];
    [UTF_32LE];
    [UNICODE_1_1_UTF_7; UTF_7; CSUNICODE11UTF7];
    [UCS_2_INTERNAL];
    [UCS_2_SWAPPED];
    [UCS_4_INTERNAL];
    [UCS_4_SWAPPED];
    [C99];
    [JAVA];
    [CP819; IBM819; ISO_8859_1; ISO_IR_100; ISO8859_1; ISO_8859_1_1987; L1; LATIN1; CSISOLATIN1];
    [ISO_8859_2; ISO_IR_101; ISO8859_2; ISO_8859_2_1987; L2; LATIN2; CSISOLATIN2];
    [ISO_8859_3; ISO_IR_109; ISO8859_3; ISO_8859_3_1988; L3; LATIN3; CSISOLATIN3];
    [ISO_8859_4; ISO_IR_110; ISO8859_4; ISO_8859_4_1988; L4; LATIN4; CSISOLATIN4];
    [CYRILLIC; ISO_8859_5; ISO_IR_144; ISO8859_5; ISO_8859_5_1988; CSISOLATINCYRILLIC];
    [ARABIC; ASMO_708; ECMA_114; ISO_8859_6; ISO_IR_127; ISO8859_6; ISO_8859_6_1987; CSISOLATINARABIC];
    [ECMA_118; ELOT_928; GREEK; GREEK8; ISO_8859_7; ISO_IR_126; ISO8859_7; ISO_8859_7_1987; CSISOLATINGREEK];
    [HEBREW; ISO_8859_8; ISO_IR_138; ISO8859_8; ISO_8859_8_1988; CSISOLATINHEBREW];
    [ISO_8859_9; ISO_IR_148; ISO8859_9; ISO_8859_9_1989; L5; LATIN5; CSISOLATIN5];
    [ISO_8859_10; ISO_IR_157; ISO8859_10; ISO_8859_10_1992; L6; LATIN6; CSISOLATIN6];
    [ISO_8859_13; ISO_IR_179; ISO8859_13; L7; LATIN7];
    [ISO_8859_14; ISO_CELTIC; ISO_IR_199; ISO8859_14; ISO_8859_14_1998; L8; LATIN8];
    [ISO_8859_15; ISO_IR_203; ISO8859_15; ISO_8859_15_1998];
    [ISO_8859_16; ISO_IR_226; ISO8859_16; ISO_8859_16_2000];
    [KOI8_R; CSKOI8R];
    [KOI8_U];
    [KOI8_RU];
    [CP1250; MS_EE; WINDOWS_1250];
    [CP1251; MS_CYRL; WINDOWS_1251];
    [CP1252; MS_ANSI; WINDOWS_1252];
    [CP1253; MS_GREEK; WINDOWS_1253];
    [CP1254; MS_TURK; WINDOWS_1254];
    [CP1255; MS_HEBR; WINDOWS_1255];
    [CP1256; MS_ARAB; WINDOWS_1256];
    [CP1257; WINBALTRIM; WINDOWS_1257];
    [CP1258; WINDOWS_1258];
    [I_850; CP850; IBM850; CSPC850MULTILINGUAL];
    [I_862; CP862; IBM862; CSPC862LATINHEBREW];
    [I_866; CP866; IBM866; CSIBM866];
    [MAC; MACINTOSH; MACROMAN; CSMACINTOSH];
    [MACCENTRALEUROPE];
    [MACICELAND];
    [MACCROATIAN];
    [MACROMANIA];
    [MACCYRILLIC];
    [MACUKRAINE];
    [MACGREEK];
    [MACTURKISH];
    [MACHEBREW];
    [MACARABIC];
    [MACTHAI];
    [HP_ROMAN8; R8; ROMAN8; CSHPROMAN8]; (* no region *)
    [NEXTSTEP]; (* no region *)
    [ARMSCII_8];
    [GEORGIAN_ACADEMY];
    [GEORGIAN_PS];
    [KOI8_T];
    [MULELAO_1];
    [CP1133; IBM_CP1133];
    [ISO_IR_166; TIS_620; TIS620; TIS620_0; TIS620_2529_1; TIS620_2533_0; TIS620_2533_1];
    [CP874; WINDOWS_874];
    [VISCII; VISCII1_1_1; CSVISCII];
    [TCVN; TCVN_5712; TCVN5712_1; TCVN5712_1_1993];
    [ISO_IR_14; ISO646_JP; JIS_C6220_1969_RO; JP; CSISO14JISC6220RO];
    [JISX0201_1976; JIS_X0201; X0201; CSHALFWIDTHKATAKANA];
    [ISO_IR_87; JIS0208; JIS_C6226_1983; JIS_X0208; JIS_X0208_1983; JIS_X0208_1990; X0208; CSISO87JISX0208];
    [ISO_IR_159; JIS_X0212; JIS_X0212_1990; JIS_X0212_1990_0; X0212; CSISO159JISX02121990];
    [CN; GB_1988_80; ISO_IR_57; ISO646_CN; CSISO57GB1988];
    [CHINESE; GB_2312_80; ISO_IR_58; CSISO58GB231280];
    [CN_GB_ISOIR165; ISO_IR_165];
    [ISO_IR_149; KOREAN; KSC_5601; KS_C_5601_1987; KS_C_5601_1989; CSKSC56011987];
    [EUC_JP; EUCJP; EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE; CSEUCPKDFMTJAPANESE];
    [MS_KANJI; SHIFT_JIS; SJIS; CSSHIFTJIS];
    [CP932];
    [ISO_2022_JP; CSISO2022JP];
    [ISO_2022_JP_1];
    [ISO_2022_JP_2; CSISO2022JP2];
    [CN_GB; EUC_CN; EUCCN; GB2312; CSGB2312];
    [CP936; GBK];
    [GB18030];
    [ISO_2022_CN; CSISO2022CN];
    [ISO_2022_CN_EXT];
    [HZ; HZ_GB_2312];
    [EUC_TW; EUCTW; CSEUCTW];
    [BIG_5; BIG_FIVE; BIG5; BIGFIVE; CN_BIG5; CSBIG5];
    [CP950];
    [BIG5_HKSCS; BIG5HKSCS];
    [EUC_KR; EUCKR; CSEUCKR];
    [CP949; UHC];
    [CP1361; JOHAB];
    [ISO_2022_KR; CSISO2022KR];
    [I_437; CP437; IBM437; CSPC8CODEPAGE437];
    [CP737];
    [CP775; IBM775; CSPC775BALTIC];
    [I_852; CP852; IBM852; CSPCP852];
    [CP853]; (* no region *)
    [I_855; CP855; IBM855; CSIBM855];
    [I_857; CP857; IBM857; CSIBM857];
    [CP858]; (* no region *)
    [I_860; CP860; IBM860; CSIBM860];
    [I_861; CP_IS; CP861; IBM861; CSIBM861];
    [I_863; CP863; IBM863; CSIBM863];
    [CP864; IBM864; CSIBM864];
    [I_865; CP865; IBM865; CSIBM865];
    [I_869; CP_GR; CP869; IBM869; CSIBM869];
    [CP1125];
  ]
(* Region table for plain ASCII.  One inner list per encoding, holding all
   of its alias constructors. *)
let ascii =
  [ [ANSI_X3_4_1968; ANSI_X3_4_1986; ASCII; CP367; IBM367; ISO_IR_6;
     ISO646_US; ISO_646_IRV_1991; US; US_ASCII; CSASCII];
    [C99] ]
(* Region table for Arabic: one inner list per encoding, holding all of
   its alias constructors.

   Fix: the CP864 alias group was listed twice, which made CP864 a
   duplicate candidate; it now appears once. *)
let arabic =
  [
    [CP1256; MS_ARAB; WINDOWS_1256];
    [ARABIC; ASMO_708; ECMA_114; ISO_8859_6; ISO_IR_127; ISO8859_6; ISO_8859_6_1987; CSISOLATINARABIC];
    [CP864; IBM864; CSIBM864];
    [MACARABIC];
  ]
(* Region table for Armenian (a single encoding without aliases). *)
let armenian =
  [ [ARMSCII_8] ]
(* Region table for Baltic languages: one inner list per encoding, holding
   all of its alias constructors.
   NOTE(review): MACUKRAINE and MACCROATIAN look out of place in a Baltic
   table -- confirm whether they are intended here. *)
let baltic =
  [ [CP1257; WINBALTRIM; WINDOWS_1257];
    [ISO_8859_13; ISO_IR_179; ISO8859_13; L7; LATIN7];
    [ISO_8859_4; ISO_IR_110; ISO8859_4; ISO_8859_4_1988; L4; LATIN4;
     CSISOLATIN4];
    [CP775; IBM775; CSPC775BALTIC];
    [MACUKRAINE];
    [MACCROATIAN] ]
(* Region table for Celtic languages: the ISO-8859-14 alias group. *)
let celtic =
  [ [ISO_8859_14; ISO_CELTIC; ISO_IR_199; ISO8859_14; ISO_8859_14_1998;
     L8; LATIN8] ]
(* Region table for Central European languages: one inner list per
   encoding, holding all of its alias constructors. *)
let central_european =
  [ [CP1250; MS_EE; WINDOWS_1250];
    [ISO_8859_2; ISO_IR_101; ISO8859_2; ISO_8859_2_1987; L2; LATIN2;
     CSISOLATIN2];
    [I_852; CP852; IBM852; CSPCP852];
    [MACCENTRALEUROPE] ]
(* Region table for simplified Chinese: one inner list per encoding,
   holding all of its alias constructors. *)
let chinese_simplified =
  [ [GB18030];
    [CP936; GBK];
    [CN_GB; EUC_CN; EUCCN; GB2312; CSGB2312];
    [CHINESE; GB_2312_80; ISO_IR_58; CSISO58GB231280];
    [CN; GB_1988_80; ISO_IR_57; ISO646_CN; CSISO57GB1988];
    [CN_GB_ISOIR165; ISO_IR_165];
    [ISO_2022_CN; CSISO2022CN];
    [ISO_2022_CN_EXT];
    [HZ; HZ_GB_2312] ]
(* Region table for traditional Chinese: one inner list per encoding,
   holding all of its alias constructors. *)
let chinese_traditional =
  [ [BIG_5; BIG_FIVE; BIG5; BIGFIVE; CN_BIG5; CSBIG5];
    [BIG5_HKSCS; BIG5HKSCS];
    [EUC_TW; EUCTW; CSEUCTW];
    [CP950] ]
(* Region table for Cyrillic-script languages: one inner list per
   encoding, holding all of its alias constructors. *)
let cyrillic =
  [ [CP1251; MS_CYRL; WINDOWS_1251];
    [I_866; CP866; IBM866; CSIBM866];
    [KOI8_R; CSKOI8R];
    [CYRILLIC; ISO_8859_5; ISO_IR_144; ISO8859_5; ISO_8859_5_1988;
     CSISOLATINCYRILLIC];
    [I_855; CP855; IBM855; CSIBM855];
    [KOI8_U];
    [KOI8_RU];
    [MACCYRILLIC];
    [CP1125] ]
(* Region table for Georgian: two encodings, neither with aliases. *)
let georgian =
  [ [GEORGIAN_ACADEMY];
    [GEORGIAN_PS] ]
(* Region table for Greek: one inner list per encoding, holding all of
   its alias constructors. *)
let greek =
  [ [CP1253; MS_GREEK; WINDOWS_1253];
    [ECMA_118; ELOT_928; GREEK; GREEK8; ISO_8859_7; ISO_IR_126; ISO8859_7;
     ISO_8859_7_1987; CSISOLATINGREEK];
    [MACGREEK];
    [CP737];
    [I_869; CP_GR; CP869; IBM869; CSIBM869] ]
(* Region table for Hebrew: one inner list per encoding, holding all of
   its alias constructors. *)
let hebrew =
  [ [CP1255; MS_HEBR; WINDOWS_1255];
    [HEBREW; ISO_8859_8; ISO_IR_138; ISO8859_8; ISO_8859_8_1988;
     CSISOLATINHEBREW];
    [I_862; CP862; IBM862; CSPC862LATINHEBREW];
    [MACHEBREW] ]
(* Region table for Japanese: one inner list per encoding, holding all of
   its alias constructors. *)
let japanese =
  [ [EUC_JP; EUCJP; EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE;
     CSEUCPKDFMTJAPANESE];
    [ISO_2022_JP; CSISO2022JP];
    [ISO_2022_JP_1];
    [ISO_2022_JP_2; CSISO2022JP2];
    [MS_KANJI; SHIFT_JIS; SJIS; CSSHIFTJIS];
    [ISO_IR_14; ISO646_JP; JIS_C6220_1969_RO; JP; CSISO14JISC6220RO];
    [JISX0201_1976; JIS_X0201; X0201; CSHALFWIDTHKATAKANA];
    [ISO_IR_87; JIS0208; JIS_C6226_1983; JIS_X0208; JIS_X0208_1983;
     JIS_X0208_1990; X0208; CSISO87JISX0208];
    [ISO_IR_159; JIS_X0212; JIS_X0212_1990; JIS_X0212_1990_0; X0212;
     CSISO159JISX02121990];
    [CP932] ]
(* Region table for Korean: one inner list per encoding, holding all of
   its alias constructors. *)
let korean =
  [ [CP949; UHC];
    [CP1361; JOHAB];
    [EUC_KR; EUCKR; CSEUCKR];
    [ISO_2022_KR; CSISO2022KR];
    [ISO_IR_149; KOREAN; KSC_5601; KS_C_5601_1987; KS_C_5601_1989;
     CSKSC56011987] ]
(* Region table for Nordic languages: one inner list per encoding,
   holding all of its alias constructors. *)
let nordic =
  [ [ISO_8859_10; ISO_IR_157; ISO8859_10; ISO_8859_10_1992; L6; LATIN6;
     CSISOLATIN6];
    [I_861; CP_IS; CP861; IBM861; CSIBM861];
    [MACICELAND];
    [I_865; CP865; IBM865; CSIBM865] ]
(* Region table for Romanian: one inner list per encoding, holding all of
   its alias constructors. *)
let romanian =
  [ [ISO_8859_16; ISO_IR_226; ISO8859_16; ISO_8859_16_2000];
    [MACROMANIA] ]
(* Region table for South European languages: the ISO-8859-3 alias
   group. *)
let south_european =
  [ [ISO_8859_3; ISO_IR_109; ISO8859_3; ISO_8859_3_1988; L3; LATIN3;
     CSISOLATIN3] ]
(* Region table for Tajik (a single encoding without aliases). *)
let tajik =
  [ [KOI8_T] ]
(* Region table for Thai; it also carries the Lao encodings MULELAO_1 and
   CP1133.  One inner list per encoding, holding all of its alias
   constructors. *)
let thai =
  [ [CP874; WINDOWS_874];
    [ISO_IR_166; TIS_620; TIS620; TIS620_0; TIS620_2529_1; TIS620_2533_0;
     TIS620_2533_1];
    [MULELAO_1];
    [CP1133; IBM_CP1133];
    [MACTHAI] ]
(* Region table for Turkish: one inner list per encoding, holding all of
   its alias constructors. *)
let turkish =
  [ [CP1254; MS_TURK; WINDOWS_1254];
    [ISO_8859_9; ISO_IR_148; ISO8859_9; ISO_8859_9_1989; L5; LATIN5;
     CSISOLATIN5];
    [I_857; CP857; IBM857; CSIBM857];
    [MACTURKISH] ]
(* Region table for the Unicode transformation formats (UCS-2, UCS-4,
   UTF-16, UTF-32, UTF-7 and their byte-order variants).  UTF_8 itself is
   not part of this table.  One inner list per encoding, holding all of
   its alias constructors. *)
let unicode =
  [ [ISO_10646_UCS_2; UCS_2; CSUNICODE];
    [UCS_2BE; UNICODE_1_1; UNICODEBIG; CSUNICODE11];
    [UCS_2LE; UNICODELITTLE];
    [ISO_10646_UCS_4; UCS_4; CSUCS4];
    [UCS_4BE];
    [UCS_4LE];
    [UTF_16];
    [UTF_16BE];
    [UTF_16LE];
    [UTF_32];
    [UTF_32BE];
    [UTF_32LE];
    [UNICODE_1_1_UTF_7; UTF_7; CSUNICODE11UTF7];
    [UCS_2_INTERNAL];
    [UCS_2_SWAPPED];
    [UCS_4_INTERNAL];
    [UCS_4_SWAPPED] ]
(* Region table for Vietnamese: one inner list per encoding, holding all
   of its alias constructors. *)
let vietnamese =
  [ [CP1258; WINDOWS_1258];
    [VISCII; VISCII1_1_1; CSVISCII];
    [TCVN; TCVN_5712; TCVN5712_1; TCVN5712_1_1993] ]
(* Region table for Western European languages: one inner list per
   encoding, holding all of its alias constructors.

   Fix: the Latin-1 group listed [ISO_8859_1] twice; the redundant entry
   is removed so the group matches the one in [charset_aliases]. *)
let western_european =
  [
    [CP1252; MS_ANSI; WINDOWS_1252];
    [ISO_8859_15; ISO_IR_203; ISO8859_15; ISO_8859_15_1998];
    [I_850; CP850; IBM850; CSPC850MULTILINGUAL];
    [CP819; IBM819; ISO_8859_1; ISO_IR_100; ISO8859_1; ISO_8859_1_1987; L1; LATIN1; CSISOLATIN1];
    [MAC; MACINTOSH; MACROMAN; CSMACINTOSH];
    [I_437; CP437; IBM437; CSPC8CODEPAGE437];
    [I_860; CP860; IBM860; CSIBM860];
    [I_863; CP863; IBM863; CSIBM863];
  ]
(* [convert ~from_charset ~to_charset s] re-encodes [s] from
   [from_charset] to [to_charset] through [convert_string].  The empty
   string is returned as is, without calling the converter.  Whatever
   [convert_string] raises is propagated to the caller. *)
let convert ~from_charset ~to_charset s =
  match s with
  | "" -> s
  | _ ->
    let target = charset_to_string to_charset in
    let source = charset_to_string from_charset in
    (* convert_string takes the target codeset before the source one *)
    convert_string s target source
(* [safe_convert enc s] converts [s] from the charset named [enc] to
   UTF-8.  An empty [enc] means "no conversion".  If the conversion raises
   for any reason, [s] is handed back unchanged. *)
let safe_convert enc s =
  if enc = "" then s
  else
    try
      convert ~from_charset:(charset_from_string enc) ~to_charset:UTF_8 s
    with _ -> s
1667 (* Locale specific conversions *)
module Locale = struct

  (* FIXME move away! *)
  let () =
    (* block signals until core started correctly *)
    MlUnix.set_signal Sys.sigint (Sys.Signal_handle (fun _ -> ()));
    MlUnix.set_signal Sys.sigterm (Sys.Signal_handle (fun _ -> ()))

  (* Charset reported by [get_charset]; ASCII when that lookup or the name
     translation raises. *)
  let locale =
    try
      let cs = get_charset () in
      charset_from_string cs
    with _ -> ASCII

  (* Name of [locale] in the form expected by [convert_string]. *)
  let locale_string = charset_to_string locale

  (* Candidate source encodings tried by [fast_encode], in order; filled in
     by [set_default_charset_list].  [nenc] holds how many of them are
     tried. *)
  let (enc_list : string list ref) = ref []
  let nenc = ref 0

  (* Placeholder emitted for every character that cannot be converted. *)
  let char_const = "_"

  (* Normalized language key of the running system. *)
  let default_language =
    let s = get_default_language () in
    let s = normalize_language s in
    s

  (* Region tables known to this module.  [unicode] is not listed here. *)
  let all_regions =
    [
      ascii;
      arabic;
      armenian;
      baltic;
      celtic;
      central_european;
      chinese_simplified;
      chinese_traditional;
      cyrillic;
      georgian;
      greek;
      hebrew;
      japanese;
      korean;
      nordic;
      romanian;
      south_european;
      tajik;
      thai;
      turkish;
      vietnamese;
      western_european;
    ]

  (* See http://www.gnu.org/software/gettext/manual/html_chapter/gettext_15.html#SEC221
   * The strategy is not perfect. Any comment to improve it is highly appreciated.
   * The charset list shall be improved according to the language detected on the
   * target machine.
   *
   * [charset_list_from_language lang] returns the alias groups to try for
   * the normalized language key [lang]: the region table(s) selected for
   * that language first, followed by [ascii] and then [unicode].  Unknown
   * keys only get [ascii] and [unicode].
   *)
  let charset_list_from_language lang =
    let regions =
      match lang with
      | "AR" -> [arabic]
      | "HY" -> [armenian]
      | "LT" | "LV" | "MI" -> [baltic]
      | "CY" -> [celtic; western_european]
      | "BS" | "CS" | "HR" | "HU" | "PL" | "SK" | "SL" -> [central_european]
      | "SH" | "SR" -> [central_european; cyrillic]
      | "ZH_CN" -> [chinese_simplified]
      | "ZH_TW" -> [chinese_traditional]
      | "ZH" -> [chinese_traditional; chinese_simplified]
      | "BE" | "BG" | "MK" | "RU" | "UK" -> [cyrillic]
      | "KA" -> [georgian]
      | "EL" -> [greek]
      | "YI" | "HE" | "IW" -> [hebrew]
      | "JA" -> [japanese]
      | "KO" -> [korean]
      | "RO" -> [romanian; central_european]
      | "MT" -> [south_european]
      | "TG" -> [tajik]
      | "TH" -> [thai]
      | "TR" -> [turkish]
      | "VI" -> [vietnamese]
      | "AF" | "AN" | "BR" | "CA" | "DA" | "DE" | "EN" | "ES"
      | "ET" | "EU" | "FI" | "FO" | "FR" | "GA" | "GL" | "GV"
      | "ID" | "IS" | "IT" | "KL" | "KW" | "MS" | "NL" | "NN"
      | "NO" | "OC" | "PT" | "SQ" | "SV" | "TL" | "UZ" | "WA" ->
        [western_european]
      | _ -> []
    in
    List.flatten (regions @ [ascii; unicode])

  (* [set_default_charset_list lang] rebuilds [enc_list] (and [nenc]) for
     the language key [lang]. *)
  let set_default_charset_list (lang : string) =
    (* Let's get rid of charset aliases: keep the head of each group only *)
    enc_list :=
      List.map
        (fun group -> charset_to_string (List.hd group))
        (charset_list_from_language lang);
(*    Printf2.lprintf "List of charmap used to convert the strings:\n";
      List.iter (fun enc ->
        Printf2.lprintf "  Use encoding %s\n" enc;
      ) !enc_list; *)
    nenc := List.length !enc_list

  (* When false, [to_locale] returns its argument untouched. *)
  let conversion_enabled = ref true

  (* [slow_encode_from_utf8 s to_codeset] converts the UTF-8 string [s] to
     [to_codeset] one character at a time.  A character whose extraction or
     conversion raises is replaced by [char_const].  The result is
     accumulated in a [Buffer] instead of by repeated [^], which copied the
     whole partial result on every character. *)
  let slow_encode_from_utf8 s to_codeset =
    let slen = utf8_length s in
    let out = Buffer.create (String.length s) in
    let scratch = Buffer.create 10 in
    for i = 0 to (slen - 1) do
      try
        let uchar = utf8_get s i in
        add_uchar scratch uchar;
        let one = Buffer.contents scratch in
        Buffer.reset scratch;
        Buffer.add_string out (convert_string one to_codeset "UTF-8")
      with _ ->
        Buffer.add_string out char_const
    done;
    Buffer.contents out

  (* [slow_encode s to_codeset] is the last-resort conversion.  UTF-8 input
     is delegated to [slow_encode_from_utf8]; anything else is treated as
     being in the locale charset and converted byte by byte, with
     [char_const] standing in for every byte that fails to convert. *)
  let slow_encode s to_codeset =
    if is_utf8 s
    then slow_encode_from_utf8 s to_codeset
    else begin
      let slen = String.length s in
      let out = Buffer.create slen in
      for i = 0 to (slen - 1) do
        try
          Buffer.add_string out
            (convert_string (String.sub s i 1) to_codeset locale_string)
        with _ ->
          Buffer.add_string out char_const
      done;
      Buffer.contents out
    end

  (* [fast_encode s to_codeset] tries to convert the whole of [s] at once,
     assuming in turn each of the first [!nenc] encodings of [!enc_list] as
     the source; the first conversion that does not raise wins.  If none
     succeeds, [slow_encode] is used.  The list is walked directly rather
     than re-indexed with [List.nth] for every attempt. *)
  let fast_encode s to_codeset =
    let rec attempt remaining candidates =
      match candidates with
      | from_codeset :: rest when remaining > 0 ->
        begin
          try
            convert_string s to_codeset from_codeset
          with _ -> attempt (remaining - 1) rest
        end
      | _ -> slow_encode s to_codeset
    in
    attempt !nenc !enc_list

  (* [to_utf8 s] returns [s] itself when it is empty or already valid
     UTF-8, and otherwise the result of [fast_encode] towards "UTF-8". *)
  let to_utf8 s =
    if s = ""
    then s
    else if is_utf8 s
    then s
    else fast_encode s "UTF-8"

  (* [to_locale s] converts [s] to the locale charset by way of UTF-8.
     Empty strings, and every string while [conversion_enabled] is false,
     are returned unchanged.  If the one-shot conversion raises, the
     per-character fallback is used. *)
  let to_locale s =
    if s = "" || not !conversion_enabled
    then s
    else begin
      let s = to_utf8 s in
      match locale with
      | UTF_8 -> s
      | _ ->
        begin
          try
            convert_string s locale_string "UTF-8"
          with _ ->
            slow_encode_from_utf8 s locale_string
        end
    end

  (* Initialise the candidate encoding list for the detected language. *)
  let () =
    set_default_charset_list default_language

end (* Locale *)