patch #7180
[mldonkey.git] / src / utils / lib / charset.mli
blobb225942aad371c478bee46f89757a08352296631
1 (* Copyright 2005 b8_bavard, INRIA *)
2 (*
3 This file is part of mldonkey.
5 mldonkey is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
10 mldonkey is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with mldonkey; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *)
(** A Unicode character, represented as a plain [int]; this is the type
    returned by [utf8_get] and consumed by [add_uchar].
    NOTE(review): presumably the code point value — confirm. *)
20 type uchar = int
(** Identifiers of the character sets known to this module. Values are
    obtained from and rendered to strings with [charset_from_string] and
    [charset_to_string], and are passed to [convert].
    NOTE(review): the constructors listed on one row are presumably aliases
    of a single encoding — confirm against the implementation. *)
22 type charset =
23 | ANSI_X3_4_1968 | ANSI_X3_4_1986 | ASCII | CP367 | IBM367 | ISO_IR_6 | ISO646_US | ISO_646_IRV_1991 | US | US_ASCII | CSASCII
24 | UTF_8
25 | ISO_10646_UCS_2 | UCS_2 | CSUNICODE
26 | UCS_2BE | UNICODE_1_1 | UNICODEBIG | CSUNICODE11
27 | UCS_2LE | UNICODELITTLE
28 | ISO_10646_UCS_4 | UCS_4 | CSUCS4
29 | UCS_4BE
30 | UCS_4LE
31 | UTF_16
32 | UTF_16BE
33 | UTF_16LE
34 | UTF_32
35 | UTF_32BE
36 | UTF_32LE
37 | UNICODE_1_1_UTF_7 | UTF_7 | CSUNICODE11UTF7
38 | UCS_2_INTERNAL
39 | UCS_2_SWAPPED
40 | UCS_4_INTERNAL
41 | UCS_4_SWAPPED
42 | C99
43 | JAVA
44 | CP819 | IBM819 | ISO_8859_1 | ISO_IR_100 | ISO8859_1 | ISO_8859_1_1987 | L1 | LATIN1 | CSISOLATIN1
45 | ISO_8859_2 | ISO_IR_101 | ISO8859_2 | ISO_8859_2_1987 | L2 | LATIN2 | CSISOLATIN2
46 | ISO_8859_3 | ISO_IR_109 | ISO8859_3 | ISO_8859_3_1988 | L3 | LATIN3 | CSISOLATIN3
47 | ISO_8859_4 | ISO_IR_110 | ISO8859_4 | ISO_8859_4_1988 | L4 | LATIN4 | CSISOLATIN4
48 | CYRILLIC | ISO_8859_5 | ISO_IR_144 | ISO8859_5 | ISO_8859_5_1988 | CSISOLATINCYRILLIC
49 | ARABIC | ASMO_708 | ECMA_114 | ISO_8859_6 | ISO_IR_127 | ISO8859_6 | ISO_8859_6_1987 | CSISOLATINARABIC
50 | ECMA_118 | ELOT_928 | GREEK | GREEK8 | ISO_8859_7 | ISO_IR_126 | ISO8859_7 | ISO_8859_7_1987 | CSISOLATINGREEK
51 | HEBREW | ISO_8859_8 | ISO_IR_138 | ISO8859_8 | ISO_8859_8_1988 | CSISOLATINHEBREW
52 | ISO_8859_9 | ISO_IR_148 | ISO8859_9 | ISO_8859_9_1989 | L5 | LATIN5 | CSISOLATIN5
53 | ISO_8859_10 | ISO_IR_157 | ISO8859_10 | ISO_8859_10_1992 | L6 | LATIN6 | CSISOLATIN6
54 | ISO_8859_13 | ISO_IR_179 | ISO8859_13 | L7 | LATIN7
55 | ISO_8859_14 | ISO_CELTIC | ISO8859_14 | ISO_IR_199 | ISO_8859_14_1998 | L8 | LATIN8
56 | ISO_8859_15 | ISO_IR_203 | ISO8859_15 | ISO_8859_15_1998
57 | ISO_8859_16 | ISO_IR_226 | ISO8859_16 | ISO_8859_16_2000
58 | KOI8_R | CSKOI8R
59 | KOI8_U
60 | KOI8_RU
61 | CP1250 | MS_EE | WINDOWS_1250
62 | CP1251 | MS_CYRL | WINDOWS_1251
63 | CP1252 | MS_ANSI | WINDOWS_1252
64 | CP1253 | MS_GREEK | WINDOWS_1253
65 | CP1254 | MS_TURK | WINDOWS_1254
66 | CP1255 | MS_HEBR | WINDOWS_1255
67 | CP1256 | MS_ARAB | WINDOWS_1256
68 | CP1257 | WINBALTRIM | WINDOWS_1257
69 | CP1258 | WINDOWS_1258
70 | I_850 | CP850 | IBM850 | CSPC850MULTILINGUAL
71 | I_862 | CP862 | IBM862 | CSPC862LATINHEBREW
72 | I_866 | CP866 | IBM866 | CSIBM866
73 | MAC | MACINTOSH | MACROMAN | CSMACINTOSH
74 | MACCENTRALEUROPE
75 | MACICELAND
76 | MACCROATIAN
77 | MACROMANIA
78 | MACCYRILLIC
79 | MACUKRAINE
80 | MACGREEK
81 | MACTURKISH
82 | MACHEBREW
83 | MACARABIC
84 | MACTHAI
85 | HP_ROMAN8 | R8 | ROMAN8 | CSHPROMAN8
86 | NEXTSTEP
87 | ARMSCII_8
88 | GEORGIAN_ACADEMY
89 | GEORGIAN_PS
90 | KOI8_T
91 | MULELAO_1
92 | CP1133 | IBM_CP1133
93 | ISO_IR_166 | TIS_620 | TIS620 | TIS620_0 | TIS620_2529_1 | TIS620_2533_0 | TIS620_2533_1
94 | CP874 | WINDOWS_874
95 | VISCII | VISCII1_1_1 | CSVISCII
96 | TCVN | TCVN_5712 | TCVN5712_1 | TCVN5712_1_1993
97 | ISO_IR_14 | ISO646_JP | JIS_C6220_1969_RO | JP | CSISO14JISC6220RO
98 | JISX0201_1976 | JIS_X0201 | X0201 | CSHALFWIDTHKATAKANA
99 | ISO_IR_87 | JIS0208 | JIS_C6226_1983 | JIS_X0208 | JIS_X0208_1983 | JIS_X0208_1990 | X0208 | CSISO87JISX0208
100 | ISO_IR_159 | JIS_X0212 | JIS_X0212_1990 | JIS_X0212_1990_0 | X0212 | CSISO159JISX02121990
101 | CN | GB_1988_80 | ISO_IR_57 | ISO646_CN | CSISO57GB1988
102 | CHINESE | GB_2312_80 | ISO_IR_58 | CSISO58GB231280
103 | CN_GB_ISOIR165 | ISO_IR_165
104 | ISO_IR_149 | KOREAN | KSC_5601 | KS_C_5601_1987 | KS_C_5601_1989 | CSKSC56011987
105 | EUC_JP | EUCJP | EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE | CSEUCPKDFMTJAPANESE
106 | MS_KANJI | SHIFT_JIS | SJIS | CSSHIFTJIS
107 | CP932
108 | ISO_2022_JP | CSISO2022JP
109 | ISO_2022_JP_1
110 | ISO_2022_JP_2 | CSISO2022JP2
111 | CN_GB | EUC_CN | EUCCN | GB2312 | CSGB2312
112 | CP936 | GBK
113 | GB18030
114 | ISO_2022_CN | CSISO2022CN
115 | ISO_2022_CN_EXT
116 | HZ | HZ_GB_2312
117 | EUC_TW | EUCTW | CSEUCTW
118 | BIG_5 | BIG_FIVE | BIG5 | BIGFIVE | CN_BIG5 | CSBIG5
119 | CP950
120 | BIG5_HKSCS | BIG5HKSCS
121 | EUC_KR | EUCKR | CSEUCKR
122 | CP949 | UHC
123 | CP1361 | JOHAB
124 | ISO_2022_KR | CSISO2022KR
125 | I_437 | CP437 | IBM437 | CSPC8CODEPAGE437
126 | CP737
127 | CP775 | IBM775 | CSPC775BALTIC
128 | I_852 | CP852 | IBM852 | CSPCP852
129 | CP853
130 | I_855 | CP855 | IBM855 | CSIBM855
131 | I_857 | CP857 | IBM857 | CSIBM857
132 | CP858
133 | I_860 | CP860 | IBM860 | CSIBM860
134 | I_861 | CP_IS | CP861 | IBM861 | CSIBM861
135 | I_863 | CP863 | IBM863 | CSIBM863
136 | CP864 | IBM864 | CSIBM864
137 | I_865 | CP865 | IBM865 | CSIBM865
138 | I_869 | CP_GR | CP869 | IBM869 | CSIBM869
139 | CP1125
141 (** [charset_from_string s] maps the string [s] to a [charset].
    @return ASCII if nothing matches *)
142 val charset_from_string : string -> charset
(** [charset_to_string c] maps the charset [c] to a string.
    NOTE(review): presumably the inverse of [charset_from_string] — confirm. *)
144 val charset_to_string : charset -> string
146 (** [convert ~from_charset ~to_charset s] converts [s] from [from_charset] to [to_charset].
147 @raise CharsetError if the string [s] is not entirely convertible.
    NOTE(review): [CharsetError] is not declared in the visible part of this
    interface — confirm where it is defined. *)
148 val convert : from_charset : charset -> to_charset : charset -> string -> string
150 (** [safe_convert enc s] converts [s] from encoding [enc] to UTF-8.
151 Returns the unmodified string if conversion fails.
152 *)
153 val safe_convert: string -> string -> string
155 (** [is_utf8 s]
156 returns [true] if [s] is a valid UTF-8 string, otherwise returns [false].
157 Other functions assume strings are valid UTF-8, so it is prudent
158 to test their validity for strings from untrusted origins. *)
159 val is_utf8 : string -> bool
161 (** [utf8_get s n]
162 returns the [n]-th Unicode character of [s].
163 The call requires O(n) time.
    NOTE(review): whether [n] counts from 0 is not stated here — confirm. *)
164 val utf8_get : string -> int -> uchar
166 (** [utf8_length s]
167 returns the number of Unicode characters contained in [s]. *)
168 val utf8_length : string -> int
170 (** [add_uchar buf u]
171 adds one Unicode character [u] to the buffer [buf]. *)
172 val add_uchar : Buffer.t -> uchar -> unit
174 (** Locale dependent conversions *)
175 module Locale : sig
177 (** [to_utf8 s]
178 Converts the input string to UTF-8. *)
179 val to_utf8 : string -> string
181 (** [to_locale s]
182 Converts the input string to the encoding of the current locale. *)
183 val to_locale : string -> string
185 val default_language : string
186 val locale_string : string
187 val conversion_enabled : bool ref