1 ######################## BEGIN LICENSE BLOCK ########################
2 # The Original Code is Mozilla Communicator client code.
4 # The Initial Developer of the Original Code is
5 # Netscape Communications Corporation.
6 # Portions created by the Initial Developer are Copyright (C) 1998
7 # the Initial Developer. All Rights Reserved.
10 # Mark Pilgrim - port to Python
12 # This library is free software; you can redistribute it and/or
13 # modify it under the terms of the GNU Lesser General Public
14 # License as published by the Free Software Foundation; either
15 # version 2.1 of the License, or (at your option) any later version.
17 # This library is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 # Lesser General Public License for more details.
22 # You should have received a copy of the GNU Lesser General Public
23 # License along with this library; if not, write to the Free Software
24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 ######################### END LICENSE BLOCK #########################
28 from .euctwfreq
import (EUCTWCharToFreqOrder
, EUCTW_TABLE_SIZE
,
29 EUCTW_TYPICAL_DISTRIBUTION_RATIO
)
30 from .euckrfreq
import (EUCKRCharToFreqOrder
, EUCKR_TABLE_SIZE
,
31 EUCKR_TYPICAL_DISTRIBUTION_RATIO
)
32 from .gb2312freq
import (GB2312CharToFreqOrder
, GB2312_TABLE_SIZE
,
33 GB2312_TYPICAL_DISTRIBUTION_RATIO
)
34 from .big5freq
import (Big5CharToFreqOrder
, BIG5_TABLE_SIZE
,
35 BIG5_TYPICAL_DISTRIBUTION_RATIO
)
36 from .jisfreq
import (JISCharToFreqOrder
, JIS_TABLE_SIZE
,
37 JIS_TYPICAL_DISTRIBUTION_RATIO
)
38 from .compat
import wrap_ord
# Once more than this many characters have been counted, got_enough_data()
# reports that a conclusion can be drawn.
ENOUGH_DATA_THRESHOLD = 1024
# Upper and lower bounds of the value returned by get_confidence().
SURE_YES = 0.99
SURE_NO = 0.01
# get_confidence() answers SURE_NO until more than this many frequent
# characters have been seen.
MINIMUM_DATA_THRESHOLD = 3


class CharDistributionAnalysis:
    """Base class for the character distribution analysers.

    Subclasses provide the frequency table, its size and the typical
    distribution ratio in their constructor, and override get_order() to map
    a two-byte character onto an index ("order") into that table.
    """

    def __init__(self):
        # Mapping table to get frequency order from char order (get from
        # get_order())
        self._mCharToFreqOrder = None
        self._mTableSize = None  # Size of above table
        # This is a constant value which varies from language to language,
        # used in calculating confidence.  See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self._mTypicalDistributionRatio = None
        self.reset()

    def reset(self):
        """reset analyser, clear any state"""
        # If this flag is set to True, detection is done and conclusion has
        # been made
        self._mDone = False
        self._mTotalChars = 0  # Total characters encountered
        # The number of characters whose frequency order is less than 512
        self._mFreqChars = 0

    def feed(self, aBuf, aCharLen):
        """feed a character with known length"""
        if aCharLen == 2:
            # we only care about 2-bytes character in our distribution analysis
            order = self.get_order(aBuf)
        else:
            order = -1
        if order < 0:
            # get_order() could not place the character; do not count it
            return
        self._mTotalChars += 1
        # Only orders inside the table can be looked up; anything else is
        # counted in the total but never as a frequent character.
        if order < self._mTableSize:
            if self._mCharToFreqOrder[order] < 512:
                self._mFreqChars += 1

    def get_confidence(self):
        """return confidence based on existing data"""
        # if we didn't receive any character in our consideration range,
        # return negative answer
        if (self._mTotalChars <= 0
                or self._mFreqChars <= MINIMUM_DATA_THRESHOLD):
            return SURE_NO

        if self._mTotalChars != self._mFreqChars:
            rare_chars = self._mTotalChars - self._mFreqChars
            r = (self._mFreqChars
                 / (rare_chars * self._mTypicalDistributionRatio))
            if r < SURE_YES:
                return r

        # normalize confidence (we don't want to be 100% sure)
        return SURE_YES

    def got_enough_data(self):
        """return True once more than ENOUGH_DATA_THRESHOLD chars were fed"""
        # It is not necessary to receive all data to draw conclusion.
        # For charset detection, certain amount of data is enough
        return self._mTotalChars > ENOUGH_DATA_THRESHOLD

    def get_order(self, aBuf):
        """return the frequency-table index for aBuf, or -1 if unknown"""
        # We do not handle characters based on the original encoding string,
        # but convert this encoding string to a number, here called order.
        # This allows multiple encodings of a language to share one frequency
        # table.  The base class knows no encoding, so nothing is ever placed.
        return -1
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analyser using the EUC-TW frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = EUCTWCharToFreqOrder
        self._mTableSize = EUCTW_TABLE_SIZE
        self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """return the frequency-table index of the 2-byte char in aBuf

        Returns -1 when the first byte is below 0xC4.
        """
        # for euc-TW encoding, we are interested
        #   first  byte range: 0xc4 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char = wrap_ord(aBuf[0])
        if first_char < 0xC4:
            return -1
        # 94 second-byte values (0xa1..0xfe) per first byte
        return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analyser using the EUC-KR frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = EUCKRCharToFreqOrder
        self._mTableSize = EUCKR_TABLE_SIZE
        self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """return the frequency-table index of the 2-byte char in aBuf

        Returns -1 when the first byte is below 0xB0.
        """
        # for euc-KR encoding, we are interested
        #   first  byte range: 0xb0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char = wrap_ord(aBuf[0])
        if first_char < 0xB0:
            return -1
        # 94 second-byte values (0xa1..0xfe) per first byte
        return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1
class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analyser using the GB2312 frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = GB2312CharToFreqOrder
        self._mTableSize = GB2312_TABLE_SIZE
        self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """return the frequency-table index of the 2-byte char in aBuf

        Returns -1 unless the first byte is at least 0xB0 and the second
        byte is at least 0xA1.
        """
        # for GB2312 encoding, we are interested
        #  first  byte range: 0xb0 -- 0xfe
        #  second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
        if (first_char < 0xB0) or (second_char < 0xA1):
            return -1
        # 94 second-byte values (0xa1..0xfe) per first byte
        return 94 * (first_char - 0xB0) + second_char - 0xA1
class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analyser using the Big5 frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = Big5CharToFreqOrder
        self._mTableSize = BIG5_TABLE_SIZE
        self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """return the frequency-table index of the 2-byte char in aBuf

        Returns -1 when the first byte is below 0xA4.
        """
        # for big5 encoding, we are interested
        #   first  byte range: 0xa4 -- 0xfe
        #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
        if first_char < 0xA4:
            return -1
        # Each first byte owns a row of 157 slots: 63 for second bytes
        # 0x40..0x7e followed by 94 for second bytes 0xa1..0xfe.
        row_start = 157 * (first_char - 0xA4)
        if second_char >= 0xA1:
            return row_start + second_char - 0xA1 + 63
        return row_start + second_char - 0x40
class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analyser for Shift_JIS using the JIS table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """return the frequency-table index of the 2-byte char in aBuf

        Returns -1 when the first byte is outside 0x81-0x9F and 0xE0-0xEF,
        or when the second byte is above 0x7F.
        """
        # for sjis encoding, we are interested
        #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
        #   second byte range: 0x40 -- 0x7e,  0x81 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
        if (first_char >= 0x81) and (first_char <= 0x9F):
            order = 188 * (first_char - 0x81)
        elif (first_char >= 0xE0) and (first_char <= 0xEF):
            # the 31 rows of 0x81..0x9f come first
            order = 188 * (first_char - 0xE0 + 31)
        else:
            return -1
        order = order + second_char - 0x40
        # NOTE(review): this discards every character whose second byte is
        # above 0x7F.  A contiguous 188-slot row would instead need
        # ``order -= 1`` here to skip the unused 0x7F slot -- confirm against
        # the reference implementation before changing detection behaviour.
        if second_char > 0x7F:
            order = -1
        return order
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analyser for EUC-JP using the JIS table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """return the frequency-table index of the 2-byte char in aBuf

        Returns -1 when the first byte is below 0xA0.  A first byte of
        exactly 0xA0 also yields a negative value, because rows are counted
        from 0xA1.
        """
        # for euc-JP encoding, we are interested
        #   first  byte range: 0xa0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char = wrap_ord(aBuf[0])
        if first_char < 0xA0:
            return -1
        # 94 second-byte values (0xa1..0xfe) per first byte
        return 94 * (first_char - 0xA1) + wrap_ord(aBuf[1]) - 0xA1