1 ######################## BEGIN LICENSE BLOCK ########################
2 # The Original Code is Mozilla Universal charset detector code.
4 # The Initial Developer of the Original Code is
5 # Netscape Communications Corporation.
6 # Portions created by the Initial Developer are Copyright (C) 2001
7 # the Initial Developer. All Rights Reserved.
10 # Mark Pilgrim - port to Python
11 # Shy Shalom - original C code
13 # This library is free software; you can redistribute it and/or
14 # modify it under the terms of the GNU Lesser General Public
15 # License as published by the Free Software Foundation; either
16 # version 2.1 of the License, or (at your option) any later version.
18 # This library is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # Lesser General Public License for more details.
23 # You should have received a copy of the GNU Lesser General Public
24 # License along with this library; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 ######################### END LICENSE BLOCK #########################
30 from . import constants
31 from .charsetprober
import CharSetProber
32 from .compat
import wrap_ord
# Number of counted sequences (self._mTotalSeqs) that must be exceeded before
# the confidence shortcuts below are evaluated.
SB_ENOUGH_REL_THRESHOLD = 1024
# A confidence above this value switches the prober state to eFoundIt.
POSITIVE_SHORTCUT_THRESHOLD = 0.95
# A confidence below this value switches the prober state to eNotMe.
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
# Upper bound that a character's order is compared against while feeding.
SYMBOL_CAT_ORDER = 250
# Index of the last sequence category; its counter is the numerator of the
# confidence ratio.
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
44 class SingleByteCharSetProber(CharSetProber
):
def __init__(self, model, reversed=False, nameProber=None):
    """Create a prober driven by a single-byte language model.

    model      -- mapping read by the other methods through the keys
                  'charsetName', 'keepEnglishLetter', 'charToOrderMap',
                  'precedenceMatrix' and 'mTypicalPositiveRatio'.
    reversed   -- when true, every character pair is looked up in the
                  precedence matrix with its two orders swapped.
    nameProber -- optional auxiliary prober; when given, its
                  get_charset_name() result is reported instead of the
                  model's own 'charsetName'.
    """
    CharSetProber.__init__(self)
    # Every other method reads self._mModel, so the model must be kept.
    self._mModel = model
    # TRUE if we need to reverse every pair in the model lookup
    self._mReversed = reversed
    # Optional auxiliary prober for name decision
    self._mNameProber = nameProber
    # Bring all per-run counters to their initial values.
    self.reset()

def reset(self):
    """Reset the base prober state and clear every per-run counter."""
    CharSetProber.reset(self)
    # char order of last character
    self._mLastOrder = 255
    # one counter per sequence category
    self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
    # Totals read by the confidence computation; they must start at zero so
    # that a fresh or reset prober never reads an unset attribute.
    self._mTotalSeqs = 0
    self._mTotalChar = 0
    # characters that fall in our sampling range
    self._mFreqChar = 0
def get_charset_name(self):
    """Return the name of the charset this prober detects.

    When an auxiliary name prober was supplied, the decision is delegated
    to its get_charset_name(); otherwise the model's own 'charsetName'
    entry is returned.
    """
    # The name prober is optional (it defaults to None), so it must be
    # checked before it is dereferenced.
    if self._mNameProber is not None:
        return self._mNameProber.get_charset_name()
    return self._mModel['charsetName']
# NOTE(review): the `def` line that owns the statements below is not visible
# in this view, and the names `aBuf` and `c` are never bound here --
# presumably a method header and a per-character loop are missing; confirm
# against the complete file before changing any of this logic.
#
# If the model's 'keepEnglishLetter' entry is false, the buffer is replaced
# by the result of filter_without_english_letters().
71 if not self
._mModel
['keepEnglishLetter']:
72 aBuf
= self
.filter_without_english_letters(aBuf
)
# Returns the current prober state.
# NOTE(review): the condition guarding this early return is not visible.
75 return self
.get_state()
# Translate character `c` into its order via the model's 'charToOrderMap'.
77 order
= self
._mModel
['charToOrderMap'][wrap_ord(c
)]
# NOTE(review): no statement is visible in the body of this test against
# SYMBOL_CAT_ORDER, nor in the body of the SAMPLE_SIZE test that follows;
# SAMPLE_SIZE itself is not defined in the visible constants.
78 if order
< SYMBOL_CAT_ORDER
:
80 if order
< SAMPLE_SIZE
:
# A pair is only looked up when the previous order is below SAMPLE_SIZE too.
82 if self
._mLastOrder
< SAMPLE_SIZE
:
# Normal direction: flat index = previous order * SAMPLE_SIZE + this order.
84 if not self
._mReversed
:
85 i
= (self
._mLastOrder
* SAMPLE_SIZE
) + order
86 model
= self
._mModel
['precedenceMatrix'][i
]
# Reversed direction: the two orders swap roles in the index computation.
87 else: # reverse the order of the letters in the lookup
88 i
= (order
* SAMPLE_SIZE
) + self
._mLastOrder
89 model
= self
._mModel
['precedenceMatrix'][i
]
# The value read from the precedence matrix selects which counter to bump.
90 self
._mSeqCounters
[model
] += 1
# Remember this order so it can serve as the "previous" one next time.
91 self
._mLastOrder
= order
# While the state is still eDetecting and more than SB_ENOUGH_REL_THRESHOLD
# sequences were counted, use the confidence to settle the state early.
93 if self
.get_state() == constants
.eDetecting
:
94 if self
._mTotalSeqs
> SB_ENOUGH_REL_THRESHOLD
:
95 cf
= self
.get_confidence()
96 if cf
> POSITIVE_SHORTCUT_THRESHOLD
:
# NOTE(review): `sys` is not among the visible imports, and as written the
# string literal below is followed directly by a parenthesised tuple with no
# `%` operator -- part of the message appears to be missing; verify.
98 sys
.stderr
.write('%s confidence = %s, we have a'
100 (self
._mModel
['charsetName'], cf
))
# Confidence above the positive threshold: state becomes eFoundIt.
101 self
._mState
= constants
.eFoundIt
102 elif cf
< NEGATIVE_SHORTCUT_THRESHOLD
:
104 sys
.stderr
.write('%s confidence = %s, below negative'
105 'shortcut threshhold %s\n' %
106 (self
._mModel
['charsetName'], cf
,
107 NEGATIVE_SHORTCUT_THRESHOLD
))
# Confidence below the negative threshold: state becomes eNotMe.
108 self
._mState
= constants
.eNotMe
# Report the (possibly updated) state.
110 return self
.get_state()
112 def get_confidence(self
):
114 if self
._mTotalSeqs
> 0:
115 r
= ((1.0 * self
._mSeqCounters
[POSITIVE_CAT
]) / self
._mTotalSeqs
116 / self
._mModel
['mTypicalPositiveRatio'])
117 r
= r
* self
._mFreqChar
/ self
._mTotalChar