1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the License.
14 * The Original Code is mozilla.org code.
16 * The Initial Developer of the Original Code is
17 * Netscape Communications Corporation.
18 * Portions created by the Initial Developer are Copyright (C) 1998
19 * the Initial Developer. All Rights Reserved.
22 * Craig Dunn <craig dot dunn at conceptdevelopment dot net>
24 * Alternatively, the contents of this file may be used under the terms of
25 * either of the GNU General Public License Version 2 or later (the "GPL"),
26 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
38 namespace org
.mozilla
.intl
.chardet
{
39 //import java.lang.* ;
41 public abstract class nsPSMDetector
{
// Identifiers of the verifier sets that initVerifiers() knows how to
// install. ALL is the value the parameterless constructor passes in.
43 public const int ALL
= 0 ;
44 public const int JAPANESE
= 1 ;
45 public const int CHINESE
= 2 ;
46 public const int SIMPLIFIED_CHINESE
= 3 ;
47 public const int TRADITIONAL_CHINESE
= 4 ;
48 public const int KOREAN
= 5 ;
// Exclusive upper bound for a set identifier: initVerifiers() tests its
// argument against the range 0 .. NO_OF_LANGUAGES - 1.
50 public const int NO_OF_LANGUAGES
= 6 ;
// Length of the mState and mItemIdx arrays allocated below.
51 public const int MAX_VERIFIERS
= 16 ;
// Verifier set in use; assigned by initVerifiers() or by the constructor
// that takes an explicit verifier array.
53 nsVerifier
[] mVerifier
;
// Statistics tables, indexed with the same mItemIdx[j] values as mVerifier
// (see Sample()). initVerifiers() sets this to null before choosing a set,
// and mClassRunSampler records whether it ended up non-null.
54 nsEUCStatistics
[] mStatisticsData
;
// Sampler whose Sample / GetSomeData / EnoughData / GetScore methods are
// called from Sample().
56 nsEUCSampler mSampler
= new nsEUCSampler() ;
// Per-slot state; HandleData() moves mState[mItems] into slot j together
// with the matching mItemIdx entry.
// NOTE(review): presumably the verifier state-machine state for the
// candidate in that slot - confirm.
57 byte[] mState
= new byte[MAX_VERIFIERS
] ;
// Maps a candidate slot j to an index into mVerifier / mStatisticsData.
58 int[] mItemIdx
= new int[MAX_VERIFIERS
] ;
// NOTE(review): mItems and mClassItems are used throughout this class but
// their declarations are not among these fields - confirm where they live.
// NOTE(review): no visible statement assigns mDone; presumably it flags
// that detection has finished - confirm.
63 protected bool mDone
;
// Per-run sampling flag: copied from mClassRunSampler and later updated
// inside Sample().
64 protected bool mRunSampler
;
// True when a statistics table is available (mStatisticsData != null or a
// non-null aStatisticsSet was supplied to the constructor).
65 protected bool mClassRunSampler
;
// Creates a detector that uses the nsPSMDetector.ALL verifier set.
67 public nsPSMDetector() {
68 initVerifiers( nsPSMDetector
.ALL
);
// Creates a detector for the verifier set identified by langFlag.
// langFlag is handed to initVerifiers() unchanged; that method checks it
// against the range 0 .. NO_OF_LANGUAGES - 1.
72 public nsPSMDetector(int langFlag
) {
73 initVerifiers(langFlag
);
// Creates a detector from a caller-supplied verifier set instead of one of
// the sets built by initVerifiers().
//   aItems         - stored in mClassItems (which is later copied to mItems)
//   aVerifierSet   - stored in mVerifier
//   aStatisticsSet - stored in mStatisticsData; when it is null,
//                    mClassRunSampler becomes false
77 public nsPSMDetector(int aItems
, nsVerifier
[] aVerifierSet
,
78 nsEUCStatistics
[] aStatisticsSet
) {
79 mClassRunSampler
= ( aStatisticsSet
!= null ) ;
80 mStatisticsData
= aStatisticsSet
;
81 mVerifier
= aVerifierSet
;
83 mClassItems
= aItems
;
// The statements below copy the class-level settings (mClassRunSampler,
// mClassItems) into the per-run fields (mRunSampler, mItems) and then loop
// over the first mItems slots.
// NOTE(review): the body of that loop is not shown here, and these
// statements look like a separate re-initialisation step rather than part
// of the constructor - confirm where that step begins and ends.
89 mRunSampler
= mClassRunSampler
;
91 mItems
= mClassItems
;
93 for(int i
=0; i
<mItems
; i
++) {
// Installs the verifier set (mVerifier) and, for some sets, the statistics
// tables (mStatisticsData) that belong to the language group currVerSet,
// then derives mClassRunSampler and mClassItems from what was installed.
//   currVerSet - one of the set identifiers (ALL, JAPANESE, CHINESE,
//                SIMPLIFIED_CHINESE, TRADITIONAL_CHINESE, KOREAN)
// NOTE(review): Sample() indexes mStatisticsData with the same mItemIdx
// values it uses for mVerifier, so every statistics array built here has to
// line up position-by-position with its verifier array - confirm the
// entries below against that requirement.
101 protected void initVerifiers(int currVerSet
) {
104 int currVerifierSet
;
// Accept the requested set when it lies in 0 .. NO_OF_LANGUAGES - 1.
106 if (currVerSet
>=0 && currVerSet
< NO_OF_LANGUAGES
) {
107 currVerifierSet
= currVerSet
;
// NOTE(review): presumably the fallback for an out-of-range currVerSet;
// confirm that this assignment sits in an else branch.
110 currVerifierSet
= nsPSMDetector
.ALL
;
// Start without statistics; only some of the branches below install them.
114 mStatisticsData
= null ;
// Traditional Chinese set.
116 if ( currVerifierSet
== nsPSMDetector
.TRADITIONAL_CHINESE
) {
118 mVerifier
= new nsVerifier
[] {
119 new nsUTF8Verifier(),
120 new nsBIG5Verifier(),
121 new nsISO2022CNVerifier(),
122 new nsEUCTWVerifier(),
123 new nsCP1252Verifier(),
124 new nsUCS2BEVerifier(),
125 new nsUCS2LEVerifier()
// Statistics for the Big5 and EUC-TW candidates of this set.
128 mStatisticsData
= new nsEUCStatistics
[] {
130 new Big5Statistics(),
132 new EUCTWStatistics(),
139 //==========================================================
// Korean set; no statistics are assigned in this branch.
140 else if ( currVerifierSet
== nsPSMDetector
.KOREAN
) {
142 mVerifier
= new nsVerifier
[] {
143 new nsUTF8Verifier(),
144 new nsEUCKRVerifier(),
145 new nsISO2022KRVerifier(),
146 new nsCP1252Verifier(),
147 new nsUCS2BEVerifier(),
148 new nsUCS2LEVerifier()
152 //==========================================================
// Simplified Chinese set; no statistics are assigned in this branch.
153 else if ( currVerifierSet
== nsPSMDetector
.SIMPLIFIED_CHINESE
) {
155 mVerifier
= new nsVerifier
[] {
156 new nsUTF8Verifier(),
157 new nsGB2312Verifier(),
158 new nsGB18030Verifier(),
159 new nsISO2022CNVerifier(),
161 new nsCP1252Verifier(),
162 new nsUCS2BEVerifier(),
163 new nsUCS2LEVerifier()
167 //==========================================================
// Japanese set; no statistics are assigned in this branch.
168 else if ( currVerifierSet
== nsPSMDetector
.JAPANESE
) {
170 mVerifier
= new nsVerifier
[] {
171 new nsUTF8Verifier(),
172 new nsSJISVerifier(),
173 new nsEUCJPVerifier(),
174 new nsISO2022JPVerifier(),
175 new nsCP1252Verifier(),
176 new nsUCS2BEVerifier(),
177 new nsUCS2LEVerifier()
180 //==========================================================
// Combined Chinese set (simplified and traditional verifiers together).
181 else if ( currVerifierSet
== nsPSMDetector
.CHINESE
) {
183 mVerifier
= new nsVerifier
[] {
184 new nsUTF8Verifier(),
185 new nsGB2312Verifier(),
186 new nsGB18030Verifier(),
187 new nsBIG5Verifier(),
188 new nsISO2022CNVerifier(),
190 new nsEUCTWVerifier(),
191 new nsCP1252Verifier(),
192 new nsUCS2BEVerifier(),
193 new nsUCS2LEVerifier()
// Statistics for the GB2312, Big5 and EUC-TW candidates of this set.
196 mStatisticsData
= new nsEUCStatistics
[] {
198 new GB2312Statistics(),
200 new Big5Statistics(),
203 new EUCTWStatistics(),
210 //==========================================================
// Full set: every verifier used by the language-specific sets above.
211 else if ( currVerifierSet
== nsPSMDetector
.ALL
) {
213 mVerifier
= new nsVerifier
[] {
214 new nsUTF8Verifier(),
215 new nsSJISVerifier(),
216 new nsEUCJPVerifier(),
217 new nsISO2022JPVerifier(),
218 new nsEUCKRVerifier(),
219 new nsISO2022KRVerifier(),
220 new nsBIG5Verifier(),
221 new nsEUCTWVerifier(),
222 new nsGB2312Verifier(),
223 new nsGB18030Verifier(),
224 new nsISO2022CNVerifier(),
226 new nsCP1252Verifier(),
227 new nsUCS2BEVerifier(),
228 new nsUCS2LEVerifier()
// Statistics for the EUC-JP, EUC-KR, Big5, EUC-TW and GB2312 candidates.
231 mStatisticsData
= new nsEUCStatistics
[] {
234 new EUCJPStatistics(),
236 new EUCKRStatistics(),
238 new Big5Statistics(),
239 new EUCTWStatistics(),
240 new GB2312Statistics(),
// Sampling is only enabled when a statistics table was installed, and the
// candidate count is the number of verifiers in the chosen set.
250 mClassRunSampler
= ( mStatisticsData
!= null ) ;
251 mClassItems
= mVerifier
.Length
;
// Receives a charset name chosen by the detector. Being abstract, it must
// be implemented by every concrete subclass. Within this class it is
// called from HandleData(), DataEnd() and Sample(), always with the value
// returned by a verifier's charset() method.
//   charset - name reported by the selected verifier
255 public abstract void Report(String charset
) ;
// Walks the first len positions of aBuf and, for each position, steps
// every remaining candidate verifier through nsVerifier.getNextState().
// Candidates that report eError are removed; a charset is passed to
// Report() when a verifier reports eItsMe or when the candidates narrow
// down as described in the comments below.
//   aBuf - input bytes
//   len  - number of positions of aBuf to process
// NOTE(review): the return statements are not shown here - confirm what the
// bool result signifies (presumably mDone).
257 public bool HandleData(byte[] aBuf
, int len
) {
263 for( i
=0; i
< len
; i
++) {
// The loop header does not advance j: when slot j is refilled from slot
// mItems (see below) the same slot has to be examined again.
// NOTE(review): the statement that advances j for surviving candidates is
// not shown here - confirm.
266 for (j
=0; j
< mItems
; )
268 st
= nsVerifier
.getNextState( mVerifier
[mItemIdx
[j
]],
271 //System.out.println( "state(0x" + Integer.toHexString(0xFF&b) +") =>"+ Integer.toHexString(st&0xFF)+ " " + mVerifier[mItemIdx[j]].charset());
// A verifier that reaches eItsMe identifies the charset outright.
273 if (st
== nsVerifier
.eItsMe
) {
275 //System.out.println( "eItsMe(0x" + Integer.toHexString(0xFF&b) +") =>"+ mVerifier[mItemIdx[j]].charset());
277 Report( mVerifier
[mItemIdx
[j
]].charset() );
// A verifier that reaches eError is dropped: slot j is overwritten with
// the entry stored at index mItems, keeping mItemIdx and mState in step.
// NOTE(review): this reads index mItems, so mItems must already have been
// decremented before these two copies - confirm.
281 } else if (st
== nsVerifier
.eError
) {
283 //System.out.println( "eNotMe(0x" + Integer.toHexString(0xFF&b) +") =>"+ mVerifier[mItemIdx[j]].charset());
286 mItemIdx
[j
] = mItemIdx
[mItems
];
287 mState
[j
] = mState
[mItems
];
// Reports the candidate in slot 0.
// NOTE(review): presumably guarded by a check that it is the only
// candidate left - confirm.
300 Report( mVerifier
[mItemIdx
[0]].charset() );
// Scan the remaining candidates for those whose isUCS2() is false.
// NOTE(review): both operands of the && below call the same isUCS2()
// method, so the second test is redundant as written.
311 for(j
=0; j
<mItems
; j
++) {
312 if ( (!(mVerifier
[mItemIdx
[j
]].isUCS2())) &&
313 (!(mVerifier
[mItemIdx
[j
]].isUCS2())) )
// Exactly one non-UCS2 candidate left: report it.
320 if (1 == nonUCS2Num
) {
321 Report( mVerifier
[mItemIdx
[nonUCS2Idx
]].charset() );
328 } // End of for( i=0; i < len ...
// Makes a final decision from the candidates in slots 0 and 1 and then
// runs a last-chance sampling pass.
// NOTE(review): any guard conditions around the statements below (for
// example on mDone, mItems or mRunSampler) are not shown here - confirm.
337 public void DataEnd() {
// When one of the first two slots holds "GB18030", the charset of the
// other slot is the one reported.
343 if ((mVerifier
[mItemIdx
[0]].charset()) == ("GB18030")) {
344 Report(mVerifier
[mItemIdx
[1]].charset()) ;
346 } else if ((mVerifier
[mItemIdx
[1]].charset()) == ("GB18030")) {
347 Report(mVerifier
[mItemIdx
[0]].charset()) ;
// No new bytes are supplied; the true argument is aLastChance.
353 Sample(null, 0, true);
// Convenience overload: forwards to Sample(aBuf, aLen, false), i.e. a
// sampling pass with aLastChance set to false.
//   aBuf - bytes to sample
//   aLen - length passed along with aBuf
356 public void Sample(byte[] aBuf
, int aLen
) {
357 Sample(aBuf
, aLen
, false) ;
// Feeds aBuf to mSampler and, once the sampler has enough data (or any
// data at all when aLastChance is true), scores the remaining candidates
// that have statistics and passes the chosen charset to Report().
//   aBuf        - bytes to sample (DataEnd() passes null)
//   aLen        - length passed along with aBuf
//   aLastChance - when true, mSampler.GetSomeData() is sufficient to decide
//                 instead of mSampler.EnoughData()
// NOTE(review): mStatisticsData is indexed without a null check on the
// array itself - confirm callers only get here when it is non-null.
360 public void Sample(byte[] aBuf
, int aLen
, bool aLastChance
)
362 int possibleCandidateNum
= 0;
// First pass over the remaining candidates.
366 for (j
=0; j
< mItems
; j
++) {
// NOTE(review): presumably counts candidates that have a statistics table
// into eucNum; the counting statement is not shown here - confirm.
367 if (null != mStatisticsData
[mItemIdx
[j
]])
// Candidates that are neither UCS2 nor "GB18030" count as possible.
369 if ((!mVerifier
[mItemIdx
[j
]].isUCS2()) &&
370 (!(mVerifier
[mItemIdx
[j
]].charset() == "GB18030")))
371 possibleCandidateNum
++ ;
// Sampling is enabled only when eucNum is greater than 1.
374 mRunSampler
= (eucNum
> 1) ;
// The sampler's own return value then replaces the flag.
377 mRunSampler
= mSampler
.Sample(aBuf
, aLen
);
// Decide only when the sampler has data and every possible candidate has a
// statistics table (eucNum == possibleCandidateNum).
378 if(((aLastChance
&& mSampler
.GetSomeData()) ||
379 mSampler
.EnoughData())
380 && (eucNum
== possibleCandidateNum
)) {
385 float bestScore
= 0.0f
;
// Score every candidate that has statistics, skipping "Big5".
386 for(j
= 0; j
< mItems
; j
++) {
387 if((null != mStatisticsData
[mItemIdx
[j
]]) &&
388 (!(mVerifier
[mItemIdx
[j
]].charset() == "Big5")))
390 float score
= mSampler
.GetScore(
391 mStatisticsData
[mItemIdx
[j
]].mFirstByteFreq(),
392 mStatisticsData
[mItemIdx
[j
]].mFirstByteWeight(),
393 mStatisticsData
[mItemIdx
[j
]].mSecondByteFreq(),
394 mStatisticsData
[mItemIdx
[j
]].mSecondByteWeight() );
395 //System.out.println("FequencyScore("+mVerifier[mItemIdx[j]].charset()+")= "+ score);
// The first scored candidate always satisfies the test (0 == eucCnt++);
// after that only a strictly lower score does, so the lowest score wins.
// NOTE(review): the body of this if is not shown here; presumably it
// records score and j as bestScore and bestIdx - confirm.
396 if(( 0 == eucCnt
++) || (bestScore
> score
)) {
399 } // if(( 0 == eucCnt++) || (bestScore > score ))
// Report the candidate selected by the scoring loop.
404 Report( mVerifier
[mItemIdx
[bestIdx
]].charset());
407 } // if (eucNum == possibleCandidateNum)
411 public String
[] getProbableCharsets() {
414 String
[] nomatch
= new String
[1];
415 nomatch
[0] = "nomatch" ;
419 String
[] ret
= new String
[mItems
] ;
420 for (int i
=0; i
<mItems
; i
++)
421 ret
[i
] = mVerifier
[mItemIdx
[i
]].charset() ;