Merging the SubLib project with Gnome Subtitles
[gn-sub.git] / src / External / NCharDet / nsPSMDetector.cs
blob6f9b8b00674a5d7be9707859b2a729141649f74f
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Dunn <craig dot dunn at conceptdevelopment dot net>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
37 using System;
namespace org.mozilla.intl.chardet {

    /// <summary>
    /// Abstract base of the charset detector. Every input byte is fed to each
    /// remaining <c>nsVerifier</c>; a verifier that reaches
    /// <c>nsVerifier.eError</c> is dropped from the candidate list, and a
    /// charset name is handed to <see cref="Report"/> as soon as one verifier
    /// reaches <c>nsVerifier.eItsMe</c> or only a single (non-UCS2) candidate
    /// is left. When statistics tables are available, byte-frequency sampling
    /// through <c>nsEUCSampler</c> is used to choose among the candidates
    /// that remain.
    /// </summary>
    public abstract class nsPSMDetector {

        // Language filter values accepted by nsPSMDetector(int) / initVerifiers.
        public const int ALL                 = 0 ;
        public const int JAPANESE            = 1 ;
        public const int CHINESE             = 2 ;
        public const int SIMPLIFIED_CHINESE  = 3 ;
        public const int TRADITIONAL_CHINESE = 4 ;
        public const int KOREAN              = 5 ;

        // Upper bound (exclusive) of the valid language filter values.
        public const int NO_OF_LANGUAGES     = 6 ;
        // Capacity of the per-candidate bookkeeping arrays below.
        public const int MAX_VERIFIERS       = 16 ;

        // Charset names that receive special treatment in DataEnd() and Sample().
        private const string GB18030_CHARSET = "GB18030" ;
        private const string BIG5_CHARSET    = "Big5" ;

        // Candidate verifiers and, index-aligned with them, the optional
        // frequency statistics (null array = no sampling, null entry = no
        // statistics for that verifier).
        nsVerifier[]      mVerifier ;
        nsEUCStatistics[] mStatisticsData ;

        nsEUCSampler mSampler = new nsEUCSampler() ;

        // For live slot j: mItemIdx[j] is the index into mVerifier and
        // mState[j] is the last state returned for that verifier.
        byte[] mState   = new byte[MAX_VERIFIERS] ;
        int[]  mItemIdx = new int[MAX_VERIFIERS] ;

        // Number of live slots right now / number of slots after a Reset().
        int mItems ;
        int mClassItems ;

        protected bool mDone ;
        protected bool mRunSampler ;
        protected bool mClassRunSampler ;

        /// <summary>
        /// Creates a detector that uses the <see cref="ALL"/> verifier set.
        /// </summary>
        public nsPSMDetector() {
            initVerifiers( nsPSMDetector.ALL );
            Reset() ;
        }

        /// <summary>
        /// Creates a detector restricted to the verifier set selected by
        /// <paramref name="langFlag"/>.
        /// </summary>
        /// <param name="langFlag">One of the language constants of this class;
        /// values outside <c>[0, NO_OF_LANGUAGES)</c> fall back to
        /// <see cref="ALL"/>.</param>
        public nsPSMDetector(int langFlag) {
            initVerifiers(langFlag);
            Reset() ;
        }

        /// <summary>
        /// Creates a detector from a caller-supplied verifier set.
        /// </summary>
        /// <param name="aItems">Number of entries of
        /// <paramref name="aVerifierSet"/> to use.</param>
        /// <param name="aVerifierSet">Candidate verifiers.</param>
        /// <param name="aStatisticsSet">Statistics index-aligned with
        /// <paramref name="aVerifierSet"/>, or <c>null</c> to disable
        /// sampling.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="aItems"/> exceeds <see cref="MAX_VERIFIERS"/>.</exception>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="aVerifierSet"/> is null while
        /// <paramref name="aItems"/> is positive.</exception>
        /// <exception cref="ArgumentException">One of the arrays holds fewer
        /// than <paramref name="aItems"/> entries.</exception>
        public nsPSMDetector(int aItems, nsVerifier[] aVerifierSet,
                             nsEUCStatistics[] aStatisticsSet) {
            // Fail early with a descriptive error instead of an
            // IndexOutOfRangeException / NullReferenceException from Reset(),
            // HandleData() or Sample() later on.
            if (aItems > MAX_VERIFIERS) {
                throw new ArgumentOutOfRangeException("aItems", aItems,
                    "At most " + MAX_VERIFIERS + " verifiers are supported.");
            }
            if (aItems > 0) {
                if (aVerifierSet == null) {
                    throw new ArgumentNullException("aVerifierSet");
                }
                if (aVerifierSet.Length < aItems) {
                    throw new ArgumentException(
                        "Verifier set holds fewer entries than aItems.", "aVerifierSet");
                }
                if (aStatisticsSet != null && aStatisticsSet.Length < aItems) {
                    throw new ArgumentException(
                        "Statistics set holds fewer entries than aItems.", "aStatisticsSet");
                }
            }

            mClassRunSampler = ( aStatisticsSet != null ) ;
            mStatisticsData  = aStatisticsSet ;
            mVerifier        = aVerifierSet ;

            mClassItems = aItems ;
            Reset() ;
        }

        /// <summary>
        /// Restores the detector to its freshly constructed state: all
        /// configured verifiers become candidates again with state 0, the
        /// done flag is cleared and the sampler is reset.
        /// </summary>
        public void Reset() {
            mRunSampler = mClassRunSampler ;
            mDone       = false ;
            mItems      = mClassItems ;

            for (int i = 0; i < mItems; i++) {
                mState[i]   = 0 ;
                mItemIdx[i] = i ;
            }

            mSampler.Reset() ;
        }

        /// <summary>
        /// Installs the verifier set (and matching statistics, where the set
        /// has any) for the given language filter and records its size.
        /// </summary>
        /// <param name="currVerSet">One of the language constants; values
        /// outside <c>[0, NO_OF_LANGUAGES)</c> select <see cref="ALL"/>.</param>
        protected void initVerifiers(int currVerSet) {

            int currVerifierSet = nsPSMDetector.ALL ;
            if (currVerSet >= 0 && currVerSet < NO_OF_LANGUAGES) {
                currVerifierSet = currVerSet ;
            }

            mVerifier       = null ;
            mStatisticsData = null ;

            switch (currVerifierSet) {

                case nsPSMDetector.TRADITIONAL_CHINESE:
                    mVerifier = new nsVerifier[] {
                        new nsUTF8Verifier(),
                        new nsBIG5Verifier(),
                        new nsISO2022CNVerifier(),
                        new nsEUCTWVerifier(),
                        new nsCP1252Verifier(),
                        new nsUCS2BEVerifier(),
                        new nsUCS2LEVerifier()
                    };
                    mStatisticsData = new nsEUCStatistics[] {
                        null,
                        new Big5Statistics(),
                        null,
                        new EUCTWStatistics(),
                        null,
                        null,
                        null
                    };
                    break;

                case nsPSMDetector.KOREAN:
                    mVerifier = new nsVerifier[] {
                        new nsUTF8Verifier(),
                        new nsEUCKRVerifier(),
                        new nsISO2022KRVerifier(),
                        new nsCP1252Verifier(),
                        new nsUCS2BEVerifier(),
                        new nsUCS2LEVerifier()
                    };
                    break;

                case nsPSMDetector.SIMPLIFIED_CHINESE:
                    mVerifier = new nsVerifier[] {
                        new nsUTF8Verifier(),
                        new nsGB2312Verifier(),
                        new nsGB18030Verifier(),
                        new nsISO2022CNVerifier(),
                        new nsHZVerifier(),
                        new nsCP1252Verifier(),
                        new nsUCS2BEVerifier(),
                        new nsUCS2LEVerifier()
                    };
                    break;

                case nsPSMDetector.JAPANESE:
                    mVerifier = new nsVerifier[] {
                        new nsUTF8Verifier(),
                        new nsSJISVerifier(),
                        new nsEUCJPVerifier(),
                        new nsISO2022JPVerifier(),
                        new nsCP1252Verifier(),
                        new nsUCS2BEVerifier(),
                        new nsUCS2LEVerifier()
                    };
                    break;

                case nsPSMDetector.CHINESE:
                    mVerifier = new nsVerifier[] {
                        new nsUTF8Verifier(),
                        new nsGB2312Verifier(),
                        new nsGB18030Verifier(),
                        new nsBIG5Verifier(),
                        new nsISO2022CNVerifier(),
                        new nsHZVerifier(),
                        new nsEUCTWVerifier(),
                        new nsCP1252Verifier(),
                        new nsUCS2BEVerifier(),
                        new nsUCS2LEVerifier()
                    };
                    mStatisticsData = new nsEUCStatistics[] {
                        null,
                        new GB2312Statistics(),
                        null,
                        new Big5Statistics(),
                        null,
                        null,
                        new EUCTWStatistics(),
                        null,
                        null,
                        null
                    };
                    break;

                default: // nsPSMDetector.ALL
                    mVerifier = new nsVerifier[] {
                        new nsUTF8Verifier(),
                        new nsSJISVerifier(),
                        new nsEUCJPVerifier(),
                        new nsISO2022JPVerifier(),
                        new nsEUCKRVerifier(),
                        new nsISO2022KRVerifier(),
                        new nsBIG5Verifier(),
                        new nsEUCTWVerifier(),
                        new nsGB2312Verifier(),
                        new nsGB18030Verifier(),
                        new nsISO2022CNVerifier(),
                        new nsHZVerifier(),
                        new nsCP1252Verifier(),
                        new nsUCS2BEVerifier(),
                        new nsUCS2LEVerifier()
                    };
                    mStatisticsData = new nsEUCStatistics[] {
                        null,
                        null,
                        new EUCJPStatistics(),
                        null,
                        new EUCKRStatistics(),
                        null,
                        new Big5Statistics(),
                        new EUCTWStatistics(),
                        new GB2312Statistics(),
                        null,
                        null,
                        null,
                        null,
                        null,
                        null
                    };
                    break;
            }

            mClassRunSampler = ( mStatisticsData != null ) ;
            mClassItems      = mVerifier.Length ;
        }

        /// <summary>
        /// Called with the name of the charset the detector has settled on.
        /// </summary>
        /// <param name="charset">Value of <c>charset()</c> of the chosen
        /// verifier.</param>
        public abstract void Report(String charset) ;

        /// <summary>
        /// Feeds the first <paramref name="len"/> bytes of
        /// <paramref name="aBuf"/> to all remaining verifiers.
        /// </summary>
        /// <param name="aBuf">Input bytes; may be null only when
        /// <paramref name="len"/> is not positive.</param>
        /// <param name="len">Number of bytes of <paramref name="aBuf"/> to
        /// process.</param>
        /// <returns>The done flag: <c>true</c> once detection has finished
        /// (a charset was reported or no candidate is left).</returns>
        /// <exception cref="ArgumentNullException"><paramref name="aBuf"/> is
        /// null while <paramref name="len"/> is positive.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="len"/> exceeds the buffer length.</exception>
        public bool HandleData(byte[] aBuf, int len) {

            // Validate before touching any verifier state so that a bad call
            // cannot leave the detector half-advanced.
            if (len > 0) {
                if (aBuf == null) {
                    throw new ArgumentNullException("aBuf");
                }
                if (len > aBuf.Length) {
                    throw new ArgumentOutOfRangeException("len", len,
                        "Length exceeds the size of the buffer.");
                }
            }

            int i, j;
            byte b, st;

            for (i = 0; i < len; i++) {
                b = aBuf[i] ;

                // j only advances when slot j survives; on elimination the
                // last live slot is moved into j and examined next.
                for (j = 0; j < mItems; ) {
                    st = nsVerifier.getNextState( mVerifier[mItemIdx[j]],
                                                  b, mState[j]) ;

                    if (st == nsVerifier.eItsMe) {
                        ReportSlot(j);
                        return mDone ;
                    } else if (st == nsVerifier.eError) {
                        mItems--;
                        if (j < mItems) {
                            mItemIdx[j] = mItemIdx[mItems];
                            mState[j]   = mState[mItems];
                        }
                    } else {
                        mState[j++] = st ;
                    }
                }

                if (mItems <= 1) {
                    // Either a single survivor (report it) or nothing left.
                    if (1 == mItems) {
                        Report( mVerifier[mItemIdx[0]].charset() );
                    }
                    mDone = true ;
                    return mDone ;
                }

                // Several candidates remain: if exactly one of them is not a
                // UCS2 verifier, that one is reported.
                int nonUCS2Num = 0;
                int nonUCS2Idx = 0;

                for (j = 0; j < mItems; j++) {
                    if (!mVerifier[mItemIdx[j]].isUCS2()) {
                        nonUCS2Num++ ;
                        nonUCS2Idx = j ;
                    }
                }

                if (1 == nonUCS2Num) {
                    ReportSlot(nonUCS2Idx);
                    return mDone ;
                }
            } // End of for( i=0; i < len ...

            if (mRunSampler) {
                Sample(aBuf, len);
            }

            return mDone ;
        }

        /// <summary>
        /// Signals the end of input. If detection is not finished and exactly
        /// two candidates remain of which one is GB18030, the other one is
        /// reported; otherwise, when sampling is still enabled, the sampler
        /// gets a last chance to decide.
        /// </summary>
        public void DataEnd() {

            if (mDone) {
                return ;
            }

            if (mItems == 2) {
                if (IsCharset(0, GB18030_CHARSET)) {
                    ReportSlot(1);
                } else if (IsCharset(1, GB18030_CHARSET)) {
                    ReportSlot(0);
                }
            }

            // Skip the sampler once a charset has been reported above so that
            // Report() is not invoked a second time for the same input.
            if (mRunSampler && !mDone) {
                Sample(null, 0, true);
            }
        }

        /// <summary>
        /// Same as <see cref="Sample(byte[], int, bool)"/> with
        /// <c>aLastChance</c> set to <c>false</c>.
        /// </summary>
        public void Sample(byte[] aBuf, int aLen) {
            Sample(aBuf, aLen, false) ;
        }

        /// <summary>
        /// Passes the buffer to the frequency sampler and, once the sampler
        /// has gathered data and every non-UCS2, non-GB18030 candidate has
        /// statistics, reports the candidate with the lowest score (Big5 is
        /// never scored).
        /// </summary>
        /// <param name="aBuf">Bytes handed to the sampler.</param>
        /// <param name="aLen">Number of bytes handed to the sampler.</param>
        /// <param name="aLastChance">When <c>true</c>, a decision is attempted
        /// as soon as the sampler has some data instead of enough data.</param>
        public void Sample(byte[] aBuf, int aLen, bool aLastChance) {

            // This method is public, so it can be reached on a detector that
            // has no statistics at all (e.g. the KOREAN set). Nothing can be
            // sampled then; switch sampling off instead of dereferencing null.
            if (mStatisticsData == null) {
                mRunSampler = false ;
                return ;
            }

            int possibleCandidateNum = 0;
            int eucNum = 0;
            int j;

            for (j = 0; j < mItems; j++) {
                if (null != mStatisticsData[mItemIdx[j]]) {
                    eucNum++ ;
                }
                if (!mVerifier[mItemIdx[j]].isUCS2() &&
                    !IsCharset(j, GB18030_CHARSET)) {
                    possibleCandidateNum++ ;
                }
            }

            // Sampling only makes sense while more than one candidate with
            // statistics is left.
            mRunSampler = (eucNum > 1) ;
            if (!mRunSampler) {
                return ;
            }

            mRunSampler = mSampler.Sample(aBuf, aLen);

            bool readyToDecide =
                (aLastChance && mSampler.GetSomeData()) || mSampler.EnoughData();
            if (!readyToDecide || eucNum != possibleCandidateNum) {
                return ;
            }

            mSampler.CalFreq();

            int bestIdx = -1;
            int eucCnt = 0;
            float bestScore = 0.0f;

            for (j = 0; j < mItems; j++) {
                nsEUCStatistics stats = mStatisticsData[mItemIdx[j]];
                if (null != stats && !IsCharset(j, BIG5_CHARSET)) {
                    float score = mSampler.GetScore(
                        stats.mFirstByteFreq(),
                        stats.mFirstByteWeight(),
                        stats.mSecondByteFreq(),
                        stats.mSecondByteWeight() );

                    // First scored candidate, or a strictly lower score, wins.
                    if (0 == eucCnt++ || bestScore > score) {
                        bestScore = score;
                        bestIdx = j;
                    }
                }
            }

            if (bestIdx >= 0) {
                ReportSlot(bestIdx);
            }
        }

        /// <summary>
        /// Returns the charset names of all candidates that are still alive.
        /// </summary>
        /// <returns>One name per remaining candidate, or a one-element array
        /// containing <c>"nomatch"</c> when no candidate is left.</returns>
        public String[] getProbableCharsets() {

            if (mItems <= 0) {
                String[] nomatch = new String[1];
                nomatch[0] = "nomatch" ;
                return nomatch ;
            }

            String[] ret = new String[mItems] ;
            for (int i = 0; i < mItems; i++) {
                ret[i] = mVerifier[mItemIdx[i]].charset() ;
            }
            return ret ;
        }

        // True when the verifier in live slot `slot` names the given charset.
        private bool IsCharset(int slot, string name) {
            return mVerifier[mItemIdx[slot]].charset() == name ;
        }

        // Reports the charset of live slot `slot` and marks detection as done.
        private void ReportSlot(int slot) {
            Report( mVerifier[mItemIdx[slot]].charset() );
            mDone = true ;
        }
    }

} // namespace