Bringing tree up to date.
[galago.git] / java / galago / src / galago / parse / UniversalParser.java
blob31ef1a4b58823ab6d2bc58b58e70644e88bd521c
1 /*
2 * UniversalParser
4 * March 16, 2007 -- Trevor Strohman
6 * BSD License (http://www.galagosearch.org/license)
7 */
9 package galago.parse;
11 import galago.tupleflow.InputClass;
12 import galago.tupleflow.OutputClass;
13 import galago.tupleflow.StandardStep;
14 import galago.tupleflow.execution.Verified;
15 import galago.types.FileName;
16 import java.io.IOException;
17 import java.util.logging.Logger;
19 /**
21 * @author trevor
23 @Verified
24 @InputClass(className="galago.types.FileName")
25 @OutputClass(className="galago.parse.Document")
26 public class UniversalParser extends StandardStep<FileName, Document> {
27 public void process( FileName fileName ) throws IOException {
28 String collectionFilename = fileName.filename;
29 DocumentStreamParser parser;
31 if( collectionFilename.contains(".arc.") || collectionFilename.endsWith(".arc") ) {
32 parser = new ArcParser( collectionFilename );
33 } else if( collectionFilename.contains(".dat.") || collectionFilename.endsWith(".dat") ) {
34 parser = new TrecTextParser( collectionFilename );
35 } else {
36 parser = new TrecWebParser( collectionFilename );
39 Logger logger = Logger.getLogger(UniversalParser.class.toString());
40 Document document;
41 int count = 0;
43 logger.info( "Starting " + collectionFilename );
45 while( (document = parser.nextDocument()) != null ) {
46 processor.process( document );
48 if( (++count % 5000) == 0 ) {
49 logger.info( "Documents: " + count );
53 logger.info( "Finished (" + count + ") " + collectionFilename );