4 * March 16, 2007 -- Trevor Strohman
6 * BSD License (http://www.galagosearch.org/license)
11 import galago
.tupleflow
.InputClass
;
12 import galago
.tupleflow
.OutputClass
;
13 import galago
.tupleflow
.StandardStep
;
14 import galago
.tupleflow
.execution
.Verified
;
15 import galago
.types
.FileName
;
16 import java
.io
.IOException
;
17 import java
.util
.logging
.Logger
;
24 @InputClass(className
="galago.types.FileName")
25 @OutputClass(className
="galago.parse.Document")
26 public class UniversalParser
extends StandardStep
<FileName
, Document
> {
27 public void process( FileName fileName
) throws IOException
{
28 String collectionFilename
= fileName
.filename
;
29 DocumentStreamParser parser
;
31 if( collectionFilename
.contains(".arc.") || collectionFilename
.endsWith(".arc") ) {
32 parser
= new ArcParser( collectionFilename
);
33 } else if( collectionFilename
.contains(".dat.") || collectionFilename
.endsWith(".dat") ) {
34 parser
= new TrecTextParser( collectionFilename
);
36 parser
= new TrecWebParser( collectionFilename
);
39 Logger logger
= Logger
.getLogger(UniversalParser
.class.toString());
43 logger
.info( "Starting " + collectionFilename
);
45 while( (document
= parser
.nextDocument()) != null ) {
46 processor
.process( document
);
48 if( (++count
% 5000) == 0 ) {
49 logger
.info( "Documents: " + count
);
53 logger
.info( "Finished (" + count
+ ") " + collectionFilename
);