4 * March 28, 2007 -- Trevor Strohman
6 * BSD License (http://www.galagosearch.org/license)
12 import galago
.tupleflow
.InputClass
;
13 import galago
.tupleflow
.TupleFlowParameters
;
14 import galago
.tupleflow
.Parameters
;
15 import galago
.tupleflow
.execution
.ErrorHandler
;
16 import galago
.tupleflow
.execution
.Verification
;
17 import galago
.types
.NumberWordPosition
;
18 import java
.io
.DataOutputStream
;
20 import java
.io
.FileNotFoundException
;
21 import java
.io
.IOException
;
22 import java
.util
.TreeMap
;
29 @InputClass(className
="galago.types.NumberWordPosition", order
={"+word", "+document", "+position"})
30 public class PositionListWriter
implements NumberWordPosition
.WordDocumentPositionOrder
.ShreddedProcessor
{
// Block-size setting, initialised to 32768. It is not referenced anywhere in
// the visible part of this file; units are unknown from here (TODO confirm).
31 int blockSize
= 32768;
// Writer-level position tracker. Not referenced in the visible code: the
// nested PositionsList keeps its own private lastPosition for its deltas.
33 long lastPosition
= 0;
// Most recent document number passed to processDocument().
34 long lastDocument
= 0;
// Skip-related settings. Neither field is assigned or read in the visible
// code. NOTE(review): possibly unused; confirm against the full file.
36 int skipMinimumBinLength
;
37 TreeMap
<Integer
, Integer
> skipLengths
;
39 public class PositionsList
implements InvertedList
{
/**
 * Creates an empty list. Each of the four sections (documents, counts,
 * positions and header) gets its own fresh BackedCompressedByteBuffer.
 */
public PositionsList() {
    this.documents = new BackedCompressedByteBuffer();
    this.counts = new BackedCompressedByteBuffer();
    this.positions = new BackedCompressedByteBuffer();
    this.header = new BackedCompressedByteBuffer();
}
// Finalises this list before it is handed to the writer.
// If documents.length() is greater than 0, the pending positionCount of the
// last document is appended to counts (addDocument() only flushes the count
// of the previous document). The header then receives, in this order:
// options, documentCount, totalPositionCount, and the current lengths of the
// documents, counts and positions buffers.
// NOTE(review): `options` is not declared in the lines visible here;
// presumably a local defined on a line this listing omits. Confirm.
47 public void close() throws IOException
{
50 if( documents
.length() > 0 )
51 counts
.add( positionCount
);
53 header
.add( options
);
55 header
.add( documentCount
);
56 header
.add( totalPositionCount
);
58 header
.add( documents
.length() );
59 header
.add( counts
.length() );
60 header
.add( positions
.length() );
// Accumulates the lengths of the header, counts, positions and documents
// buffers into listLength.
// NOTE(review): neither the declaration of listLength nor the return
// statement is visible in this listing; presumably it starts at 0 and the
// sum is returned. Confirm against the full file.
63 public long dataLength() {
66 listLength
+= header
.length();
67 listLength
+= counts
.length();
68 listLength
+= positions
.length();
69 listLength
+= documents
.length();
// Writes this list's buffers to the given stream. The visible lines emit
// the documents buffer followed by the positions buffer.
// NOTE(review): header and counts are filled by close() and included in
// dataLength(), but no write of either is visible here. The listing appears
// to omit lines in this method; confirm that all four buffers are written
// in the full file.
74 public void write( final DataOutputStream output
) throws IOException
{
78 documents
.write(output
);
84 positions
.write(output
);
// Returns a byte array for this list. The body is not visible in this
// listing; presumably it returns the bytes given to setWord(). TODO confirm.
88 public byte[] word() {
// Prepares this list for a new word: lastDocument, lastPosition,
// totalPositionCount and positionCount are all reset to 0.
// documentCount is not reset here; processWord() creates a fresh
// PositionsList for every word, so it starts at its default of 0.
// NOTE(review): no visible line stores the `word` argument; presumably that
// happens on a line this listing omits. Confirm.
92 public void setWord( byte[] word
) {
94 this.lastDocument
= 0;
95 this.lastPosition
= 0;
96 this.totalPositionCount
= 0;
97 this.positionCount
= 0;
// Starts a new document entry in this list.
// If documents.length() is greater than 0, the previous document's
// positionCount is appended to counts first. The document is then stored as
// the difference from lastDocument, and lastDocument is updated.
// NOTE(review): no visible line resets positionCount or lastPosition, or
// increments documentCount (which close() writes to the header). The
// embedded numbering jumps from 106 to 113, so those statements are
// probably on omitted lines. Confirm, since without them counts and
// position deltas would carry over between documents.
100 public void addDocument( long documentID
) throws IOException
{
101 // add the last document's counts
102 if( documents
.length() > 0 )
103 counts
.add( positionCount
);
105 documents
.add( documentID
- lastDocument
);
106 lastDocument
= documentID
;
// Records one occurrence of the word in the current document:
// totalPositionCount is incremented, the position is stored as the
// difference from lastPosition, and lastPosition is updated.
// NOTE(review): positionCount is written to counts by addDocument() and
// close(), but no increment of it is visible here. Embedded line 114 is
// missing from this listing; confirm it holds `positionCount++`.
113 public void addPosition( int position
) throws IOException
{
115 totalPositionCount
++;
116 positions
.add( position
- lastPosition
);
117 lastPosition
= position
;
// Last document ID passed to addDocument(); used to store document deltas.
120 private long lastDocument
;
// Last position passed to addPosition(); used to store position deltas.
121 private int lastPosition
;
// Count appended to counts by addDocument() and close().
122 private int positionCount
;
// Written to the header by close(); no update is visible in this listing.
123 private int documentCount
;
// Incremented by every addPosition() call; written to the header by close().
124 private int totalPositionCount
;
// The four sections of the list, each allocated in the constructor.
// header is filled by close(); the others by addDocument()/addPosition().
127 public BackedCompressedByteBuffer header
;
128 public BackedCompressedByteBuffer documents
;
129 public BackedCompressedByteBuffer counts
;
130 public BackedCompressedByteBuffer positions
;
// Largest value of documentCount seen by resetDocumentCount(); returned by
// maximumDocumentCount().
133 long maximumDocumentCount
= 0;
// Largest document number passed to processDocument(); returned by
// documentCount().
134 long maximumDocumentNumber
= 0;
// List being built for the current word; null until the first processWord().
136 PositionsList invertedList
;
// Not referenced in the visible part of this file.
137 DataOutputStream output
;
// Created in the constructor; receives each finished PositionsList.
140 InvertedListWriter writer
;
// Read by resetDocumentCount(); no update is visible in this listing.
141 long documentCount
= 0;
// Not referenced in the visible part of this file.
142 long collectionLength
= 0;
146 * Creates a new instance of PositionListWriter
148 public PositionListWriter( TupleFlowParameters parameters
) throws FileNotFoundException
, IOException
{
149 writer
= new InvertedListWriter( parameters
);
150 header
= parameters
.getXML();
// Called when a new word starts.
// If a list is already in progress it is closed and passed to writer.add().
// resetDocumentCount() is called, a new PositionsList is created for
// wordBytes, and the assertion checks that wordBytes does not compare equal
// (via Utility.compare) to the previous word before lastWord is updated.
// NOTE(review): the closing brace of the `if` is not visible, so whether
// resetDocumentCount() sits inside or after it cannot be told from this
// listing. The assertion is only active when the JVM runs with -ea.
153 public void processWord( byte[] wordBytes
) throws IOException
{
154 if( invertedList
!= null ) {
155 invertedList
.close();
156 writer
.add( invertedList
);
160 resetDocumentCount();
162 invertedList
= new PositionsList();
163 invertedList
.setWord( wordBytes
);
165 assert lastWord
== null || 0 != Utility
.compare( lastWord
, wordBytes
) : "Duplicate word";
166 lastWord
= wordBytes
;
// Called for each document of the current word: adds the document to the
// current list, raises maximumDocumentNumber if this number is larger, and
// remembers the number in lastDocument.
// Assumes processWord() has already run, since invertedList is dereferenced
// without a null check.
// NOTE(review): embedded line 171 is missing from this listing; it may hold
// the increment of documentCount that resetDocumentCount() relies on.
// Confirm.
169 public void processDocument( int document
) throws IOException
{
170 invertedList
.addDocument( document
);
172 maximumDocumentNumber
= Math
.max( document
, maximumDocumentNumber
);
173 lastDocument
= document
;
176 public void processPosition( int position
) throws IOException
{
177 invertedList
.addPosition( position
);
// Per-tuple callback. The body is not visible in this listing; presumably
// empty, since word, document and position are handled by the other
// process* methods. TODO confirm.
180 public void processTuple() {
// Raises maximumDocumentCount to documentCount when the latter is larger.
// NOTE(review): despite the method name, no reset of documentCount is
// visible. Embedded line 186 is missing from this listing and presumably
// sets it back to 0. Confirm.
184 private void resetDocumentCount() {
185 maximumDocumentCount
= Math
.max( documentCount
, maximumDocumentCount
);
// Flushes the last word: if a list is in progress it is closed and passed
// to writer.add().
// NOTE(review): no call that closes `writer` itself is visible; the
// embedded numbering skips 193-197, so it may be on an omitted line.
// Confirm, otherwise the underlying output is never finalised.
189 public void close() throws IOException
{
190 if( invertedList
!= null ) {
191 invertedList
.close();
192 writer
.add( invertedList
);
198 public long documentCount() {
199 return maximumDocumentNumber
;
202 public long maximumDocumentCount() {
203 return maximumDocumentCount
;
206 public static void verify( TupleFlowParameters parameters
, ErrorHandler handler
) {
207 if( !parameters
.getXML().containsKey( "index" ) ) {
208 handler
.addError( "PositionsListWriter requires an 'index' parameter." );
212 String index
= parameters
.getXML().get( "index" );
213 Verification
.requireWriteableDirectory( index
, handler
);