Bringing tree up to date.
[galago.git] / java / galago / src / galago / index / PositionListWriter.java
blob27dfe350747dffbf29e34b7dea558ee6b0b36b91
/*
 * PositionListWriter
 *
 * March 28, 2007 -- Trevor Strohman
 *
 * BSD License (http://www.galagosearch.org/license)
 */
9 package galago.index;
11 import galago.*;
12 import galago.tupleflow.InputClass;
13 import galago.tupleflow.TupleFlowParameters;
14 import galago.tupleflow.Parameters;
15 import galago.tupleflow.execution.ErrorHandler;
16 import galago.tupleflow.execution.Verification;
17 import galago.types.NumberWordPosition;
18 import java.io.DataOutputStream;
19 import java.io.File;
20 import java.io.FileNotFoundException;
21 import java.io.IOException;
22 import java.util.TreeMap;
24 /**
26 * @author trevor
29 @InputClass(className="galago.types.NumberWordPosition", order={"+word", "+document", "+position"})
30 public class PositionListWriter implements NumberWordPosition.WordDocumentPositionOrder.ShreddedProcessor {
31 int blockSize = 32768;
32 byte[] lastWord;
33 long lastPosition = 0;
34 long lastDocument = 0;
36 int skipMinimumBinLength;
37 TreeMap<Integer, Integer> skipLengths;
39 public class PositionsList implements InvertedList {
40 public PositionsList() {
41 documents = new BackedCompressedByteBuffer();
42 counts = new BackedCompressedByteBuffer();
43 positions = new BackedCompressedByteBuffer();
44 header = new BackedCompressedByteBuffer();
47 public void close() throws IOException {
48 int options = 0;
50 if( documents.length() > 0 )
51 counts.add( positionCount );
53 header.add( options );
55 header.add( documentCount );
56 header.add( totalPositionCount );
58 header.add( documents.length() );
59 header.add( counts.length() );
60 header.add( positions.length() );
63 public long dataLength() {
64 long listLength = 0;
66 listLength += header.length();
67 listLength += counts.length();
68 listLength += positions.length();
69 listLength += documents.length();
71 return listLength;
74 public void write( final DataOutputStream output ) throws IOException {
75 header.write(output);
76 header.clear();
78 documents.write(output);
79 documents.clear();
81 counts.write(output);
82 counts.clear();
84 positions.write(output);
85 positions.clear();
88 public byte[] word() {
89 return word;
92 public void setWord( byte[] word ) {
93 this.word = word;
94 this.lastDocument = 0;
95 this.lastPosition = 0;
96 this.totalPositionCount = 0;
97 this.positionCount = 0;
100 public void addDocument( long documentID ) throws IOException {
101 // add the last document's counts
102 if( documents.length() > 0 )
103 counts.add( positionCount );
105 documents.add( documentID - lastDocument );
106 lastDocument = documentID;
108 lastPosition = 0;
109 positionCount = 0;
110 documentCount++;
113 public void addPosition( int position ) throws IOException {
114 positionCount++;
115 totalPositionCount++;
116 positions.add( position - lastPosition );
117 lastPosition = position;
120 private long lastDocument;
121 private int lastPosition;
122 private int positionCount;
123 private int documentCount;
124 private int totalPositionCount;
126 public byte[] word;
127 public BackedCompressedByteBuffer header;
128 public BackedCompressedByteBuffer documents;
129 public BackedCompressedByteBuffer counts;
130 public BackedCompressedByteBuffer positions;
133 long maximumDocumentCount = 0;
134 long maximumDocumentNumber = 0;
136 PositionsList invertedList;
137 DataOutputStream output;
138 long filePosition;
140 InvertedListWriter writer;
141 long documentCount = 0;
142 long collectionLength = 0;
143 Parameters header;
146 * Creates a new instance of BinnedListWriter
148 public PositionListWriter( TupleFlowParameters parameters ) throws FileNotFoundException, IOException {
149 writer = new InvertedListWriter( parameters );
150 header = parameters.getXML();
153 public void processWord( byte[] wordBytes ) throws IOException {
154 if( invertedList != null ) {
155 invertedList.close();
156 writer.add( invertedList );
157 invertedList = null;
160 resetDocumentCount();
162 invertedList = new PositionsList();
163 invertedList.setWord( wordBytes );
165 assert lastWord == null || 0 != Utility.compare( lastWord, wordBytes ) : "Duplicate word";
166 lastWord = wordBytes;
169 public void processDocument( int document ) throws IOException {
170 invertedList.addDocument( document );
171 documentCount++;
172 maximumDocumentNumber = Math.max( document, maximumDocumentNumber );
173 lastDocument = document;
176 public void processPosition( int position ) throws IOException {
177 invertedList.addPosition( position );
180 public void processTuple() {
181 // does nothing
184 private void resetDocumentCount() {
185 maximumDocumentCount = Math.max( documentCount, maximumDocumentCount );
186 documentCount = 0;
189 public void close() throws IOException {
190 if( invertedList != null ) {
191 invertedList.close();
192 writer.add( invertedList );
195 writer.close();
198 public long documentCount() {
199 return maximumDocumentNumber;
202 public long maximumDocumentCount() {
203 return maximumDocumentCount;
206 public static void verify( TupleFlowParameters parameters, ErrorHandler handler ) {
207 if( !parameters.getXML().containsKey( "index" ) ) {
208 handler.addError( "PositionsListWriter requires an 'index' parameter." );
209 return;
212 String index = parameters.getXML().get( "index" );
213 Verification.requireWriteableDirectory( index, handler );