Finish off eventusermodel based Excel Extractor, and update the xls to csv converter...
[poi.git] / src / java / org / apache / poi / hssf / extractor / EventBasedExcelExtractor.java
blob8f3eebb2d35b43fb2c999edf6ab4b0ccae8f630c
1 /* ====================================================================
2 Licensed to the Apache Software Foundation (ASF) under one or more
3 contributor license agreements. See the NOTICE file distributed with
4 this work for additional information regarding copyright ownership.
5 The ASF licenses this file to You under the Apache License, Version 2.0
6 (the "License"); you may not use this file except in compliance with
7 the License. You may obtain a copy of the License at
9 http://www.apache.org/licenses/LICENSE-2.0
11 Unless required by applicable law or agreed to in writing, software
12 distributed under the License is distributed on an "AS IS" BASIS,
13 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 See the License for the specific language governing permissions and
15 limitations under the License.
16 ==================================================================== */
17 package org.apache.poi.hssf.extractor;
19 import java.io.IOException;
20 import java.text.DateFormat;
21 import java.text.DecimalFormat;
22 import java.text.SimpleDateFormat;
23 import java.util.ArrayList;
24 import java.util.Date;
25 import java.util.List;
27 import org.apache.poi.POIOLE2TextExtractor;
28 import org.apache.poi.hpsf.DocumentSummaryInformation;
29 import org.apache.poi.hpsf.SummaryInformation;
30 import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
31 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
32 import org.apache.poi.hssf.eventusermodel.HSSFListener;
33 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
34 import org.apache.poi.hssf.model.FormulaParser;
35 import org.apache.poi.hssf.record.BOFRecord;
36 import org.apache.poi.hssf.record.BoundSheetRecord;
37 import org.apache.poi.hssf.record.CellValueRecordInterface;
38 import org.apache.poi.hssf.record.FormulaRecord;
39 import org.apache.poi.hssf.record.LabelRecord;
40 import org.apache.poi.hssf.record.LabelSSTRecord;
41 import org.apache.poi.hssf.record.NoteRecord;
42 import org.apache.poi.hssf.record.NumberRecord;
43 import org.apache.poi.hssf.record.Record;
44 import org.apache.poi.hssf.record.SSTRecord;
45 import org.apache.poi.hssf.record.StringRecord;
46 import org.apache.poi.hssf.usermodel.HSSFDateUtil;
47 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
49 /**
50 * A text extractor for Excel files, that is based
51 * on the hssf eventusermodel api.
52 * It will typically use less memory than
53 * {@link ExcelExtractor}, but may not provide
54 * the same richness of formatting.
55 * Returns the textual content of the file, suitable for
56 * indexing by something like Lucene, but not really
57 * intended for display to the user.
58 * To turn an excel file into a CSV or similar, then see
59 * the XLS2CSVmra example
60 * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra
62 public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
63 private POIFSFileSystem fs;
64 private boolean includeSheetNames = true;
65 private boolean formulasNotResults = false;
67 public EventBasedExcelExtractor(POIFSFileSystem fs) throws IOException {
68 super(null);
69 this.fs = fs;
72 /**
73 * Would return the document information metadata for the document,
74 * if we supported it
76 public DocumentSummaryInformation getDocSummaryInformation() {
77 throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
79 /**
80 * Would return the summary information metadata for the document,
81 * if we supported it
83 public SummaryInformation getSummaryInformation() {
84 throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
88 /**
89 * Should sheet names be included? Default is true
91 public void setIncludeSheetNames(boolean includeSheetNames) {
92 this.includeSheetNames = includeSheetNames;
94 /**
95 * Should we return the formula itself, and not
96 * the result it produces? Default is false
98 public void setFormulasNotResults(boolean formulasNotResults) {
99 this.formulasNotResults = formulasNotResults;
104 * Retreives the text contents of the file
106 public String getText() {
107 String text = null;
108 try {
109 TextListener tl = triggerExtraction();
111 text = tl.text.toString();
112 if(! text.endsWith("\n")) {
113 text = text + "\n";
115 } catch(IOException e) {
116 throw new RuntimeException(e);
119 return text;
122 private TextListener triggerExtraction() throws IOException {
123 TextListener tl = new TextListener();
124 FormatTrackingHSSFListener ft = new FormatTrackingHSSFListener(tl);
125 tl.ft = ft;
127 // Register and process
128 HSSFEventFactory factory = new HSSFEventFactory();
129 HSSFRequest request = new HSSFRequest();
130 request.addListenerForAllRecords(ft);
132 factory.processWorkbookEvents(request, fs);
134 return tl;
137 private class TextListener implements HSSFListener {
138 private FormatTrackingHSSFListener ft;
139 private SSTRecord sstRecord;
141 private List sheetNames = new ArrayList();
142 private StringBuffer text = new StringBuffer();
143 private int sheetNum = -1;
144 private int rowNum;
146 private boolean outputNextStringValue = false;
147 private int nextRow = -1;
149 public void processRecord(Record record) {
150 String thisText = null;
151 int thisRow = -1;
153 switch(record.getSid()) {
154 case BoundSheetRecord.sid:
155 BoundSheetRecord sr = (BoundSheetRecord)record;
156 sheetNames.add(sr.getSheetname());
157 break;
158 case BOFRecord.sid:
159 BOFRecord bof = (BOFRecord)record;
160 if(bof.getType() == BOFRecord.TYPE_WORKSHEET) {
161 sheetNum++;
162 rowNum = -1;
164 if(includeSheetNames) {
165 if(text.length() > 0) text.append("\n");
166 text.append(sheetNames.get(sheetNum));
169 break;
170 case SSTRecord.sid:
171 sstRecord = (SSTRecord)record;
172 break;
174 case FormulaRecord.sid:
175 FormulaRecord frec = (FormulaRecord) record;
176 thisRow = frec.getRow();
178 if(formulasNotResults) {
179 thisText = FormulaParser.toFormulaString(null, frec.getParsedExpression());
180 } else {
181 if(Double.isNaN( frec.getValue() )) {
182 // Formula result is a string
183 // This is stored in the next record
184 outputNextStringValue = true;
185 nextRow = frec.getRow();
186 } else {
187 thisText = formatNumberDateCell(frec, frec.getValue());
190 break;
191 case StringRecord.sid:
192 if(outputNextStringValue) {
193 // String for formula
194 StringRecord srec = (StringRecord)record;
195 thisText = srec.getString();
196 thisRow = nextRow;
197 outputNextStringValue = false;
199 break;
200 case LabelRecord.sid:
201 LabelRecord lrec = (LabelRecord) record;
202 thisRow = lrec.getRow();
203 thisText = lrec.getValue();
204 break;
205 case LabelSSTRecord.sid:
206 LabelSSTRecord lsrec = (LabelSSTRecord) record;
207 thisRow = lsrec.getRow();
208 if(sstRecord == null) {
209 throw new IllegalStateException("No SST record found");
211 thisText = sstRecord.getString(lsrec.getSSTIndex()).toString();
212 break;
213 case NoteRecord.sid:
214 NoteRecord nrec = (NoteRecord) record;
215 thisRow = nrec.getRow();
216 // TODO: Find object to match nrec.getShapeId()
217 break;
218 case NumberRecord.sid:
219 NumberRecord numrec = (NumberRecord) record;
220 thisRow = numrec.getRow();
221 thisText = formatNumberDateCell(numrec, numrec.getValue());
222 break;
223 default:
224 break;
227 if(thisText != null) {
228 if(thisRow != rowNum) {
229 rowNum = thisRow;
230 if(text.length() > 0)
231 text.append("\n");
232 } else {
233 text.append("\t");
235 text.append(thisText);
240 * Formats a number or date cell, be that a real number, or the
241 * answer to a formula
243 private String formatNumberDateCell(CellValueRecordInterface cell, double value) {
244 // Get the built in format, if there is one
245 int formatIndex = ft.getFormatIndex(cell);
246 String formatString = ft.getFormatString(cell);
248 if(formatString == null) {
249 return Double.toString(value);
250 } else {
251 // Is it a date?
252 if(HSSFDateUtil.isADateFormat(formatIndex,formatString) &&
253 HSSFDateUtil.isValidExcelDate(value)) {
254 // Java wants M not m for month
255 formatString = formatString.replace('m','M');
256 // Change \- into -, if it's there
257 formatString = formatString.replaceAll("\\\\-","-");
259 // Format as a date
260 Date d = HSSFDateUtil.getJavaDate(value, false);
261 DateFormat df = new SimpleDateFormat(formatString);
262 return df.format(d);
263 } else {
264 if(formatString == "General") {
265 // Some sort of wierd default
266 return Double.toString(value);
269 // Format as a number
270 DecimalFormat df = new DecimalFormat(formatString);
271 return df.format(value);