1 /* ====================================================================
2 Licensed to the Apache Software Foundation (ASF) under one or more
3 contributor license agreements. See the NOTICE file distributed with
4 this work for additional information regarding copyright ownership.
5 The ASF licenses this file to You under the Apache License, Version 2.0
6 (the "License"); you may not use this file except in compliance with
7 the License. You may obtain a copy of the License at
9 http://www.apache.org/licenses/LICENSE-2.0
11 Unless required by applicable law or agreed to in writing, software
12 distributed under the License is distributed on an "AS IS" BASIS,
13 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 See the License for the specific language governing permissions and
15 limitations under the License.
16 ==================================================================== */
17 package org
.apache
.poi
.hssf
.extractor
;
19 import java
.io
.IOException
;
20 import java
.text
.DateFormat
;
21 import java
.text
.DecimalFormat
;
22 import java
.text
.SimpleDateFormat
;
23 import java
.util
.ArrayList
;
24 import java
.util
.Date
;
25 import java
.util
.List
;
27 import org
.apache
.poi
.POIOLE2TextExtractor
;
28 import org
.apache
.poi
.hpsf
.DocumentSummaryInformation
;
29 import org
.apache
.poi
.hpsf
.SummaryInformation
;
30 import org
.apache
.poi
.hssf
.eventusermodel
.FormatTrackingHSSFListener
;
31 import org
.apache
.poi
.hssf
.eventusermodel
.HSSFEventFactory
;
32 import org
.apache
.poi
.hssf
.eventusermodel
.HSSFListener
;
33 import org
.apache
.poi
.hssf
.eventusermodel
.HSSFRequest
;
34 import org
.apache
.poi
.hssf
.model
.FormulaParser
;
35 import org
.apache
.poi
.hssf
.record
.BOFRecord
;
36 import org
.apache
.poi
.hssf
.record
.BoundSheetRecord
;
37 import org
.apache
.poi
.hssf
.record
.CellValueRecordInterface
;
38 import org
.apache
.poi
.hssf
.record
.FormulaRecord
;
39 import org
.apache
.poi
.hssf
.record
.LabelRecord
;
40 import org
.apache
.poi
.hssf
.record
.LabelSSTRecord
;
41 import org
.apache
.poi
.hssf
.record
.NoteRecord
;
42 import org
.apache
.poi
.hssf
.record
.NumberRecord
;
43 import org
.apache
.poi
.hssf
.record
.Record
;
44 import org
.apache
.poi
.hssf
.record
.SSTRecord
;
45 import org
.apache
.poi
.hssf
.record
.StringRecord
;
46 import org
.apache
.poi
.hssf
.usermodel
.HSSFDateUtil
;
47 import org
.apache
.poi
.poifs
.filesystem
.POIFSFileSystem
;
50 * A text extractor for Excel files, that is based
51 * on the hssf eventusermodel api.
52 * It will typically use less memory than
53 * {@link ExcelExtractor}, but may not provide
54 * the same richness of formatting.
55 * Returns the textual content of the file, suitable for
56 * indexing by something like Lucene, but not really
57 * intended for display to the user.
58 * To turn an excel file into a CSV or similar, then see
59 * the XLS2CSVmra example
60 * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra
62 public class EventBasedExcelExtractor
extends POIOLE2TextExtractor
{
63 private POIFSFileSystem fs
;
64 private boolean includeSheetNames
= true;
65 private boolean formulasNotResults
= false;
67 public EventBasedExcelExtractor(POIFSFileSystem fs
) throws IOException
{
73 * Would return the document information metadata for the document,
76 public DocumentSummaryInformation
getDocSummaryInformation() {
77 throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
80 * Would return the summary information metadata for the document,
83 public SummaryInformation
getSummaryInformation() {
84 throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
89 * Should sheet names be included? Default is true
91 public void setIncludeSheetNames(boolean includeSheetNames
) {
92 this.includeSheetNames
= includeSheetNames
;
95 * Should we return the formula itself, and not
96 * the result it produces? Default is false
98 public void setFormulasNotResults(boolean formulasNotResults
) {
99 this.formulasNotResults
= formulasNotResults
;
104 * Retreives the text contents of the file
106 public String
getText() {
109 TextListener tl
= triggerExtraction();
111 text
= tl
.text
.toString();
112 if(! text
.endsWith("\n")) {
115 } catch(IOException e
) {
116 throw new RuntimeException(e
);
122 private TextListener
triggerExtraction() throws IOException
{
123 TextListener tl
= new TextListener();
124 FormatTrackingHSSFListener ft
= new FormatTrackingHSSFListener(tl
);
127 // Register and process
128 HSSFEventFactory factory
= new HSSFEventFactory();
129 HSSFRequest request
= new HSSFRequest();
130 request
.addListenerForAllRecords(ft
);
132 factory
.processWorkbookEvents(request
, fs
);
137 private class TextListener
implements HSSFListener
{
138 private FormatTrackingHSSFListener ft
;
139 private SSTRecord sstRecord
;
141 private List sheetNames
= new ArrayList();
142 private StringBuffer text
= new StringBuffer();
143 private int sheetNum
= -1;
146 private boolean outputNextStringValue
= false;
147 private int nextRow
= -1;
149 public void processRecord(Record record
) {
150 String thisText
= null;
153 switch(record
.getSid()) {
154 case BoundSheetRecord
.sid
:
155 BoundSheetRecord sr
= (BoundSheetRecord
)record
;
156 sheetNames
.add(sr
.getSheetname());
159 BOFRecord bof
= (BOFRecord
)record
;
160 if(bof
.getType() == BOFRecord
.TYPE_WORKSHEET
) {
164 if(includeSheetNames
) {
165 if(text
.length() > 0) text
.append("\n");
166 text
.append(sheetNames
.get(sheetNum
));
171 sstRecord
= (SSTRecord
)record
;
174 case FormulaRecord
.sid
:
175 FormulaRecord frec
= (FormulaRecord
) record
;
176 thisRow
= frec
.getRow();
178 if(formulasNotResults
) {
179 thisText
= FormulaParser
.toFormulaString(null, frec
.getParsedExpression());
181 if(Double
.isNaN( frec
.getValue() )) {
182 // Formula result is a string
183 // This is stored in the next record
184 outputNextStringValue
= true;
185 nextRow
= frec
.getRow();
187 thisText
= formatNumberDateCell(frec
, frec
.getValue());
191 case StringRecord
.sid
:
192 if(outputNextStringValue
) {
193 // String for formula
194 StringRecord srec
= (StringRecord
)record
;
195 thisText
= srec
.getString();
197 outputNextStringValue
= false;
200 case LabelRecord
.sid
:
201 LabelRecord lrec
= (LabelRecord
) record
;
202 thisRow
= lrec
.getRow();
203 thisText
= lrec
.getValue();
205 case LabelSSTRecord
.sid
:
206 LabelSSTRecord lsrec
= (LabelSSTRecord
) record
;
207 thisRow
= lsrec
.getRow();
208 if(sstRecord
== null) {
209 throw new IllegalStateException("No SST record found");
211 thisText
= sstRecord
.getString(lsrec
.getSSTIndex()).toString();
214 NoteRecord nrec
= (NoteRecord
) record
;
215 thisRow
= nrec
.getRow();
216 // TODO: Find object to match nrec.getShapeId()
218 case NumberRecord
.sid
:
219 NumberRecord numrec
= (NumberRecord
) record
;
220 thisRow
= numrec
.getRow();
221 thisText
= formatNumberDateCell(numrec
, numrec
.getValue());
227 if(thisText
!= null) {
228 if(thisRow
!= rowNum
) {
230 if(text
.length() > 0)
235 text
.append(thisText
);
240 * Formats a number or date cell, be that a real number, or the
241 * answer to a formula
243 private String
formatNumberDateCell(CellValueRecordInterface cell
, double value
) {
244 // Get the built in format, if there is one
245 int formatIndex
= ft
.getFormatIndex(cell
);
246 String formatString
= ft
.getFormatString(cell
);
248 if(formatString
== null) {
249 return Double
.toString(value
);
252 if(HSSFDateUtil
.isADateFormat(formatIndex
,formatString
) &&
253 HSSFDateUtil
.isValidExcelDate(value
)) {
254 // Java wants M not m for month
255 formatString
= formatString
.replace('m','M');
256 // Change \- into -, if it's there
257 formatString
= formatString
.replaceAll("\\\\-","-");
260 Date d
= HSSFDateUtil
.getJavaDate(value
, false);
261 DateFormat df
= new SimpleDateFormat(formatString
);
264 if(formatString
== "General") {
265 // Some sort of wierd default
266 return Double
.toString(value
);
269 // Format as a number
270 DecimalFormat df
= new DecimalFormat(formatString
);
271 return df
.format(value
);