2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org
.apache
.tika
.parser
.microsoft
;
19 import java
.awt
.Point
;
20 import java
.io
.IOException
;
21 import java
.io
.Serializable
;
22 import java
.util
.ArrayList
;
23 import java
.util
.Comparator
;
24 import java
.util
.List
;
26 import java
.util
.SortedMap
;
27 import java
.util
.TreeMap
;
29 import org
.apache
.commons
.logging
.Log
;
30 import org
.apache
.commons
.logging
.LogFactory
;
31 import org
.apache
.poi
.hssf
.eventusermodel
.HSSFEventFactory
;
32 import org
.apache
.poi
.hssf
.eventusermodel
.HSSFListener
;
33 import org
.apache
.poi
.hssf
.eventusermodel
.HSSFRequest
;
34 import org
.apache
.poi
.hssf
.record
.BOFRecord
;
35 import org
.apache
.poi
.hssf
.record
.BoundSheetRecord
;
36 import org
.apache
.poi
.hssf
.record
.CellValueRecordInterface
;
37 import org
.apache
.poi
.hssf
.record
.CountryRecord
;
38 import org
.apache
.poi
.hssf
.record
.DateWindow1904Record
;
39 import org
.apache
.poi
.hssf
.record
.EOFRecord
;
40 import org
.apache
.poi
.hssf
.record
.ExtendedFormatRecord
;
41 import org
.apache
.poi
.hssf
.record
.FormatRecord
;
42 import org
.apache
.poi
.hssf
.record
.FormulaRecord
;
43 //import org.apache.poi.hssf.record.HyperlinkRecord; // FIXME - requires POI release
44 import org
.apache
.poi
.hssf
.record
.LabelRecord
;
45 import org
.apache
.poi
.hssf
.record
.LabelSSTRecord
;
46 import org
.apache
.poi
.hssf
.record
.NumberRecord
;
47 import org
.apache
.poi
.hssf
.record
.RKRecord
;
48 import org
.apache
.poi
.hssf
.record
.Record
;
49 import org
.apache
.poi
.hssf
.record
.SSTRecord
;
50 import org
.apache
.poi
.poifs
.filesystem
.DocumentInputStream
;
51 import org
.apache
.poi
.poifs
.filesystem
.POIFSFileSystem
;
52 import org
.apache
.tika
.sax
.XHTMLContentHandler
;
53 import org
.xml
.sax
.SAXException
;
56 * Excel parser implementation which uses POI's Event API
57 * to handle the contents of a Workbook.
59 * The Event API uses a much smaller memory footprint than
60 * <code>HSSFWorkbook</code> when processing excel files
61 * but at the cost of more complexity.
63 * With the Event API a <i>listener</i> is registered for
64 * specific record types and those records are created,
65 * fired off to the listener and then discarded as the stream
68 * @see org.apache.poi.hssf.eventusermodel.HSSFListener
69 * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
70 * POI Event API How To</a>
72 public class ExcelExtractor
{
74 /** Logging instance */
75 private static final Log log
= LogFactory
.getLog(ExcelExtractor
.class);
78 * <code>true</code> if the HSSFListener should be registered
79 * to listen for all records or <code>false</code> (the default)
80 * if the listener should be configured to only receive specified
83 private boolean listenForAllRecords
= false;
86 * Returns <code>true</code> if this parser is configured to listen
87 * for all records instead of just the specified few.
89 public boolean isListenForAllRecords() {
90 return listenForAllRecords
;
94 * Specifies whether this parser should to listen for all
95 * records or just for the specified few.
97 * <strong>Note:</strong> Under normal operation this setting should
98 * be <code>false</code> (the default), but you can experiment with
99 * this setting for testing and debugging purposes.
101 * @param listenForAllRecords <code>true</code> if the HSSFListener
102 * should be registered to listen for all records or <code>false</code>
103 * if the listener should be configured to only receive specified records.
105 public void setListenForAllRecords(boolean listenForAllRecords
) {
106 this.listenForAllRecords
= listenForAllRecords
;
110 * Extracts text from an Excel Workbook writing the extracted content
111 * to the specified {@link Appendable}.
113 * @param filesystem POI file system
114 * @throws IOException if an error occurs processing the workbook
115 * or writing the extracted content
117 protected void parse(POIFSFileSystem filesystem
, XHTMLContentHandler xhtml
)
118 throws IOException
, SAXException
{
119 log
.debug("Starting listenForAllRecords=" + listenForAllRecords
);
121 // Set up listener and register the records we want to process
122 TikaHSSFListener listener
= new TikaHSSFListener(xhtml
);
123 HSSFRequest hssfRequest
= new HSSFRequest();
124 if (listenForAllRecords
) {
125 hssfRequest
.addListenerForAllRecords(listener
);
127 hssfRequest
.addListener(listener
, BOFRecord
.sid
);
128 hssfRequest
.addListener(listener
, EOFRecord
.sid
);
129 hssfRequest
.addListener(listener
, DateWindow1904Record
.sid
);
130 hssfRequest
.addListener(listener
, CountryRecord
.sid
);
131 hssfRequest
.addListener(listener
, BoundSheetRecord
.sid
);
132 hssfRequest
.addListener(listener
, FormatRecord
.sid
);
133 hssfRequest
.addListener(listener
, ExtendedFormatRecord
.sid
);
134 hssfRequest
.addListener(listener
, SSTRecord
.sid
);
135 hssfRequest
.addListener(listener
, FormulaRecord
.sid
);
136 hssfRequest
.addListener(listener
, LabelRecord
.sid
);
137 hssfRequest
.addListener(listener
, LabelSSTRecord
.sid
);
138 hssfRequest
.addListener(listener
, NumberRecord
.sid
);
139 hssfRequest
.addListener(listener
, RKRecord
.sid
);
140 //hssfRequest.addListener(listener, HyperlinkRecord.sid); // FIXME - requires POI release
143 // Create event factory and process Workbook (fire events)
144 DocumentInputStream documentInputStream
= filesystem
.createDocumentInputStream("Workbook");
145 HSSFEventFactory eventFactory
= new HSSFEventFactory();
147 eventFactory
.processEvents(hssfRequest
, documentInputStream
);
148 listener
.throwStoredException();
151 // ======================================================================
154 * HSSF Listener implementation which processes the HSSF records.
156 private static class TikaHSSFListener
implements HSSFListener
, Serializable
{
158 private final XHTMLContentHandler handler
;
160 private SAXException exception
;
162 private SSTRecord sstRecord
;
163 private List
<String
> sheetNames
= new ArrayList
<String
>();
164 private short currentSheetIndex
;
166 private boolean insideWorksheet
= false;
168 private SortedMap
<Point
, Cell
> currentSheet
=
169 new TreeMap
<Point
, Cell
>(new Comparator
<Point
> () {
170 public int compare(Point a
, Point b
) {
171 int diff
= a
.y
- b
.y
;
180 * Contstruct a new listener instance outputting parsed data to
181 * the specified XHTML content handler.
183 * @param handler Destination to write the parsed output to
185 private TikaHSSFListener(XHTMLContentHandler handler
) {
186 this.handler
= handler
;
187 this.exception
= null;
191 * Process a HSSF record.
193 * @param record HSSF Record
195 public void processRecord(Record record
) {
197 if (log
.isDebugEnabled()) {
198 log
.debug(record
.toString());
200 internalProcessRecord(record
);
201 } catch (SAXException e
) {
202 if (exception
== null) {
208 public void throwStoredException() throws SAXException
{
209 if (exception
!= null) {
214 private void internalProcessRecord(Record record
) throws SAXException
{
215 switch (record
.getSid()) {
217 /* BOFRecord: indicates start of workbook, worksheet etc. records */
219 switch (((BOFRecord
) record
).getType()) {
220 case BOFRecord
.TYPE_WORKBOOK
:
221 currentSheetIndex
= -1;
223 case BOFRecord
.TYPE_WORKSHEET
:
225 currentSheet
.clear();
226 insideWorksheet
= true;
231 /* EOFRecord: indicates end of workbook, worksheet etc. records */
233 // ignore empty sheets
234 if (insideWorksheet
&& !currentSheet
.isEmpty()) {
237 insideWorksheet
= false;
240 /* SSTRecord: holds all the strings for LabelSSTRecords */
242 sstRecord
= (SSTRecord
)record
;
245 /* BoundSheetRecord: Worksheet index record */
246 case BoundSheetRecord
.sid
:
247 BoundSheetRecord boundSheetRecord
= (BoundSheetRecord
)record
;
248 String sheetName
= boundSheetRecord
.getSheetname();
249 sheetNames
.add(sheetName
);
252 // FIXME - requires POI release
253 ///* HyperlinkRecord: holds a URL associated with a cell */
254 //case HyperlinkRecord.sid:
255 // HyperlinkRecord hyperlinkRecord = (HyperlinkRecord)record;
256 // if (insideWorksheet) {
257 // int row = hyperlinkRecord.getFirstRow();
258 // short column = hyperlinkRecord.getFirstColumn();
259 // Point point = new Point(column, row);
260 // Cell cell = currentSheet.get(point);
261 // if (cell != null) {
262 // cell = new LinkedCell(cell, hyperlinkRecord.getAddress());
263 // currentSheet.put(point, cell);
270 && record
instanceof CellValueRecordInterface
) {
271 CellValueRecordInterface value
=
272 (CellValueRecordInterface
) record
;
273 Cell cell
= getCellValue(record
.getSid(), value
);
276 new Point(value
.getColumn(), value
.getRow());
277 currentSheet
.put(point
, cell
);
285 * Process a Cell Value record.
287 * @param sid record type identifier
288 * @param record The cell value record
290 private Cell
getCellValue(
291 short sid
, CellValueRecordInterface record
)
292 throws SAXException
{
296 /* FormulaRecord: Cell value from a formula */
297 case FormulaRecord
.sid
:
298 return new NumberCell(((FormulaRecord
) record
).getValue());
300 /* LabelRecord: strings stored directly in the cell */
301 case LabelRecord
.sid
:
302 text
= ((LabelRecord
)record
).getValue();
305 /* LabelSSTRecord: Ref. a string in the shared string table */
306 case LabelSSTRecord
.sid
:
307 LabelSSTRecord labelSSTRecord
= (LabelSSTRecord
) record
;
308 int sstIndex
= labelSSTRecord
.getSSTIndex();
309 text
= sstRecord
.getString(sstIndex
).getString();
312 /* NumberRecord: Contains a numeric cell value */
313 case NumberRecord
.sid
:
314 return new NumberCell(((NumberRecord
) record
).getValue());
316 /* RKRecord: Excel internal number record */
318 return new NumberCell(((RKRecord
)record
).getRKNumber());
323 if (text
!= null && text
.length() > 0) {
324 return new TextCell(text
);
330 * Process an excel sheet.
332 * @throws SAXException if an error occurs
334 private void processSheet() throws SAXException
{
336 handler
.startElement("div", "class", "page");
337 if (currentSheetIndex
< sheetNames
.size()) {
338 handler
.element("h1", sheetNames
.get(currentSheetIndex
));
340 handler
.characters("\n");
341 handler
.startElement("table");
342 handler
.startElement("tbody");
346 int currentColumn
= 1;
347 handler
.startElement("tr");
348 handler
.startElement("td");
349 for (Map
.Entry
<Point
, Cell
> entry
: currentSheet
.entrySet()) {
350 while (currentRow
< entry
.getKey().y
) {
351 handler
.endElement("td");
352 handler
.endElement("tr");
353 handler
.characters("\n");
354 handler
.startElement("tr");
355 handler
.startElement("td");
360 while (currentColumn
< entry
.getKey().x
) {
361 handler
.endElement("td");
362 handler
.characters("\t");
363 handler
.startElement("td");
367 entry
.getValue().render(handler
);
369 handler
.endElement("td");
370 handler
.endElement("tr");
373 handler
.endElement("tbody");
374 handler
.endElement("table");
375 handler
.endElement("div");
376 handler
.characters("\n");