TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support
[tika.git] / src / main / java / org / apache / tika / parser / microsoft / ExcelExtractor.java
blob350805d0ed105b629067eb8ad483abe2a5b51738
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.parser.microsoft;
19 import java.awt.Point;
20 import java.io.IOException;
21 import java.io.Serializable;
22 import java.util.ArrayList;
23 import java.util.Comparator;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.SortedMap;
27 import java.util.TreeMap;
29 import org.apache.commons.logging.Log;
30 import org.apache.commons.logging.LogFactory;
31 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
32 import org.apache.poi.hssf.eventusermodel.HSSFListener;
33 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
34 import org.apache.poi.hssf.record.BOFRecord;
35 import org.apache.poi.hssf.record.BoundSheetRecord;
36 import org.apache.poi.hssf.record.CellValueRecordInterface;
37 import org.apache.poi.hssf.record.CountryRecord;
38 import org.apache.poi.hssf.record.DateWindow1904Record;
39 import org.apache.poi.hssf.record.EOFRecord;
40 import org.apache.poi.hssf.record.ExtendedFormatRecord;
41 import org.apache.poi.hssf.record.FormatRecord;
42 import org.apache.poi.hssf.record.FormulaRecord;
43 //import org.apache.poi.hssf.record.HyperlinkRecord; // FIXME - requires POI release
44 import org.apache.poi.hssf.record.LabelRecord;
45 import org.apache.poi.hssf.record.LabelSSTRecord;
46 import org.apache.poi.hssf.record.NumberRecord;
47 import org.apache.poi.hssf.record.RKRecord;
48 import org.apache.poi.hssf.record.Record;
49 import org.apache.poi.hssf.record.SSTRecord;
50 import org.apache.poi.poifs.filesystem.DocumentInputStream;
51 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
52 import org.apache.tika.sax.XHTMLContentHandler;
53 import org.xml.sax.SAXException;
55 /**
56 * Excel parser implementation which uses POI's Event API
57 * to handle the contents of a Workbook.
58 * <p>
59 * The Event API uses a much smaller memory footprint than
60 * <code>HSSFWorkbook</code> when processing excel files
61 * but at the cost of more complexity.
62 * <p>
63 * With the Event API a <i>listener</i> is registered for
64 * specific record types and those records are created,
65 * fired off to the listener and then discarded as the stream
66 * is being processed.
68 * @see org.apache.poi.hssf.eventusermodel.HSSFListener
69 * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
70 * POI Event API How To</a>
72 public class ExcelExtractor {
74 /** Logging instance */
75 private static final Log log = LogFactory.getLog(ExcelExtractor.class);
77 /**
78 * <code>true</code> if the HSSFListener should be registered
79 * to listen for all records or <code>false</code> (the default)
80 * if the listener should be configured to only receive specified
81 * records.
83 private boolean listenForAllRecords = false;
85 /**
86 * Returns <code>true</code> if this parser is configured to listen
87 * for all records instead of just the specified few.
89 public boolean isListenForAllRecords() {
90 return listenForAllRecords;
93 /**
94 * Specifies whether this parser should to listen for all
95 * records or just for the specified few.
96 * <p>
97 * <strong>Note:</strong> Under normal operation this setting should
98 * be <code>false</code> (the default), but you can experiment with
99 * this setting for testing and debugging purposes.
101 * @param listenForAllRecords <code>true</code> if the HSSFListener
102 * should be registered to listen for all records or <code>false</code>
103 * if the listener should be configured to only receive specified records.
105 public void setListenForAllRecords(boolean listenForAllRecords) {
106 this.listenForAllRecords = listenForAllRecords;
110 * Extracts text from an Excel Workbook writing the extracted content
111 * to the specified {@link Appendable}.
113 * @param filesystem POI file system
114 * @throws IOException if an error occurs processing the workbook
115 * or writing the extracted content
117 protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
118 throws IOException, SAXException {
119 log.debug("Starting listenForAllRecords=" + listenForAllRecords);
121 // Set up listener and register the records we want to process
122 TikaHSSFListener listener = new TikaHSSFListener(xhtml);
123 HSSFRequest hssfRequest = new HSSFRequest();
124 if (listenForAllRecords) {
125 hssfRequest.addListenerForAllRecords(listener);
126 } else {
127 hssfRequest.addListener(listener, BOFRecord.sid);
128 hssfRequest.addListener(listener, EOFRecord.sid);
129 hssfRequest.addListener(listener, DateWindow1904Record.sid);
130 hssfRequest.addListener(listener, CountryRecord.sid);
131 hssfRequest.addListener(listener, BoundSheetRecord.sid);
132 hssfRequest.addListener(listener, FormatRecord.sid);
133 hssfRequest.addListener(listener, ExtendedFormatRecord.sid);
134 hssfRequest.addListener(listener, SSTRecord.sid);
135 hssfRequest.addListener(listener, FormulaRecord.sid);
136 hssfRequest.addListener(listener, LabelRecord.sid);
137 hssfRequest.addListener(listener, LabelSSTRecord.sid);
138 hssfRequest.addListener(listener, NumberRecord.sid);
139 hssfRequest.addListener(listener, RKRecord.sid);
140 //hssfRequest.addListener(listener, HyperlinkRecord.sid); // FIXME - requires POI release
143 // Create event factory and process Workbook (fire events)
144 DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
145 HSSFEventFactory eventFactory = new HSSFEventFactory();
147 eventFactory.processEvents(hssfRequest, documentInputStream);
148 listener.throwStoredException();
151 // ======================================================================
154 * HSSF Listener implementation which processes the HSSF records.
156 private static class TikaHSSFListener implements HSSFListener, Serializable {
158 private final XHTMLContentHandler handler;
160 private SAXException exception;
162 private SSTRecord sstRecord;
163 private List<String> sheetNames = new ArrayList<String>();
164 private short currentSheetIndex;
166 private boolean insideWorksheet = false;
168 private SortedMap<Point, Cell> currentSheet =
169 new TreeMap<Point, Cell>(new Comparator<Point> () {
170 public int compare(Point a, Point b) {
171 int diff = a.y - b.y;
172 if (diff == 0) {
173 diff = a.x - b.x;
175 return diff;
180 * Contstruct a new listener instance outputting parsed data to
181 * the specified XHTML content handler.
183 * @param handler Destination to write the parsed output to
185 private TikaHSSFListener(XHTMLContentHandler handler) {
186 this.handler = handler;
187 this.exception = null;
191 * Process a HSSF record.
193 * @param record HSSF Record
195 public void processRecord(Record record) {
196 try {
197 if (log.isDebugEnabled()) {
198 log.debug(record.toString());
200 internalProcessRecord(record);
201 } catch (SAXException e) {
202 if (exception == null) {
203 exception = e;
208 public void throwStoredException() throws SAXException {
209 if (exception != null) {
210 throw exception;
214 private void internalProcessRecord(Record record) throws SAXException {
215 switch (record.getSid()) {
217 /* BOFRecord: indicates start of workbook, worksheet etc. records */
218 case BOFRecord.sid:
219 switch (((BOFRecord) record).getType()) {
220 case BOFRecord.TYPE_WORKBOOK:
221 currentSheetIndex = -1;
222 break;
223 case BOFRecord.TYPE_WORKSHEET:
224 currentSheetIndex++;
225 currentSheet.clear();
226 insideWorksheet = true;
227 break;
229 break;
231 /* EOFRecord: indicates end of workbook, worksheet etc. records */
232 case EOFRecord.sid:
233 // ignore empty sheets
234 if (insideWorksheet && !currentSheet.isEmpty()) {
235 processSheet();
237 insideWorksheet = false;
238 break;
240 /* SSTRecord: holds all the strings for LabelSSTRecords */
241 case SSTRecord.sid:
242 sstRecord = (SSTRecord)record;
243 break;
245 /* BoundSheetRecord: Worksheet index record */
246 case BoundSheetRecord.sid:
247 BoundSheetRecord boundSheetRecord = (BoundSheetRecord)record;
248 String sheetName = boundSheetRecord.getSheetname();
249 sheetNames.add(sheetName);
250 break;
252 // FIXME - requires POI release
253 ///* HyperlinkRecord: holds a URL associated with a cell */
254 //case HyperlinkRecord.sid:
255 // HyperlinkRecord hyperlinkRecord = (HyperlinkRecord)record;
256 // if (insideWorksheet) {
257 // int row = hyperlinkRecord.getFirstRow();
258 // short column = hyperlinkRecord.getFirstColumn();
259 // Point point = new Point(column, row);
260 // Cell cell = currentSheet.get(point);
261 // if (cell != null) {
262 // cell = new LinkedCell(cell, hyperlinkRecord.getAddress());
263 // currentSheet.put(point, cell);
264 // }
265 // }
266 // break;
268 default:
269 if (insideWorksheet
270 && record instanceof CellValueRecordInterface) {
271 CellValueRecordInterface value =
272 (CellValueRecordInterface) record;
273 Cell cell = getCellValue(record.getSid(), value);
274 if (cell != null) {
275 Point point =
276 new Point(value.getColumn(), value.getRow());
277 currentSheet.put(point, cell);
280 break;
285 * Process a Cell Value record.
287 * @param sid record type identifier
288 * @param record The cell value record
290 private Cell getCellValue(
291 short sid, CellValueRecordInterface record)
292 throws SAXException {
294 String text = null;
295 switch (sid) {
296 /* FormulaRecord: Cell value from a formula */
297 case FormulaRecord.sid:
298 return new NumberCell(((FormulaRecord) record).getValue());
300 /* LabelRecord: strings stored directly in the cell */
301 case LabelRecord.sid:
302 text = ((LabelRecord)record).getValue();
303 break;
305 /* LabelSSTRecord: Ref. a string in the shared string table */
306 case LabelSSTRecord.sid:
307 LabelSSTRecord labelSSTRecord = (LabelSSTRecord) record;
308 int sstIndex = labelSSTRecord.getSSTIndex();
309 text = sstRecord.getString(sstIndex).getString();
310 break;
312 /* NumberRecord: Contains a numeric cell value */
313 case NumberRecord.sid:
314 return new NumberCell(((NumberRecord) record).getValue());
316 /* RKRecord: Excel internal number record */
317 case RKRecord.sid:
318 return new NumberCell(((RKRecord)record).getRKNumber());
320 if (text != null) {
321 text = text.trim();
323 if (text != null && text.length() > 0) {
324 return new TextCell(text);
326 return null;
330 * Process an excel sheet.
332 * @throws SAXException if an error occurs
334 private void processSheet() throws SAXException {
335 // Sheet Start
336 handler.startElement("div", "class", "page");
337 if (currentSheetIndex < sheetNames.size()) {
338 handler.element("h1", sheetNames.get(currentSheetIndex));
340 handler.characters("\n");
341 handler.startElement("table");
342 handler.startElement("tbody");
344 // Process Rows
345 int currentRow = 1;
346 int currentColumn = 1;
347 handler.startElement("tr");
348 handler.startElement("td");
349 for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
350 while (currentRow < entry.getKey().y) {
351 handler.endElement("td");
352 handler.endElement("tr");
353 handler.characters("\n");
354 handler.startElement("tr");
355 handler.startElement("td");
356 currentRow++;
357 currentColumn = 1;
360 while (currentColumn < entry.getKey().x) {
361 handler.endElement("td");
362 handler.characters("\t");
363 handler.startElement("td");
364 currentColumn++;
367 entry.getValue().render(handler);
369 handler.endElement("td");
370 handler.endElement("tr");
372 // Sheet End
373 handler.endElement("tbody");
374 handler.endElement("table");
375 handler.endElement("div");
376 handler.characters("\n");