TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support
[tika.git] / src / main / java / org / apache / tika / parser / microsoft / ExcelExtractor.java
blob1cf1fde04f64dba96cc026ed6bc055301ca5a10d
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.parser.microsoft;
19 import java.awt.Point;
20 import java.io.IOException;
21 import java.io.Serializable;
22 import java.util.ArrayList;
23 import java.util.Comparator;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.SortedMap;
27 import java.util.TreeMap;
29 import org.apache.commons.logging.Log;
30 import org.apache.commons.logging.LogFactory;
31 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
32 import org.apache.poi.hssf.eventusermodel.HSSFListener;
33 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
34 import org.apache.poi.hssf.record.BOFRecord;
35 import org.apache.poi.hssf.record.BoundSheetRecord;
36 import org.apache.poi.hssf.record.CellValueRecordInterface;
37 import org.apache.poi.hssf.record.CountryRecord;
38 import org.apache.poi.hssf.record.DateWindow1904Record;
39 import org.apache.poi.hssf.record.EOFRecord;
40 import org.apache.poi.hssf.record.ExtendedFormatRecord;
41 import org.apache.poi.hssf.record.FormatRecord;
42 import org.apache.poi.hssf.record.FormulaRecord;
43 //import org.apache.poi.hssf.record.HyperlinkRecord; // FIXME - requires POI release
44 import org.apache.poi.hssf.record.LabelRecord;
45 import org.apache.poi.hssf.record.LabelSSTRecord;
46 import org.apache.poi.hssf.record.NumberRecord;
47 import org.apache.poi.hssf.record.RKRecord;
48 import org.apache.poi.hssf.record.Record;
49 import org.apache.poi.hssf.record.SSTRecord;
50 import org.apache.poi.poifs.filesystem.DocumentInputStream;
51 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
52 import org.apache.tika.sax.XHTMLContentHandler;
53 import org.xml.sax.SAXException;
55 /**
56 * Excel parser implementation which uses POI's Event API
57 * to handle the contents of a Workbook.
58 * <p>
59 * The Event API uses a much smaller memory footprint than
60 * <code>HSSFWorkbook</code> when processing excel files
61 * but at the cost of more complexity.
62 * <p>
63 * With the Event API a <i>listener</i> is registered for
64 * specific record types and those records are created,
65 * fired off to the listener and then discarded as the stream
66 * is being processed.
68 * @see org.apache.poi.hssf.eventusermodel.HSSFListener
69 * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
70 * POI Event API How To</a>
72 public class ExcelExtractor {
74 /** Logging instance */
75 private static final Log log = LogFactory.getLog(ExcelExtractor.class);
77 /**
78 * <code>true</code> if the HSSFListener should be registered
79 * to listen for all records or <code>false</code> (the default)
80 * if the listener should be configured to only receive specified
81 * records.
83 private boolean listenForAllRecords = false;
85 /**
86 * Returns <code>true</code> if this parser is configured to listen
87 * for all records instead of just the specified few.
89 public boolean isListenForAllRecords() {
90 return listenForAllRecords;
93 /**
94 * Specifies whether this parser should to listen for all
95 * records or just for the specified few.
96 * <p>
97 * <strong>Note:</strong> Under normal operation this setting should
98 * be <code>false</code> (the default), but you can experiment with
99 * this setting for testing and debugging purposes.
101 * @param listenForAllRecords <code>true</code> if the HSSFListener
102 * should be registered to listen for all records or <code>false</code>
103 * if the listener should be configured to only receive specified records.
105 public void setListenForAllRecords(boolean listenForAllRecords) {
106 this.listenForAllRecords = listenForAllRecords;
110 * Extracts text from an Excel Workbook writing the extracted content
111 * to the specified {@link Appendable}.
113 * @param filesystem POI file system
114 * @throws IOException if an error occurs processing the workbook
115 * or writing the extracted content
117 protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
118 throws IOException, SAXException {
119 log.debug("Starting listenForAllRecords=" + listenForAllRecords);
121 // Set up listener and register the records we want to process
122 TikaHSSFListener listener = new TikaHSSFListener(xhtml);
123 HSSFRequest hssfRequest = new HSSFRequest();
124 if (listenForAllRecords) {
125 hssfRequest.addListenerForAllRecords(listener);
126 } else {
127 hssfRequest.addListener(listener, BOFRecord.sid);
128 hssfRequest.addListener(listener, EOFRecord.sid);
129 hssfRequest.addListener(listener, DateWindow1904Record.sid);
130 hssfRequest.addListener(listener, CountryRecord.sid);
131 hssfRequest.addListener(listener, BoundSheetRecord.sid);
132 hssfRequest.addListener(listener, FormatRecord.sid);
133 hssfRequest.addListener(listener, ExtendedFormatRecord.sid);
134 hssfRequest.addListener(listener, SSTRecord.sid);
135 hssfRequest.addListener(listener, FormulaRecord.sid);
136 hssfRequest.addListener(listener, LabelRecord.sid);
137 hssfRequest.addListener(listener, LabelSSTRecord.sid);
138 hssfRequest.addListener(listener, NumberRecord.sid);
139 hssfRequest.addListener(listener, RKRecord.sid);
140 //hssfRequest.addListener(listener, HyperlinkRecord.sid); // FIXME - requires POI release
143 // Create event factory and process Workbook (fire events)
144 DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
145 HSSFEventFactory eventFactory = new HSSFEventFactory();
147 eventFactory.processEvents(hssfRequest, documentInputStream);
148 listener.throwStoredException();
151 // ======================================================================
154 * HSSF Listener implementation which processes the HSSF records.
156 private static class TikaHSSFListener implements HSSFListener, Serializable {
158 private final XHTMLContentHandler handler;
160 private SAXException exception;
162 private SSTRecord sstRecord;
163 private List<String> sheetNames = new ArrayList<String>();
164 private short currentSheetIndex;
166 private boolean insideWorksheet = false;
168 private SortedMap<Point, Cell> currentSheet =
169 new TreeMap<Point, Cell>(new Comparator<Point> () {
170 public int compare(Point a, Point b) {
171 int diff = a.y - b.y;
172 if (diff == 0) {
173 diff = a.x - b.x;
175 return diff;
180 * Contstruct a new listener instance outputting parsed data to
181 * the specified XHTML content handler.
183 * @param handler Destination to write the parsed output to
185 private TikaHSSFListener(XHTMLContentHandler handler) {
186 this.handler = handler;
187 this.exception = null;
191 * Process a HSSF record.
193 * @param record HSSF Record
195 public void processRecord(Record record) {
196 try {
197 if (log.isDebugEnabled()) {
198 log.debug(record.toString());
200 internalProcessRecord(record);
201 } catch (SAXException e) {
202 if (exception == null) {
203 exception = e;
208 public void throwStoredException() throws SAXException {
209 if (exception != null) {
210 throw exception;
214 private void internalProcessRecord(Record record) throws SAXException {
215 switch (record.getSid()) {
217 /* BOFRecord: indicates start of workbook, worksheet etc. records */
218 case BOFRecord.sid:
219 switch (((BOFRecord) record).getType()) {
220 case BOFRecord.TYPE_WORKBOOK:
221 currentSheetIndex = -1;
222 break;
223 case BOFRecord.TYPE_WORKSHEET:
224 currentSheetIndex++;
225 currentSheet.clear();
226 insideWorksheet = true;
227 break;
229 break;
231 /* EOFRecord: indicates end of workbook, worksheet etc. records */
232 case EOFRecord.sid:
233 // ignore empty sheets
234 if (insideWorksheet && !currentSheet.isEmpty()) {
235 processSheet();
237 insideWorksheet = false;
238 break;
240 /* SSTRecord: holds all the strings for LabelSSTRecords */
241 case SSTRecord.sid:
242 sstRecord = (SSTRecord)record;
243 break;
245 /* BoundSheetRecord: Worksheet index record */
246 case BoundSheetRecord.sid:
247 BoundSheetRecord boundSheetRecord = (BoundSheetRecord)record;
248 String sheetName = boundSheetRecord.getSheetname();
249 sheetNames.add(sheetName);
250 break;
252 // FIXME - requires POI release
253 ///* HyperlinkRecord: holds a URL associated with a cell */
254 //case HyperlinkRecord.sid:
255 // HyperlinkRecord hyperlinkRecord = (HyperlinkRecord)record;
256 // if (insideWorksheet) {
257 // int row = hyperlinkRecord.getFirstRow();
258 // short column = hyperlinkRecord.getFirstColumn();
259 // Point point = new Point(column, row);
260 // Cell cell = currentSheet.get(point);
261 // if (cell != null) {
262 // cell = new LinkedCell(cell, hyperlinkRecord.getAddress());
263 // currentSheet.put(point, cell);
264 // }
265 // }
266 // break;
268 /* FormulaRecord: Cell value from a formula */
269 case FormulaRecord.sid:
270 FormulaRecord formula = (FormulaRecord) record;
271 addCell(record, new NumberCell(formula.getValue()));
272 break;
274 /* LabelRecord: strings stored directly in the cell */
275 case LabelRecord.sid:
276 LabelRecord label = (LabelRecord) record;
277 addCell(record, getTextCell(label.getValue()));
278 break;
280 /* LabelSSTRecord: Ref. a string in the shared string table */
281 case LabelSSTRecord.sid:
282 LabelSSTRecord labelSSTRecord = (LabelSSTRecord) record;
283 int sstIndex = labelSSTRecord.getSSTIndex();
284 String sstLabel = sstRecord.getString(sstIndex).getString();
285 addCell(record, getTextCell(sstLabel));
286 break;
288 /* NumberRecord: Contains a numeric cell value */
289 case NumberRecord.sid:
290 NumberRecord number = (NumberRecord) record;
291 addCell(record, new NumberCell(number.getValue()));
292 break;
294 /* RKRecord: Excel internal number record */
295 case RKRecord.sid:
296 RKRecord rk = (RKRecord) record;
297 addCell(record, new NumberCell(rk.getRKNumber()));
298 break;
303 * Adds the given cell (unless <code>null</code>) to the current
304 * worksheet (if any) at the position (if any) of the given record.
306 * @param record record that holds the cell value
307 * @param cell cell value (or <code>null</code>)
309 private void addCell(Record record, Cell cell) {
310 if (!insideWorksheet) {
311 // Ignore cells outside sheets
312 } else if (cell == null) {
313 // Ignore empty cells
314 } else if (record instanceof CellValueRecordInterface) {
315 CellValueRecordInterface value =
316 (CellValueRecordInterface) record;
317 Point point = new Point(value.getColumn(), value.getRow());
318 currentSheet.put(point, cell);
323 * Returns a text cell with the given text comment. The given text
324 * is trimmed, and ignored if <code>null</code> or empty.
326 * @param text text content, may be <code>null</code>
327 * @return text cell, or <code>null</code>
329 private Cell getTextCell(String text) {
330 if (text != null) {
331 text = text.trim();
332 if (text.length() > 0) {
333 return new TextCell(text);
336 return null;
340 * Process an excel sheet.
342 * @throws SAXException if an error occurs
344 private void processSheet() throws SAXException {
345 // Sheet Start
346 handler.startElement("div", "class", "page");
347 if (currentSheetIndex < sheetNames.size()) {
348 handler.element("h1", sheetNames.get(currentSheetIndex));
350 handler.characters("\n");
351 handler.startElement("table");
352 handler.startElement("tbody");
354 // Process Rows
355 int currentRow = 1;
356 int currentColumn = 1;
357 handler.startElement("tr");
358 handler.startElement("td");
359 for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
360 while (currentRow < entry.getKey().y) {
361 handler.endElement("td");
362 handler.endElement("tr");
363 handler.characters("\n");
364 handler.startElement("tr");
365 handler.startElement("td");
366 currentRow++;
367 currentColumn = 1;
370 while (currentColumn < entry.getKey().x) {
371 handler.endElement("td");
372 handler.characters("\t");
373 handler.startElement("td");
374 currentColumn++;
377 entry.getValue().render(handler);
379 handler.endElement("td");
380 handler.endElement("tr");
382 // Sheet End
383 handler.endElement("tbody");
384 handler.endElement("table");
385 handler.endElement("div");
386 handler.characters("\n");