From e2334261889e081fca4b3237b201c069429b0610 Mon Sep 17 00:00:00 2001 From: Jukka Lauri Zitting Date: Wed, 26 Mar 2008 21:04:41 +0000 Subject: [PATCH] TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support - Removed the insideWorksheet flag - Improved javadocs - Extracted PointComparator to an explicit utility class git-svn-id: https://svn.eu.apache.org/repos/asf/incubator/tika/trunk@641575 13f79535-47bb-0310-9956-ffa450edef68 --- .../tika/parser/microsoft/ExcelExtractor.java | 65 ++++++++++++++-------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java index 0ed8377..2a5e616 100644 --- a/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java +++ b/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java @@ -170,21 +170,23 @@ public class ExcelExtractor { private SAXException exception = null; private SSTRecord sstRecord; + + /** + * List of worksheet names. + */ private List sheetNames = new ArrayList(); - private short currentSheetIndex; - private boolean insideWorksheet = false; + /** + * Index of the current worksheet within the workbook. + * Used to find the worksheet name in the {@link #sheetNames} list. + */ + private short currentSheetIndex; - private SortedMap currentSheet = - new TreeMap(new Comparator () { - public int compare(Point a, Point b) { - int diff = a.y - b.y; - if (diff == 0) { - diff = a.x - b.x; - } - return diff; - } - }); + /** + * Content of the current worksheet, or null if no + * worksheet is currently active. 
+ */ + private SortedMap currentSheet = null; /** * Contstruct a new listener instance outputting parsed data to @@ -228,16 +230,16 @@ public class ExcelExtractor { currentSheetIndex = -1; } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) { currentSheetIndex++; - currentSheet.clear(); - insideWorksheet = true; + currentSheet = + new TreeMap(new PointComparator()); } break; case EOFRecord.sid: // end of workbook, worksheet etc. records - if (insideWorksheet && !currentSheet.isEmpty()) { + if (currentSheet != null && !currentSheet.isEmpty()) { processSheet(); } - insideWorksheet = false; + currentSheet = null; break; case BoundSheetRecord.sid: // Worksheet index record @@ -277,12 +279,14 @@ public class ExcelExtractor { // FIXME - requires POI release // case HyperlinkRecord.sid: // holds a URL associated with a cell - // HyperlinkRecord link = (HyperlinkRecord) record; - // Point point = - // new Point(link.getFirstColumn(), link.getFirstRow()); - // Cell cell = currentSheet.get(point); - // if (cell != null) { - // addCell(record, new LinkedCell(cell, link.getAddress())); + // if (currentSheet != null) { + // HyperlinkRecord link = (HyperlinkRecord) record; + // Point point = + // new Point(link.getFirstColumn(), link.getFirstRow()); + // Cell cell = currentSheet.get(point); + // if (cell != null) { + // addCell(record, new LinkedCell(cell, link.getAddress())); + // } // } // break; } @@ -296,7 +300,7 @@ public class ExcelExtractor { * @param cell cell value (or null) */ private void addCell(Record record, Cell cell) { - if (!insideWorksheet) { + if (currentSheet == null) { // Ignore cells outside sheets } else if (cell == null) { // Ignore empty cells @@ -375,4 +379,19 @@ public class ExcelExtractor { } } + /** + * Utility comparator for points. 
+ */ + private static class PointComparator implements Comparator<Point> { + + public int compare(Point a, Point b) { + int diff = a.y - b.y; + if (diff == 0) { + diff = a.x - b.x; + } + return diff; + } + + } + } -- 2.11.4.GIT