From 8bf805449ebd76d938e84d8601308a8f06114635 Mon Sep 17 00:00:00 2001 From: Jukka Lauri Zitting Date: Fri, 11 Apr 2008 14:29:33 +0000 Subject: [PATCH] TIKA-139: Add a composite parser git-svn-id: https://svn.eu.apache.org/repos/asf/incubator/tika/trunk@647181 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + .../java/org/apache/tika/config/TikaConfig.java | 6 +- .../org/apache/tika/parser/AutoDetectParser.java | 34 +++--- .../org/apache/tika/parser/CompositeParser.java | 119 +++++++++++++++++++++ 4 files changed, 141 insertions(+), 21 deletions(-) create mode 100644 src/main/java/org/apache/tika/parser/CompositeParser.java diff --git a/CHANGES.txt b/CHANGES.txt index 5cb6be9..b77634d 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -46,6 +46,9 @@ Unreleased changes (0.2-incubating) 19. TIKA-113 - Metadata (such as title) should not be part of content (Jukka Zitting) +20. TIKA-139 - Add a composite parser (Jukka Zitting) + + Release 0.1-incubating - 12/27/2007 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann) diff --git a/src/main/java/org/apache/tika/config/TikaConfig.java b/src/main/java/org/apache/tika/config/TikaConfig.java index a7c1888..2d5cb46 100644 --- a/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/src/main/java/org/apache/tika/config/TikaConfig.java @@ -107,7 +107,11 @@ public class TikaConfig { public Parser getParser(String mimeType) { return parsers.get(mimeType); } - + + public Map getParsers() { + return parsers; + } + public MimeTypes getMimeRepository(){ return mimeTypes; } diff --git a/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/src/main/java/org/apache/tika/parser/AutoDetectParser.java index cbd0928..0a2f04c 100644 --- a/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -30,9 +30,9 @@ import org.apache.tika.mime.MimeTypes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -public class AutoDetectParser 
extends AbstractParser { +public class AutoDetectParser extends CompositeParser { - private TikaConfig config; + private MimeTypes types; /** * Creates an auto-detecting parser instance using the default Tika @@ -40,7 +40,7 @@ public class AutoDetectParser extends AbstractParser { */ public AutoDetectParser() { try { - config = TikaConfig.getDefaultConfig(); + setConfig(TikaConfig.getDefaultConfig()); } catch (TikaException e) { // FIXME: This should never happen throw new RuntimeException(e); @@ -48,15 +48,20 @@ public class AutoDetectParser extends AbstractParser { } public AutoDetectParser(TikaConfig config) { - this.config = config; + setConfig(config); } - public TikaConfig getConfig() { - return config; + public void setConfig(TikaConfig config) { + setParsers(config.getParsers()); + setMimeTypes(config.getMimeRepository()); } - public void setConfig(TikaConfig config) { - this.config = config; + public MimeTypes getMimeTypes() { + return types; + } + + public void setMimeTypes(MimeTypes types) { + this.types = types; } public void parse( @@ -71,17 +76,8 @@ public class AutoDetectParser extends AbstractParser { MimeType type = getMimeType(stream, metadata); metadata.set(Metadata.CONTENT_TYPE, type.getName()); - // Get the parser configured for the detected MIME type - Parser parser = config.getParser(type.getName()); - if (parser == null) { - parser = config.getParser(MimeTypes.DEFAULT); - } - if (parser == null) { - throw new TikaException("No parsers available: " + type.getName()); - } - // Parse the document - parser.parse(stream, handler, metadata); + super.parse(stream, handler, metadata); } /** @@ -99,8 +95,6 @@ public class AutoDetectParser extends AbstractParser { */ private MimeType getMimeType(InputStream stream, Metadata metadata) throws IOException { - MimeTypes types = config.getMimeRepository(); - // Get type based on magic prefix stream.mark(types.getMinLength()); try { diff --git a/src/main/java/org/apache/tika/parser/CompositeParser.java 
b/src/main/java/org/apache/tika/parser/CompositeParser.java new file mode 100644 index 0000000..b400f4a --- /dev/null +++ b/src/main/java/org/apache/tika/parser/CompositeParser.java @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Composite parser that delegates parsing tasks to a component parser + * based on the declared content type of the incoming document. A fallback + * parser is defined for cases where a parser for the given content type is + * not available. + */ +public class CompositeParser implements Parser { + + /** + * Set of component parsers, keyed by the supported media types. + */ + private Map<String, Parser> parsers = new HashMap<String, Parser>(); + + /** + * The fallback parser, used when no better parser is available. + */ + private Parser fallback = new EmptyParser(); + + /** + * Returns the component parsers.
+ * + * @return component parsers, keyed by media type + */ + public Map<String, Parser> getParsers() { + return parsers; + } + + /** + * Sets the component parsers. + * + * @param parsers component parsers, keyed by media type + */ + public void setParsers(Map<String, Parser> parsers) { + this.parsers = parsers; + } + + /** + * Returns the fallback parser. + * + * @return fallback parser + */ + public Parser getFallback() { + return fallback; + } + + /** + * Sets the fallback parser. + * + * @param fallback fallback parser + */ + public void setFallback(Parser fallback) { + this.fallback = fallback; + } + + /** + * Returns the parser that best matches the given metadata. By default + * looks for a parser that matches the content type metadata property, + * and uses the fallback parser if a better match is not found. + * <p>
+ * Subclasses can override this method to provide more accurate + * parser resolution. + * + * @param metadata document metadata + * @return matching parser + */ + protected Parser getParser(Metadata metadata) { + Parser parser = parsers.get(metadata.get(Metadata.CONTENT_TYPE)); + if (parser == null) { + parser = fallback; + } + return parser; + } + + /** + * Delegates the call to the matching component parser. + */ + public void parse(InputStream stream, Metadata metadata) + throws IOException, TikaException { + getParser(metadata).parse(stream, metadata); + } + + /** + * Delegates the call to the matching component parser. + */ + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata) + throws IOException, SAXException, TikaException { + getParser(metadata).parse(stream, handler, metadata); + } + +} -- 2.11.4.GIT