TIKA-139: Add a composite parser
[tika.git] / src / main / java / org / apache / tika / config / TikaConfig.java
blob2d5cb468da7bfd88c6d0e6fa77bbfaceb93e1a5c
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org.apache.tika.config;
19 import java.io.File;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.net.URL;
23 import java.util.HashMap;
24 import java.util.Map;
26 import javax.xml.parsers.DocumentBuilder;
27 import javax.xml.parsers.DocumentBuilderFactory;
28 import javax.xml.parsers.ParserConfigurationException;
30 import org.apache.tika.exception.TikaException;
31 import org.apache.tika.mime.MimeTypes;
32 import org.apache.tika.mime.MimeTypesFactory;
33 import org.apache.tika.parser.Parser;
34 import org.w3c.dom.Document;
35 import org.w3c.dom.Element;
36 import org.w3c.dom.Node;
37 import org.w3c.dom.NodeList;
38 import org.xml.sax.SAXException;
40 /**
41 * Parse xml config file.
43 public class TikaConfig {
45 public static final String DEFAULT_CONFIG_LOCATION =
46 "/org/apache/tika/tika-config.xml";
48 private final Map<String, Parser> parsers = new HashMap<String, Parser>();
50 private static MimeTypes mimeTypes;
52 public TikaConfig(String file)
53 throws TikaException, IOException, SAXException {
54 this(new File(file));
57 public TikaConfig(File file)
58 throws TikaException, IOException, SAXException {
59 this(getBuilder().parse(file));
62 public TikaConfig(URL url)
63 throws TikaException, IOException, SAXException {
64 this(getBuilder().parse(url.toString()));
67 public TikaConfig(InputStream stream)
68 throws TikaException, IOException, SAXException {
69 this(getBuilder().parse(stream));
72 public TikaConfig(Document document) throws TikaException, IOException {
73 this(document.getDocumentElement());
76 public TikaConfig(Element element) throws TikaException, IOException {
77 Element mtr = getChild(element, "mimeTypeRepository");
78 if (mtr != null) {
79 mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
82 NodeList nodes = element.getElementsByTagName("parser");
83 for (int i = 0; i < nodes.getLength(); i++) {
84 Element node = (Element) nodes.item(i);
85 String name = node.getAttribute("class");
86 try {
87 Parser parser = (Parser) Class.forName(name).newInstance();
88 NodeList mimes = node.getElementsByTagName("mime");
89 for (int j = 0; j < mimes.getLength(); j++) {
90 Element mime = (Element) mimes.item(j);
91 parsers.put(mime.getTextContent().trim(), parser);
93 } catch (Exception e) {
94 throw new TikaException(
95 "Invalid parser configuration: " + name, e);
101 * Returns the parser instance configured for the given MIME type.
102 * Returns <code>null</code> if the given MIME type is unknown.
104 * @param mimeType MIME type
105 * @return configured Parser instance, or <code>null</code>
107 public Parser getParser(String mimeType) {
108 return parsers.get(mimeType);
111 public Map<String, Parser> getParsers() {
112 return parsers;
115 public MimeTypes getMimeRepository(){
116 return mimeTypes;
120 * Provides a default configuration (TikaConfig). Currently creates a
121 * new instance each time it's called; we may be able to have it
122 * return a shared instance once it is completely immutable.
124 * @return default configuration
125 * @throws TikaException if the default configuration is not available
127 public static TikaConfig getDefaultConfig() throws TikaException {
128 try {
129 InputStream stream =
130 TikaConfig.class.getResourceAsStream(DEFAULT_CONFIG_LOCATION);
131 return new TikaConfig(stream);
132 } catch (IOException e) {
133 throw new TikaException("Unable to read default configuration", e);
134 } catch (SAXException e) {
135 throw new TikaException("Unable to parse default configuration", e);
139 private static DocumentBuilder getBuilder() throws TikaException {
140 try {
141 return DocumentBuilderFactory.newInstance().newDocumentBuilder();
142 } catch (ParserConfigurationException e) {
143 throw new TikaException("XML parser not available", e);
147 private static Element getChild(Element element, String name) {
148 Node child = element.getFirstChild();
149 while (child != null) {
150 if (child.getNodeType() == Node.ELEMENT_NODE
151 && name.equals(child.getNodeName())) {
152 return (Element) child;
154 child = child.getNextSibling();
156 return null;