/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org
.apache
.tika
.config
;
20 import java
.io
.IOException
;
21 import java
.io
.InputStream
;
23 import java
.util
.HashMap
;
26 import javax
.xml
.parsers
.DocumentBuilder
;
27 import javax
.xml
.parsers
.DocumentBuilderFactory
;
28 import javax
.xml
.parsers
.ParserConfigurationException
;
30 import org
.apache
.tika
.exception
.TikaException
;
31 import org
.apache
.tika
.mime
.MimeTypes
;
32 import org
.apache
.tika
.mime
.MimeTypesFactory
;
33 import org
.apache
.tika
.parser
.Parser
;
34 import org
.w3c
.dom
.Document
;
35 import org
.w3c
.dom
.Element
;
36 import org
.w3c
.dom
.Node
;
37 import org
.w3c
.dom
.NodeList
;
38 import org
.xml
.sax
.SAXException
;
/**
 * Parses the Tika XML configuration file.
 */
43 public class TikaConfig
{
/** Classpath location of the default configuration resource. */
public static final String DEFAULT_CONFIG_LOCATION =
    "/org/apache/tika/tika-config.xml";

/**
 * Parser instances keyed by the trimmed text of the <code>mime</code>
 * elements they were configured with.
 */
private final Map<String, Parser> parsers = new HashMap<String, Parser>();

// NOTE(review): this field is static but is assigned from the Element
// constructor, so constructing a TikaConfig can replace the value seen
// by every other instance. Confirm whether that sharing is intended.
private static MimeTypes mimeTypes;
52 public TikaConfig(String file
)
53 throws TikaException
, IOException
, SAXException
{
/**
 * Creates a configuration by parsing the given XML file.
 *
 * @param file XML configuration file
 * @throws TikaException if no XML parser is available or the parsed
 *         configuration is invalid
 * @throws IOException if the file cannot be read
 * @throws SAXException if the file cannot be parsed as XML
 */
public TikaConfig(File file)
        throws TikaException, IOException, SAXException {
    this(getBuilder().parse(file));
}
/**
 * Creates a configuration by parsing the XML document at the given URL.
 * The URL is handed to the XML parser in its external string form.
 *
 * @param url location of the XML configuration
 * @throws TikaException if no XML parser is available or the parsed
 *         configuration is invalid
 * @throws IOException if the document cannot be read
 * @throws SAXException if the document cannot be parsed as XML
 */
public TikaConfig(URL url)
        throws TikaException, IOException, SAXException {
    this(getBuilder().parse(url.toExternalForm()));
}
/**
 * Creates a configuration by parsing XML read from the given stream.
 * This constructor does not close the stream itself.
 *
 * @param stream stream containing the XML configuration
 * @throws TikaException if no XML parser is available or the parsed
 *         configuration is invalid
 * @throws IOException if the stream cannot be read
 * @throws SAXException if the content cannot be parsed as XML
 */
public TikaConfig(InputStream stream)
        throws TikaException, IOException, SAXException {
    this(getBuilder().parse(stream));
}
/**
 * Creates a configuration from an already parsed DOM document by
 * delegating to the element constructor with the document's root element.
 *
 * @param document parsed configuration document
 * @throws TikaException if the configuration is invalid
 * @throws IOException if the element constructor reports an I/O failure
 */
public TikaConfig(Document document) throws TikaException, IOException {
    this(document.getDocumentElement());
}
76 public TikaConfig(Element element
) throws TikaException
, IOException
{
77 Element mtr
= getChild(element
, "mimeTypeRepository");
79 mimeTypes
= MimeTypesFactory
.create(mtr
.getAttribute("resource"));
82 NodeList nodes
= element
.getElementsByTagName("parser");
83 for (int i
= 0; i
< nodes
.getLength(); i
++) {
84 Element node
= (Element
) nodes
.item(i
);
85 String name
= node
.getAttribute("class");
87 Parser parser
= (Parser
) Class
.forName(name
).newInstance();
88 NodeList mimes
= node
.getElementsByTagName("mime");
89 for (int j
= 0; j
< mimes
.getLength(); j
++) {
90 Element mime
= (Element
) mimes
.item(j
);
91 parsers
.put(mime
.getTextContent().trim(), parser
);
93 } catch (Exception e
) {
94 throw new TikaException(
95 "Invalid parser configuration: " + name
, e
);
101 * Returns the parser instance configured for the given MIME type.
102 * Returns <code>null</code> if the given MIME type is unknown.
104 * @param mimeType MIME type
105 * @return configured Parser instance, or <code>null</code>
107 public Parser
getParser(String mimeType
) {
108 return parsers
.get(mimeType
);
111 public Map
<String
, Parser
> getParsers() {
115 public MimeTypes
getMimeRepository(){
120 * Provides a default configuration (TikaConfig). Currently creates a
121 * new instance each time it's called; we may be able to have it
122 * return a shared instance once it is completely immutable.
124 * @return default configuration
125 * @throws TikaException if the default configuration is not available
127 public static TikaConfig
getDefaultConfig() throws TikaException
{
130 TikaConfig
.class.getResourceAsStream(DEFAULT_CONFIG_LOCATION
);
131 return new TikaConfig(stream
);
132 } catch (IOException e
) {
133 throw new TikaException("Unable to read default configuration", e
);
134 } catch (SAXException e
) {
135 throw new TikaException("Unable to parse default configuration", e
);
139 private static DocumentBuilder
getBuilder() throws TikaException
{
141 return DocumentBuilderFactory
.newInstance().newDocumentBuilder();
142 } catch (ParserConfigurationException e
) {
143 throw new TikaException("XML parser not available", e
);
147 private static Element
getChild(Element element
, String name
) {
148 Node child
= element
.getFirstChild();
149 while (child
!= null) {
150 if (child
.getNodeType() == Node
.ELEMENT_NODE
151 && name
.equals(child
.getNodeName())) {
152 return (Element
) child
;
154 child
= child
.getNextSibling();