TIKA-139: Add a composite parser
[tika.git] / src / main / java / org / apache / tika / parser / AutoDetectParser.java
blob0a2f04c6095142a64cf7c504399b22700e7499b2
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.parser;
19 import java.io.BufferedInputStream;
20 import java.io.ByteArrayOutputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
24 import org.apache.tika.config.TikaConfig;
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.mime.MimeType;
28 import org.apache.tika.mime.MimeTypeException;
29 import org.apache.tika.mime.MimeTypes;
30 import org.xml.sax.ContentHandler;
31 import org.xml.sax.SAXException;
33 public class AutoDetectParser extends CompositeParser {
35 private MimeTypes types;
/**
 * Creates an auto-detecting parser that is set up from the default Tika
 * configuration.
 * <p>
 * A failure to obtain the default configuration is not expected; if it
 * does occur it is rethrown wrapped in a {@link RuntimeException}.
 */
public AutoDetectParser() {
    TikaConfig defaults;
    try {
        defaults = TikaConfig.getDefaultConfig();
    } catch (TikaException e) {
        // FIXME: This should never happen
        throw new RuntimeException(e);
    }
    setConfig(defaults);
}
/**
 * Creates an auto-detecting parser that is set up from the given Tika
 * configuration.
 *
 * @param config Tika configuration to take the parser settings from
 */
public AutoDetectParser(TikaConfig config) {
    this.setConfig(config);
}
54 public void setConfig(TikaConfig config) {
55 setParsers(config.getParsers());
56 setMimeTypes(config.getMimeRepository());
59 public MimeTypes getMimeTypes() {
60 return types;
63 public void setMimeTypes(MimeTypes types) {
64 this.types = types;
67 public void parse(
68 InputStream stream, ContentHandler handler, Metadata metadata)
69 throws IOException, SAXException, TikaException {
70 // We need buffering to enable MIME magic detection before parsing
71 if (!stream.markSupported()) {
72 stream = new BufferedInputStream(stream);
75 // Automatically detect the MIME type of the document
76 MimeType type = getMimeType(stream, metadata);
77 metadata.set(Metadata.CONTENT_TYPE, type.getName());
79 // Parse the document
80 super.parse(stream, handler, metadata);
83 /**
84 * Automatically detects the MIME type of a document based on magic
85 * markers in the stream prefix and any given metadata hints.
86 * <p>
87 * The given stream is expected to support marks, so that this method
88 * can reset the stream to the position it was in before this method
89 * was called.
91 * @param stream document stream
92 * @param metadata metadata hints
93 * @return MIME type of the document
94 * @throws IOException if the document stream could not be read
96 private MimeType getMimeType(InputStream stream, Metadata metadata)
97 throws IOException {
98 // Get type based on magic prefix
99 stream.mark(types.getMinLength());
100 try {
101 byte[] prefix = getPrefix(stream, types.getMinLength());
102 MimeType type = types.getMimeType(prefix);
103 if (type != null) {
104 return type;
106 } finally {
107 stream.reset();
110 // Get type based on resourceName hint (if available)
111 String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
112 if (resourceName != null) {
113 MimeType type = types.getMimeType(resourceName);
114 if (type != null) {
115 return type;
119 // Get type based on metadata hint (if available)
120 String typename = metadata.get(Metadata.CONTENT_TYPE);
121 if (typename != null) {
122 try {
123 return types.forName(typename);
124 } catch (MimeTypeException e) {
125 // Malformed type name, ignore
129 // Finally, use the default type if no matches found
130 try {
131 return types.forName(MimeTypes.DEFAULT);
132 } catch (MimeTypeException e) {
133 // Should never happen
134 return null;
139 * Reads and returns the first <code>length</code> bytes from the
140 * given stream. If the stream ends before that, returns all bytes
141 * from the stream.
143 * @param input input stream
144 * @param length number of bytes to read and return
145 * @return stream prefix
146 * @throws IOException if the stream could not be read
148 private byte[] getPrefix(InputStream input, int length) throws IOException {
149 ByteArrayOutputStream output = new ByteArrayOutputStream();
150 byte[] buffer = new byte[Math.min(1024, length)];
151 int n = input.read(buffer);
152 while (n != -1) {
153 output.write(buffer, 0, n);
154 int remaining = length - output.size();
155 if (remaining > 0) {
156 n = input.read(buffer, 0, Math.min(buffer.length, remaining));
157 } else {
158 n = -1;
161 return output.toByteArray();