/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org
.apache
.tika
.parser
;
19 import java
.io
.BufferedInputStream
;
20 import java
.io
.ByteArrayOutputStream
;
21 import java
.io
.IOException
;
22 import java
.io
.InputStream
;
24 import org
.apache
.tika
.config
.TikaConfig
;
25 import org
.apache
.tika
.exception
.TikaException
;
26 import org
.apache
.tika
.metadata
.Metadata
;
27 import org
.apache
.tika
.mime
.MimeType
;
28 import org
.apache
.tika
.mime
.MimeTypeException
;
29 import org
.apache
.tika
.mime
.MimeTypes
;
30 import org
.xml
.sax
.ContentHandler
;
31 import org
.xml
.sax
.SAXException
;
33 public class AutoDetectParser
extends CompositeParser
{
35 private MimeTypes types
;
38 * Creates an auto-detecting parser instance using the default Tika
41 public AutoDetectParser() {
43 setConfig(TikaConfig
.getDefaultConfig());
44 } catch (TikaException e
) {
45 // FIXME: This should never happen
46 throw new RuntimeException(e
);
50 public AutoDetectParser(TikaConfig config
) {
54 public void setConfig(TikaConfig config
) {
55 setParsers(config
.getParsers());
56 setMimeTypes(config
.getMimeRepository());
59 public MimeTypes
getMimeTypes() {
63 public void setMimeTypes(MimeTypes types
) {
68 InputStream stream
, ContentHandler handler
, Metadata metadata
)
69 throws IOException
, SAXException
, TikaException
{
70 // We need buffering to enable MIME magic detection before parsing
71 if (!stream
.markSupported()) {
72 stream
= new BufferedInputStream(stream
);
75 // Automatically detect the MIME type of the document
76 MimeType type
= getMimeType(stream
, metadata
);
77 metadata
.set(Metadata
.CONTENT_TYPE
, type
.getName());
80 super.parse(stream
, handler
, metadata
);
84 * Automatically detects the MIME type of a document based on magic
85 * markers in the stream prefix and any given metadata hints.
87 * The given stream is expected to support marks, so that this method
88 * can reset the stream to the position it was in before this method
91 * @param stream document stream
92 * @param metadata metadata hints
93 * @return MIME type of the document
94 * @throws IOException if the document stream could not be read
96 private MimeType
getMimeType(InputStream stream
, Metadata metadata
)
98 // Get type based on magic prefix
99 stream
.mark(types
.getMinLength());
101 byte[] prefix
= getPrefix(stream
, types
.getMinLength());
102 MimeType type
= types
.getMimeType(prefix
);
110 // Get type based on resourceName hint (if available)
111 String resourceName
= metadata
.get(Metadata
.RESOURCE_NAME_KEY
);
112 if (resourceName
!= null) {
113 MimeType type
= types
.getMimeType(resourceName
);
119 // Get type based on metadata hint (if available)
120 String typename
= metadata
.get(Metadata
.CONTENT_TYPE
);
121 if (typename
!= null) {
123 return types
.forName(typename
);
124 } catch (MimeTypeException e
) {
125 // Malformed type name, ignore
129 // Finally, use the default type if no matches found
131 return types
.forName(MimeTypes
.DEFAULT
);
132 } catch (MimeTypeException e
) {
133 // Should never happen
139 * Reads and returns the first <code>length</code> bytes from the
140 * given stream. If the stream ends before that, returns all bytes
143 * @param input input stream
144 * @param length number of bytes to read and return
145 * @return stream prefix
146 * @throws IOException if the stream could not be read
148 private byte[] getPrefix(InputStream input
, int length
) throws IOException
{
149 ByteArrayOutputStream output
= new ByteArrayOutputStream();
150 byte[] buffer
= new byte[Math
.min(1024, length
)];
151 int n
= input
.read(buffer
);
153 output
.write(buffer
, 0, n
);
154 int remaining
= length
- output
.size();
156 n
= input
.read(buffer
, 0, Math
.min(buffer
.length
, remaining
));
161 return output
.toByteArray();