2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org
.apache
.tika
.mime
;
21 import java
.io
.IOException
;
22 import java
.io
.InputStream
;
24 import java
.util
.Arrays
;
26 import java
.util
.HashMap
;
27 import java
.util
.ArrayList
;
28 import java
.util
.Collections
;
29 import java
.util
.Comparator
;
30 import java
.util
.List
;
/**
 * This class is a MimeType repository. It gathers a set of MimeTypes and
 * makes it possible to retrieve a content type from its name, from a file
 * name, or from a magic character sequence.
 * <p>
 * The MIME type detection methods that take an {@link InputStream} as
 * an argument will never read more than {@link #getMinLength()} bytes
 * from the stream. Also the given stream is never
 * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
 * or {@link InputStream#reset() reset} by the methods. Thus a client can
 * use the {@link InputStream#markSupported() mark feature} of the stream
 * (if available) to restore the stream back to the state it was in before
 * type detection if it wants to process the stream based on the detected type.
 */
46 public final class MimeTypes
{
48 /** The default <code>application/octet-stream</code> MimeType */
49 public final static String DEFAULT
= "application/octet-stream";
51 /** All the registered MimeTypes indexed on their name */
52 private Map
<String
, MimeInfo
> types
= new HashMap
<String
, MimeInfo
>();
54 /** The patterns matcher */
55 private Patterns patterns
= new Patterns();
57 /** List of all registered magics */
58 private ArrayList
<Magic
> magics
= new ArrayList
<Magic
>();
60 /** List of all registered rootXML */
61 private ArrayList
<MimeInfo
> xmls
= new ArrayList
<MimeInfo
>();
63 private Map
<String
, List
<MimeInfo
>> unsolvedDeps
=
64 new HashMap
<String
, List
<MimeInfo
>>();
67 * A comparator used to sort the mime types based on their magics (it is
68 * sorted first on the magic's priority, then on the magic's size).
70 final static Comparator
<Magic
> MAGICS_COMPARATOR
= new Comparator
<Magic
>() {
71 public int compare(Magic m1
, Magic m2
) {
72 int p1
= m1
.getPriority();
73 int p2
= m2
.getPriority();
77 return m2
.size() - m1
.size();
82 * A comparator used to sort the mime types based on their level (the level
83 * is the number of super-types for a type)
85 private final static Comparator
<MimeInfo
> LEVELS_COMPARATOR
=
86 new Comparator
<MimeInfo
>() {
87 public int compare(MimeInfo o1
, MimeInfo o2
) {
88 return o2
.getLevel() - o1
.getLevel();
92 /** The minimum length of data to provide to check all MimeTypes */
93 private int minLength
= 0;
96 * Find the Mime Content Type of a file.
100 * @return the Mime Content Type of the specified file, or <code>null</code>
103 public MimeType
getMimeType(File file
) {
104 return getMimeType(file
.getName());
108 * Find the Mime Content Type of a document from its URL.
111 * of the document to analyze.
112 * @return the Mime Content Type of the specified document URL, or
113 * <code>null</code> if none is found.
115 public MimeType
getMimeType(URL url
) {
116 return getMimeType(url
.getPath());
120 * Find the Mime Content Type of a document from its name.
123 * of the document to analyze.
124 * @return the Mime Content Type of the specified document name, or
125 * <code>null</code> if none is found.
127 public MimeType
getMimeType(String name
) {
128 MimeType type
= patterns
.matches(name
.toLowerCase());
131 // if it's null here, then return the default type
132 return forName(DEFAULT
);
136 * Returns the MIME type that best matches the given first few bytes
137 * of a document stream.
139 * The given byte array is expected to be at least {@link #getMinLength()}
140 * long, or shorter only if the document stream itself is shorter.
142 * @param data first few bytes of a document stream
143 * @return matching MIME type, or <code>null</code> if no match is found
145 public MimeType
getMimeType(byte[] data
) {
148 // First, check for XML descriptions (level by level)
149 for (MimeInfo info
: xmls
) {
150 MimeType type
= info
.getType();
151 if (type
.matchesXML(data
)) {
156 // Then, check for magic bytes
157 for (Magic magic
: magics
) {
158 if (magic
.eval(data
)) {
159 return magic
.getType();
167 * Returns the MIME type that best matches the first few bytes of the
168 * given document stream.
170 * @see #getMimeType(byte[])
171 * @param stream document stream
172 * @return matching MIME type, or <code>null</code> if no match is found
173 * @throws IOException if the stream can be read
175 public MimeType
getMimeType(InputStream stream
) throws IOException
{
176 return getMimeType(readMagicHeader(stream
));
180 * Reads the first {@link #getMinLength()} bytes from the given stream.
181 * If the stream is shorter, then the entire content of the stream is
184 * The given stream is never {@link InputStream#close() closed},
185 * {@link InputStream#mark(int) marked}, or
186 * {@link InputStream#reset() reset} by this method.
188 * @param stream stream to be read
189 * @return first {@link #getMinLength()} (or fewer) bytes of the stream
190 * @throws IOException if the stream can not be read
192 private byte[] readMagicHeader(InputStream stream
) throws IOException
{
193 assert stream
!= null;
195 byte[] bytes
= new byte[getMinLength()];
198 int lastRead
= stream
.read(bytes
);
199 while (lastRead
!= -1) {
200 totalRead
+= lastRead
;
201 if (totalRead
== bytes
.length
) {
204 lastRead
= stream
.read(bytes
, totalRead
, bytes
.length
- totalRead
);
207 byte[] shorter
= new byte[totalRead
];
208 System
.arraycopy(bytes
, 0, shorter
, 0, totalRead
);
213 * Find the Mime Content Type of a document from its name and its content.
214 * The policy used to guess the Mime Content Type is:
216 * <li>Try to find the type based on the provided data.</li>
217 * <li>If a type is found, then return it, otherwise try to find the type
218 * based on the file name</li>
222 * of the document to analyze.
224 * are the first bytes of the document's content.
225 * @return the Mime Content Type of the specified document, or
226 * <code>null</code> if none is found.
227 * @see #getMinLength()
229 public MimeType
getMimeType(String name
, byte[] data
) {
230 // First, try to get the mime-type from the content
231 MimeType mimeType
= getMimeType(data
);
233 // If no mime-type found, then try to get the mime-type from
235 if (mimeType
== null) {
236 mimeType
= getMimeType(name
);
243 * Returns the MIME type that best matches the given document name and
244 * the first few bytes of the given document stream.
246 * @see #getMimeType(String, byte[])
247 * @param name document name
248 * @param stream document stream
249 * @return matching MIME type, or <code>null</code> if no match is found
250 * @throws IOException if the stream can not be read
252 public MimeType
getMimeType(String name
, InputStream stream
)
254 return getMimeType(name
, readMagicHeader(stream
));
258 * Find a Mime Content Type from its name.
261 * is the content type name
262 * @return the MimeType for the specified name, or <code>null</code> if no
263 * MimeType is registered for this name.
265 public MimeType
forName(String name
) {
266 MimeInfo info
= types
.get(name
);
267 return (info
== null) ?
null : info
.getType();
271 * Return the minimum length of data to provide to analyzing methods based
272 * on the document's content in order to check all the known MimeTypes.
274 * @return the minimum length of data to provide.
275 * @see #getMimeType(byte[])
276 * @see #getMimeType(String, byte[])
278 public int getMinLength() {
284 * Add the specified mime-types in the repository.
287 * are the mime-types to add.
289 void add(MimeType
[] types
) {
293 for (int i
= 0; i
< types
.length
; i
++) {
299 * Add the specified mime-type in the repository.
302 * is the mime-type to add.
304 void add(MimeType type
) {
309 // Add the new type in the repository
310 MimeInfo info
= new MimeInfo(type
);
311 types
.put(type
.getName(), info
);
313 // Checks for some unsolved dependencies on this new type
314 List
<MimeInfo
> deps
= unsolvedDeps
.get(type
.getName());
316 int level
= info
.getLevel();
317 for (MimeInfo dep
: deps
) {
318 level
= Math
.max(level
, dep
.getLevel() + 1);
320 info
.setLevel(level
);
321 unsolvedDeps
.remove(type
.getName());
324 for (String name
: type
.getSuperTypes()) {
325 MimeInfo superType
= types
.get(name
);
326 if (superType
== null) {
327 deps
= unsolvedDeps
.get(name
);
329 deps
= new ArrayList
<MimeInfo
>();
330 unsolvedDeps
.put(name
, deps
);
337 minLength
= Math
.max(minLength
, type
.getMinLength());
338 // Update the extensions index...
339 patterns
.add(type
.getPatterns(), type
);
340 // Update the magics index...
341 if (type
.hasMagic()) {
342 magics
.addAll(Arrays
.asList(type
.getMagics()));
344 Collections
.sort(magics
, MAGICS_COMPARATOR
);
346 // Update the xml (xmlRoot) index...
347 if (type
.hasRootXML()) {
350 Collections
.sort(xmls
, LEVELS_COMPARATOR
);
354 public String
toString() {
355 StringBuilder builder
= new StringBuilder();
356 for (MimeInfo info
: types
.values()) {
357 builder
.append(info
.getType()).append("\n");
359 return builder
.toString();
362 private final class MimeInfo
{
364 private final MimeType type
;
368 MimeInfo(MimeType type
) {
381 void setLevel(int level
) {
382 if (level
> this.level
) {
385 // Update all my super-types
386 for (String name
: type
.getSuperTypes()) {
387 MimeInfo info
= types
.get(name
);
389 info
.setLevel(level
+ 1);