TIKA-87 - MimeTypes should allow modification of MIME types
[tika.git] / src / main / java / org / apache / tika / mime / MimeTypes.java
blob38708f38e46564371aabce25c37cd63d7165efc6
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.mime;
19 // JDK imports
20 import java.io.File;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.net.URL;
24 import java.util.Arrays;
25 import java.util.Map;
26 import java.util.HashMap;
27 import java.util.ArrayList;
28 import java.util.Collections;
29 import java.util.Comparator;
30 import java.util.List;
32 /**
33 * This class is a MimeType repository. It gathers a set of MimeTypes and
34 * lets callers retrieve a content type from its name, from a file name, or
35 * from a magic character sequence.
36 * <p>
37 * The MIME type detection methods that take an {@link InputStream} as
38 * an argument will never read more than {@link #getMinLength()} bytes
39 * from the stream. Also the given stream is never
40 * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
41 * or {@link InputStream#reset() reset} by the methods. Thus a client can
42 * use the {@link InputStream#markSupported() mark feature} of the stream
43 * (if available) to restore the stream back to the state it was before type
44 * detection if it wants to process the stream based on the detected type.
46 public final class MimeTypes {
48 /** The default <code>application/octet-stream</code> MimeType */
49 public final static String DEFAULT = "application/octet-stream";
51 /** All the registered MimeTypes indexed on their name */
52 private Map<String, MimeInfo> types = new HashMap<String, MimeInfo>();
54 /** The patterns matcher */
55 private Patterns patterns = new Patterns();
57 /** List of all registered magics */
58 private ArrayList<Magic> magics = new ArrayList<Magic>();
60 /** List of all registered rootXML */
61 private ArrayList<MimeInfo> xmls = new ArrayList<MimeInfo>();
63 private Map<String, List<MimeInfo>> unsolvedDeps =
64 new HashMap<String, List<MimeInfo>>();
66 /**
67 * A comparator used to sort the mime types based on their magics (it is
68 * sorted first on the magic's priority, then on the magic's size).
70 final static Comparator<Magic> MAGICS_COMPARATOR = new Comparator<Magic>() {
71 public int compare(Magic m1, Magic m2) {
72 int p1 = m1.getPriority();
73 int p2 = m2.getPriority();
74 if (p1 != p2) {
75 return p2 - p1;
77 return m2.size() - m1.size();
81 /**
82 * A comparator used to sort the mime types based on their level (the level
83 * is the number of super-types for a type)
85 private final static Comparator<MimeInfo> LEVELS_COMPARATOR =
86 new Comparator<MimeInfo>() {
87 public int compare(MimeInfo o1, MimeInfo o2) {
88 return o2.getLevel() - o1.getLevel();
92 /** The minimum length of data to provide to check all MimeTypes */
93 private int minLength = 0;
95 /**
96 * Find the Mime Content Type of a file.
98 * @param file
99 * to analyze.
100 * @return the Mime Content Type of the specified file, or <code>null</code>
101 * if none is found.
103 public MimeType getMimeType(File file) {
104 return getMimeType(file.getName());
108 * Find the Mime Content Type of a document from its URL.
110 * @param url
111 * of the document to analyze.
112 * @return the Mime Content Type of the specified document URL, or
113 * <code>null</code> if none is found.
115 public MimeType getMimeType(URL url) {
116 return getMimeType(url.getPath());
120 * Find the Mime Content Type of a document from its name.
122 * @param name
123 * of the document to analyze.
124 * @return the Mime Content Type of the specified document name, or
125 * <code>null</code> if none is found.
127 public MimeType getMimeType(String name) {
128 MimeType type = patterns.matches(name.toLowerCase());
129 if (type != null)
130 return type;
131 // if it's null here, then return the default type
132 return forName(DEFAULT);
136 * Returns the MIME type that best matches the given first few bytes
137 * of a document stream.
138 * <p>
139 * The given byte array is expected to be at least {@link #getMinLength()}
140 * long, or shorter only if the document stream itself is shorter.
142 * @param data first few bytes of a document stream
143 * @return matching MIME type, or <code>null</code> if no match is found
145 public MimeType getMimeType(byte[] data) {
146 assert data != null;
148 // First, check for XML descriptions (level by level)
149 for (MimeInfo info : xmls) {
150 MimeType type = info.getType();
151 if (type.matchesXML(data)) {
152 return type;
156 // Then, check for magic bytes
157 for (Magic magic : magics) {
158 if (magic.eval(data)) {
159 return magic.getType();
163 return null;
167 * Returns the MIME type that best matches the first few bytes of the
168 * given document stream.
170 * @see #getMimeType(byte[])
171 * @param stream document stream
172 * @return matching MIME type, or <code>null</code> if no match is found
173 * @throws IOException if the stream can be read
175 public MimeType getMimeType(InputStream stream) throws IOException {
176 return getMimeType(readMagicHeader(stream));
180 * Reads the first {@link #getMinLength()} bytes from the given stream.
181 * If the stream is shorter, then the entire content of the stream is
182 * returned.
183 * <p>
184 * The given stream is never {@link InputStream#close() closed},
185 * {@link InputStream#mark(int) marked}, or
186 * {@link InputStream#reset() reset} by this method.
188 * @param stream stream to be read
189 * @return first {@link #getMinLength()} (or fewer) bytes of the stream
190 * @throws IOException if the stream can not be read
192 private byte[] readMagicHeader(InputStream stream) throws IOException {
193 assert stream != null;
195 byte[] bytes = new byte[getMinLength()];
196 int totalRead = 0;
198 int lastRead = stream.read(bytes);
199 while (lastRead != -1) {
200 totalRead += lastRead;
201 if (totalRead == bytes.length) {
202 return bytes;
204 lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
207 byte[] shorter = new byte[totalRead];
208 System.arraycopy(bytes, 0, shorter, 0, totalRead);
209 return shorter;
213 * Find the Mime Content Type of a document from its name and its content.
214 * The policy used to guess the Mime Content Type is:
215 * <ol>
216 * <li>Try to find the type based on the provided data.</li>
217 * <li>If a type is found, then return it, otherwise try to find the type
218 * based on the file name</li>
219 * </ol>
221 * @param name
222 * of the document to analyze.
223 * @param data
224 * are the first bytes of the document's content.
225 * @return the Mime Content Type of the specified document, or
226 * <code>null</code> if none is found.
227 * @see #getMinLength()
229 public MimeType getMimeType(String name, byte[] data) {
230 // First, try to get the mime-type from the content
231 MimeType mimeType = getMimeType(data);
233 // If no mime-type found, then try to get the mime-type from
234 // the document name
235 if (mimeType == null) {
236 mimeType = getMimeType(name);
239 return mimeType;
243 * Returns the MIME type that best matches the given document name and
244 * the first few bytes of the given document stream.
246 * @see #getMimeType(String, byte[])
247 * @param name document name
248 * @param stream document stream
249 * @return matching MIME type, or <code>null</code> if no match is found
250 * @throws IOException if the stream can not be read
252 public MimeType getMimeType(String name, InputStream stream)
253 throws IOException {
254 return getMimeType(name, readMagicHeader(stream));
258 * Find a Mime Content Type from its name.
260 * @param name
261 * is the content type name
262 * @return the MimeType for the specified name, or <code>null</code> if no
263 * MimeType is registered for this name.
265 public MimeType forName(String name) {
266 MimeInfo info = types.get(name);
267 return (info == null) ? null : info.getType();
271 * Return the minimum length of data to provide to analyzing methods based
272 * on the document's content in order to check all the known MimeTypes.
274 * @return the minimum length of data to provide.
275 * @see #getMimeType(byte[])
276 * @see #getMimeType(String, byte[])
278 public int getMinLength() {
279 return 1024;
280 // return minLength;
284 * Add the specified mime-types in the repository.
286 * @param types
287 * are the mime-types to add.
289 void add(MimeType[] types) {
290 if (types == null) {
291 return;
293 for (int i = 0; i < types.length; i++) {
294 add(types[i]);
299 * Add the specified mime-type in the repository.
301 * @param type
302 * is the mime-type to add.
304 void add(MimeType type) {
305 if (type == null) {
306 return;
309 // Add the new type in the repository
310 MimeInfo info = new MimeInfo(type);
311 types.put(type.getName(), info);
313 // Checks for some unsolved dependencies on this new type
314 List<MimeInfo> deps = unsolvedDeps.get(type.getName());
315 if (deps != null) {
316 int level = info.getLevel();
317 for (MimeInfo dep : deps) {
318 level = Math.max(level, dep.getLevel() + 1);
320 info.setLevel(level);
321 unsolvedDeps.remove(type.getName());
324 for (String name : type.getSuperTypes()) {
325 MimeInfo superType = types.get(name);
326 if (superType == null) {
327 deps = unsolvedDeps.get(name);
328 if (deps == null) {
329 deps = new ArrayList<MimeInfo>();
330 unsolvedDeps.put(name, deps);
332 deps.add(info);
336 // Update minLentgth
337 minLength = Math.max(minLength, type.getMinLength());
338 // Update the extensions index...
339 patterns.add(type.getPatterns(), type);
340 // Update the magics index...
341 if (type.hasMagic()) {
342 magics.addAll(Arrays.asList(type.getMagics()));
344 Collections.sort(magics, MAGICS_COMPARATOR);
346 // Update the xml (xmlRoot) index...
347 if (type.hasRootXML()) {
348 this.xmls.add(info);
350 Collections.sort(xmls, LEVELS_COMPARATOR);
353 // Inherited Javadoc
354 public String toString() {
355 StringBuilder builder = new StringBuilder();
356 for (MimeInfo info : types.values()) {
357 builder.append(info.getType()).append("\n");
359 return builder.toString();
362 private final class MimeInfo {
364 private final MimeType type;
366 private int level;
368 MimeInfo(MimeType type) {
369 this.type = type;
370 this.level = 0;
373 MimeType getType() {
374 return type;
377 int getLevel() {
378 return level;
381 void setLevel(int level) {
382 if (level > this.level) {
383 this.level = level;
385 // Update all my super-types
386 for (String name : type.getSuperTypes()) {
387 MimeInfo info = types.get(name);
388 if (info != null) {
389 info.setLevel(level + 1);