TIKA-87 - MimeTypes should allow modification of MIME types
[tika.git] / src / main / java / org / apache / tika / mime / MimeTypes.java
blob9048d4d198632b9a4cdf0c95b705d1b737e78bfd
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.mime;
19 // JDK imports
20 import java.io.File;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.net.URL;
24 import java.util.Arrays;
25 import java.util.Map;
26 import java.util.HashMap;
27 import java.util.SortedSet;
28 import java.util.TreeSet;
30 /**
31 * This class is a MimeType repository. It gathers a set of MimeTypes and
32 * makes it possible to retrieve a content type by its name, from a file
33 * name, or from a magic character sequence.
34 * <p>
35 * The MIME type detection methods that take an {@link InputStream} as
36 * an argument will never read more than {@link #getMinLength()} bytes
37 * from the stream. Also the given stream is never
38 * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
39 * or {@link InputStream#reset() reset} by the methods. Thus a client can
40 * use the {@link InputStream#markSupported() mark feature} of the stream
41 * (if available) to restore the stream back to the state it was in before
42 * type detection if it wants to process the stream based on the detected type.
44 public final class MimeTypes {
46 /** The default <code>application/octet-stream</code> MimeType */
47 public final static String DEFAULT = "application/octet-stream";
49 private final MimeType root;
51 /** All the registered MimeTypes indexed on their name */
52 private final Map<String, MimeType> types = new HashMap<String, MimeType>();
54 /** The patterns matcher */
55 private Patterns patterns = new Patterns();
57 /** List of all registered magics */
58 private SortedSet<Magic> magics = new TreeSet<Magic>();
60 /** List of all registered rootXML */
61 private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
/**
 * Creates the repository and registers the {@link #DEFAULT} type as its
 * root type.
 */
public MimeTypes() {
    MimeType fallback = new MimeType(this, DEFAULT);
    root = fallback;
    types.put(fallback.getName(), fallback);
}
68 /**
69 * Find the Mime Content Type of a file.
71 * @param file
72 * to analyze.
73 * @return the Mime Content Type of the specified file, or <code>null</code>
74 * if none is found.
76 public MimeType getMimeType(File file) {
77 return getMimeType(file.getName());
80 /**
81 * Find the Mime Content Type of a document from its URL.
83 * @param url
84 * of the document to analyze.
85 * @return the Mime Content Type of the specified document URL, or
86 * <code>null</code> if none is found.
88 public MimeType getMimeType(URL url) {
89 return getMimeType(url.getPath());
92 /**
93 * Find the Mime Content Type of a document from its name.
95 * @param name
96 * of the document to analyze.
97 * @return the Mime Content Type of the specified document name
99 public MimeType getMimeType(String name) {
100 MimeType type = patterns.matches(name);
101 if (type != null) {
102 return type;
104 type = patterns.matches(name.toLowerCase());
105 if (type != null) {
106 return type;
107 } else {
108 return root;
113 * Returns the MIME type that best matches the given first few bytes
114 * of a document stream.
115 * <p>
116 * The given byte array is expected to be at least {@link #getMinLength()}
117 * long, or shorter only if the document stream itself is shorter.
119 * @param data first few bytes of a document stream
120 * @return matching MIME type, or <code>null</code> if no match is found
122 public MimeType getMimeType(byte[] data) {
123 assert data != null;
125 // First, check for XML descriptions (level by level)
126 for (MimeType type : xmls) {
127 if (type.matchesXML(data)) {
128 return type;
132 // Then, check for magic bytes
133 for (Magic magic : magics) {
134 if (magic.eval(data)) {
135 return magic.getType();
139 return null;
143 * Returns the MIME type that best matches the first few bytes of the
144 * given document stream.
146 * @see #getMimeType(byte[])
147 * @param stream document stream
148 * @return matching MIME type, or <code>null</code> if no match is found
149 * @throws IOException if the stream can be read
151 public MimeType getMimeType(InputStream stream) throws IOException {
152 return getMimeType(readMagicHeader(stream));
156 * Reads the first {@link #getMinLength()} bytes from the given stream.
157 * If the stream is shorter, then the entire content of the stream is
158 * returned.
159 * <p>
160 * The given stream is never {@link InputStream#close() closed},
161 * {@link InputStream#mark(int) marked}, or
162 * {@link InputStream#reset() reset} by this method.
164 * @param stream stream to be read
165 * @return first {@link #getMinLength()} (or fewer) bytes of the stream
166 * @throws IOException if the stream can not be read
168 private byte[] readMagicHeader(InputStream stream) throws IOException {
169 assert stream != null;
171 byte[] bytes = new byte[getMinLength()];
172 int totalRead = 0;
174 int lastRead = stream.read(bytes);
175 while (lastRead != -1) {
176 totalRead += lastRead;
177 if (totalRead == bytes.length) {
178 return bytes;
180 lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
183 byte[] shorter = new byte[totalRead];
184 System.arraycopy(bytes, 0, shorter, 0, totalRead);
185 return shorter;
188 public String getType(String typeName, String url, byte[] data) {
189 MimeType type = getMimeType(url, data);
191 if (type == null && typeName != null) {
192 try {
193 type = forName(typeName);
194 } catch (MimeTypeException e) {
195 // Invalid type name hint
199 if (type == null) {
200 type = root;
203 return type.getName();
207 * Determines the MIME type of the resource pointed to by the specified URL.
208 * Examines the file's header, and if it cannot determine the MIME type
209 * from the header, guesses the MIME type from the URL extension
210 * (e.g. "pdf).
212 * @param url
213 * @return
214 * @throws IOException
216 public String getType(URL url) throws IOException {
217 InputStream stream = url.openStream();
218 try {
219 return getType(null, url.toString(), readMagicHeader(stream));
220 } finally {
221 stream.close();
226 * Find the Mime Content Type of a document from its name and its content.
227 * The policy used to guess the Mime Content Type is:
228 * <ol>
229 * <li>Try to find the type based on the provided data.</li>
230 * <li>If a type is found, then return it, otherwise try to find the type
231 * based on the file name</li>
232 * </ol>
234 * @param name
235 * of the document to analyze.
236 * @param data
237 * are the first bytes of the document's content.
238 * @return the Mime Content Type of the specified document, or
239 * <code>null</code> if none is found.
240 * @see #getMinLength()
242 public MimeType getMimeType(String name, byte[] data) {
243 // First, try to get the mime-type from the content
244 MimeType mimeType = getMimeType(data);
246 // If no mime-type found, then try to get the mime-type from
247 // the document name
248 if (mimeType == null) {
249 mimeType = getMimeType(name);
252 return mimeType;
256 * Returns the MIME type that best matches the given document name and
257 * the first few bytes of the given document stream.
259 * @see #getMimeType(String, byte[])
260 * @param name document name
261 * @param stream document stream
262 * @return matching MIME type, or <code>null</code> if no match is found
263 * @throws IOException if the stream can not be read
265 public MimeType getMimeType(String name, InputStream stream)
266 throws IOException {
267 return getMimeType(name, readMagicHeader(stream));
271 * Returns the registered media type with the given name (or alias).
272 * The named media type is automatically registered (and returned) if
273 * it doesn't already exist.
275 * @param name media type name (case-insensitive)
276 * @return the registered media type with the given name or alias
277 * @throws MimeTypeException if the given media type name is invalid
279 public synchronized MimeType forName(String name)
280 throws MimeTypeException {
281 if (MimeType.isValid(name)) {
282 name = name.toLowerCase();
283 MimeType type = types.get(name);
284 if (type == null) {
285 type = new MimeType(this, name);
286 type.setSuperType(root);
287 types.put(name, type);
289 return type;
290 } else {
291 throw new MimeTypeException("Invalid media type name: " + name);
296 * Adds an alias for the given media type. This method should only
297 * be called from {@link MimeType#addAlias(String)}.
299 * @param type media type
300 * @param alias media type alias (normalized to lower case)
301 * @throws MimeTypeException if the alias already exists
303 synchronized void addAlias(MimeType type, String alias)
304 throws MimeTypeException {
305 if (!types.containsKey(alias)) {
306 types.put(alias, type);
307 } else {
308 throw new MimeTypeException(
309 "Media type alias already exists: " + alias);
314 * Adds a file name pattern for the given media type.
316 * @param type media type
317 * @param pattern file name pattern
318 * @throws MimeTypeException if the pattern conflicts with existing ones
320 public void addPattern(MimeType type, String pattern)
321 throws MimeTypeException {
322 patterns.add(pattern, type);
326 * Return the minimum length of data to provide to analyzing methods based
327 * on the document's content in order to check all the known MimeTypes.
329 * @return the minimum length of data to provide.
330 * @see #getMimeType(byte[])
331 * @see #getMimeType(String, byte[])
333 public int getMinLength() {
334 return 1024;
335 // return minLength;
339 * Add the specified mime-type in the repository.
341 * @param type
342 * is the mime-type to add.
344 void add(MimeType type) {
345 // Update the magics index...
346 if (type.hasMagic()) {
347 magics.addAll(Arrays.asList(type.getMagics()));
350 // Update the xml (xmlRoot) index...
351 if (type.hasRootXML()) {
352 xmls.add(type);