/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org
.apache
.tika
.mime
;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
/**
 * This class is a MimeType repository. It gathers a set of MimeTypes and
 * makes it possible to retrieve a content type from its name, from a file
 * name, or from a magic character sequence.
 * <p>
 * The MIME type detection methods that take an {@link InputStream} as
 * an argument will never read more than {@link #getMinLength()} bytes
 * from the stream. Also the given stream is never
 * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
 * or {@link InputStream#reset() reset} by the methods. Thus a client can
 * use the {@link InputStream#markSupported() mark feature} of the stream
 * (if available) to restore the stream back to the state it was in before
 * type detection if it wants to process the stream based on the detected
 * type.
 */
44 public final class MimeTypes
{
46 /** The default <code>application/octet-stream</code> MimeType */
47 public final static String DEFAULT
= "application/octet-stream";
49 private final MimeType root
;
51 /** All the registered MimeTypes indexed on their name */
52 private final Map
<String
, MimeType
> types
= new HashMap
<String
, MimeType
>();
54 /** The patterns matcher */
55 private Patterns patterns
= new Patterns();
57 /** List of all registered magics */
58 private SortedSet
<Magic
> magics
= new TreeSet
<Magic
>();
60 /** List of all registered rootXML */
61 private SortedSet
<MimeType
> xmls
= new TreeSet
<MimeType
>();
/**
 * Creates a repository and registers the root {@link #DEFAULT} media type
 * in it under the root type's own name.
 */
// NOTE(review): the constructor header and closing brace were not present
// in the reviewed text and have been restored; confirm the visibility.
public MimeTypes() {
    root = new MimeType(this, DEFAULT);
    types.put(root.getName(), root);
}
69 * Find the Mime Content Type of a file.
73 * @return the Mime Content Type of the specified file, or <code>null</code>
76 public MimeType
getMimeType(File file
) {
77 return getMimeType(file
.getName());
81 * Find the Mime Content Type of a document from its URL.
84 * of the document to analyze.
85 * @return the Mime Content Type of the specified document URL, or
86 * <code>null</code> if none is found.
88 public MimeType
getMimeType(URL url
) {
89 return getMimeType(url
.getPath());
93 * Find the Mime Content Type of a document from its name.
96 * of the document to analyze.
97 * @return the Mime Content Type of the specified document name
99 public MimeType
getMimeType(String name
) {
100 MimeType type
= patterns
.matches(name
);
104 type
= patterns
.matches(name
.toLowerCase());
113 * Returns the MIME type that best matches the given first few bytes
114 * of a document stream.
116 * The given byte array is expected to be at least {@link #getMinLength()}
117 * long, or shorter only if the document stream itself is shorter.
119 * @param data first few bytes of a document stream
120 * @return matching MIME type, or <code>null</code> if no match is found
122 public MimeType
getMimeType(byte[] data
) {
125 // First, check for XML descriptions (level by level)
126 for (MimeType type
: xmls
) {
127 if (type
.matchesXML(data
)) {
132 // Then, check for magic bytes
133 for (Magic magic
: magics
) {
134 if (magic
.eval(data
)) {
135 return magic
.getType();
143 * Returns the MIME type that best matches the first few bytes of the
144 * given document stream.
146 * @see #getMimeType(byte[])
147 * @param stream document stream
148 * @return matching MIME type, or <code>null</code> if no match is found
149 * @throws IOException if the stream can be read
151 public MimeType
getMimeType(InputStream stream
) throws IOException
{
152 return getMimeType(readMagicHeader(stream
));
156 * Reads the first {@link #getMinLength()} bytes from the given stream.
157 * If the stream is shorter, then the entire content of the stream is
160 * The given stream is never {@link InputStream#close() closed},
161 * {@link InputStream#mark(int) marked}, or
162 * {@link InputStream#reset() reset} by this method.
164 * @param stream stream to be read
165 * @return first {@link #getMinLength()} (or fewer) bytes of the stream
166 * @throws IOException if the stream can not be read
168 private byte[] readMagicHeader(InputStream stream
) throws IOException
{
169 assert stream
!= null;
171 byte[] bytes
= new byte[getMinLength()];
174 int lastRead
= stream
.read(bytes
);
175 while (lastRead
!= -1) {
176 totalRead
+= lastRead
;
177 if (totalRead
== bytes
.length
) {
180 lastRead
= stream
.read(bytes
, totalRead
, bytes
.length
- totalRead
);
183 byte[] shorter
= new byte[totalRead
];
184 System
.arraycopy(bytes
, 0, shorter
, 0, totalRead
);
188 public String
getType(String typeName
, String url
, byte[] data
) {
189 MimeType type
= getMimeType(url
, data
);
191 if (type
== null && typeName
!= null) {
193 type
= forName(typeName
);
194 } catch (MimeTypeException e
) {
195 // Invalid type name hint
203 return type
.getName();
207 * Determines the MIME type of the resource pointed to by the specified URL.
208 * Examines the file's header, and if it cannot determine the MIME type
209 * from the header, guesses the MIME type from the URL extension
214 * @throws IOException
216 public String
getType(URL url
) throws IOException
{
217 InputStream stream
= url
.openStream();
219 return getType(null, url
.toString(), readMagicHeader(stream
));
226 * Find the Mime Content Type of a document from its name and its content.
227 * The policy used to guess the Mime Content Type is:
229 * <li>Try to find the type based on the provided data.</li>
230 * <li>If a type is found, then return it, otherwise try to find the type
231 * based on the file name</li>
235 * of the document to analyze.
237 * are the first bytes of the document's content.
238 * @return the Mime Content Type of the specified document, or
239 * <code>null</code> if none is found.
240 * @see #getMinLength()
242 public MimeType
getMimeType(String name
, byte[] data
) {
243 // First, try to get the mime-type from the content
244 MimeType mimeType
= getMimeType(data
);
246 // If no mime-type found, then try to get the mime-type from
248 if (mimeType
== null) {
249 mimeType
= getMimeType(name
);
256 * Returns the MIME type that best matches the given document name and
257 * the first few bytes of the given document stream.
259 * @see #getMimeType(String, byte[])
260 * @param name document name
261 * @param stream document stream
262 * @return matching MIME type, or <code>null</code> if no match is found
263 * @throws IOException if the stream can not be read
265 public MimeType
getMimeType(String name
, InputStream stream
)
267 return getMimeType(name
, readMagicHeader(stream
));
271 * Returns the registered media type with the given name (or alias).
272 * The named media type is automatically registered (and returned) if
273 * it doesn't already exist.
275 * @param name media type name (case-insensitive)
276 * @return the registered media type with the given name or alias
277 * @throws MimeTypeException if the given media type name is invalid
279 public synchronized MimeType
forName(String name
)
280 throws MimeTypeException
{
281 if (MimeType
.isValid(name
)) {
282 name
= name
.toLowerCase();
283 MimeType type
= types
.get(name
);
285 type
= new MimeType(this, name
);
286 type
.setSuperType(root
);
287 types
.put(name
, type
);
291 throw new MimeTypeException("Invalid media type name: " + name
);
296 * Adds an alias for the given media type. This method should only
297 * be called from {@link MimeType#addAlias(String)}.
299 * @param type media type
300 * @param alias media type alias (normalized to lower case)
301 * @throws MimeTypeException if the alias already exists
303 synchronized void addAlias(MimeType type
, String alias
)
304 throws MimeTypeException
{
305 if (!types
.containsKey(alias
)) {
306 types
.put(alias
, type
);
308 throw new MimeTypeException(
309 "Media type alias already exists: " + alias
);
314 * Adds a file name pattern for the given media type.
316 * @param type media type
317 * @param pattern file name pattern
318 * @throws MimeTypeException if the pattern conflicts with existing ones
320 public void addPattern(MimeType type
, String pattern
)
321 throws MimeTypeException
{
322 patterns
.add(pattern
, type
);
/**
 * Returns the minimum length of data to provide to the analyzing methods
 * that are based on the document's content, in order to check all the
 * known MimeTypes.
 *
 * @return the minimum length of data to provide.
 * @see #getMimeType(byte[])
 * @see #getMimeType(String, byte[])
 */
333 public int getMinLength() {
339 * Add the specified mime-type in the repository.
342 * is the mime-type to add.
344 void add(MimeType type
) {
345 // Update the magics index...
346 if (type
.hasMagic()) {
347 magics
.addAll(Arrays
.asList(type
.getMagics()));
350 // Update the xml (xmlRoot) index...
351 if (type
.hasRootXML()) {