src/main/java/org/apache/tika/mime/MimeTypes.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17 package org.apache.tika.mime;
  18
// JDK imports
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
  29
/**
 * This class is a MimeType repository. It gathers a set of MimeTypes and
 * makes it possible to retrieve a content type from its name, from a file
 * name, or from a magic character sequence.
 * <p>
 * The MIME type detection methods that take an {@link InputStream} as
 * an argument will never read more than {@link #getMinLength()} bytes
 * from the stream. Also the given stream is never
 * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
 * or {@link InputStream#reset() reset} by the methods. Thus a client can
 * use the {@link InputStream#markSupported() mark feature} of the stream
 * (if available) to restore the stream back to the state it was before type
 * detection if it wants to process the stream based on the detected type.
 */
  44 public final class MimeTypes {
  45
  46     /** The default <code>application/octet-stream</code> MimeType */
  47     public final static String DEFAULT = "application/octet-stream";
  48
  49     private final MimeType root;
  50
  51     /** All the registered MimeTypes indexed on their name */
  52     private final Map<String, MimeType> types = new HashMap<String, MimeType>();
  53
  54     /** The patterns matcher */
  55     private Patterns patterns = new Patterns();
  56
  57     /** List of all registered magics */
  58     private SortedSet<Magic> magics = new TreeSet<Magic>();
  59
  60     /** List of all registered rootXML */
  61     private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
  62
  63     public MimeTypes() {
  64         root = new MimeType(this, DEFAULT);
  65         types.put(root.getName(), root);
  66     }
  67
  68     /**
  69      * Find the Mime Content Type of a file.
  70      *
  71      * @param file
  72      *            to analyze.
  73      * @return the Mime Content Type of the specified file, or <code>null</code>
  74      *         if none is found.
  75      */
  76     public MimeType getMimeType(File file) {
  77         return getMimeType(file.getName());
  78     }
  79
  80     /**
  81      * Find the Mime Content Type of a document from its URL.
  82      *
  83      * @param url
  84      *            of the document to analyze.
  85      * @return the Mime Content Type of the specified document URL, or
  86      *         <code>null</code> if none is found.
  87      */
  88     public MimeType getMimeType(URL url) {
  89         return getMimeType(url.getPath());
  90     }
  91
  92     /**
  93      * Find the Mime Content Type of a document from its name.
  94      *
  95      * @param name
  96      *            of the document to analyze.
  97      * @return the Mime Content Type of the specified document name
  98      */
  99     public MimeType getMimeType(String name) {
 100         MimeType type = patterns.matches(name);
 101         if (type != null) {
 102             return type;
 103         }
 104         type = patterns.matches(name.toLowerCase());
 105         if (type != null) {
 106             return type;
 107         } else {
 108             return root;
 109         }
 110     }
 111
 112     /**
 113      * Returns the MIME type that best matches the given first few bytes
 114      * of a document stream.
 115      * <p>
 116      * The given byte array is expected to be at least {@link #getMinLength()}
 117      * long, or shorter only if the document stream itself is shorter.
 118      *
 119      * @param data first few bytes of a document stream
 120      * @return matching MIME type, or <code>null</code> if no match is found
 121      */
 122     public MimeType getMimeType(byte[] data) {
 123         assert data != null;
 124
 125         // First, check for XML descriptions (level by level)
 126         for (MimeType type : xmls) {
 127             if (type.matchesXML(data)) {
 128                 return type;
 129             }
 130         }
 131
 132         // Then, check for magic bytes
 133         for (Magic magic : magics) {
 134             if (magic.eval(data)) {
 135                 return magic.getType();
 136             }
 137         }
 138
 139         return null;
 140     }
 141
 142     /**
 143      * Returns the MIME type that best matches the first few bytes of the
 144      * given document stream.
 145      *
 146      * @see #getMimeType(byte[])
 147      * @param stream document stream
 148      * @return matching MIME type, or <code>null</code> if no match is found
 149      * @throws IOException if the stream can be read
 150      */
 151     public MimeType getMimeType(InputStream stream) throws IOException {
 152         return getMimeType(readMagicHeader(stream));
 153     }
 154
 155     /**
 156      * Reads the first {@link #getMinLength()} bytes from the given stream.
 157      * If the stream is shorter, then the entire content of the stream is
 158      * returned.
 159      * <p>
 160      * The given stream is never {@link InputStream#close() closed},
 161      * {@link InputStream#mark(int) marked}, or
 162      * {@link InputStream#reset() reset} by this method.
 163      *
 164      * @param stream stream to be read
 165      * @return first {@link #getMinLength()} (or fewer) bytes of the stream
 166      * @throws IOException if the stream can not be read
 167      */
 168     private byte[] readMagicHeader(InputStream stream) throws IOException {
 169         assert stream != null;
 170
 171         byte[] bytes = new byte[getMinLength()];
 172         int totalRead = 0;
 173
 174         int lastRead = stream.read(bytes);
 175         while (lastRead != -1) {
 176             totalRead += lastRead;
 177             if (totalRead == bytes.length) {
 178                 return bytes;
 179             }
 180             lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
 181         }
 182
 183         byte[] shorter = new byte[totalRead];
 184         System.arraycopy(bytes, 0, shorter, 0, totalRead);
 185         return shorter;
 186     }
 187
 188     public String getType(String typeName, String url, byte[] data) {
 189         MimeType type = getMimeType(url, data);
 190
 191         if (type == null && typeName != null) {
 192             try {
 193                 type = forName(typeName);
 194             } catch (MimeTypeException e) {
 195                 // Invalid type name hint
 196             }
 197         }
 198
 199         if (type == null) {
 200             type = root;
 201         }
 202
 203         return type.getName();
 204     }
 205
 206     /**
 207      * Determines the MIME type of the resource pointed to by the specified URL.
 208      * Examines the file's header, and if it cannot determine the MIME type
 209      * from the header, guesses the MIME type from the URL extension
 210      * (e.g. "pdf).
 211      *
 212      * @param url
 213      * @return
 214      * @throws IOException
 215      */
 216     public String getType(URL url) throws IOException {
 217         InputStream stream = url.openStream();
 218         try {
 219             return getType(null, url.toString(), readMagicHeader(stream));
 220         } finally {
 221             stream.close();
 222         }
 223     }
 224
 225     /**
 226      * Find the Mime Content Type of a document from its name and its content.
 227      * The policy used to guess the Mime Content Type is:
 228      * <ol>
 229      * <li>Try to find the type based on the provided data.</li>
 230      * <li>If a type is found, then return it, otherwise try to find the type
 231      * based on the file name</li>
 232      * </ol>
 233      *
 234      * @param name
 235      *            of the document to analyze.
 236      * @param data
 237      *            are the first bytes of the document's content.
 238      * @return the Mime Content Type of the specified document, or
 239      *         <code>null</code> if none is found.
 240      * @see #getMinLength()
 241      */
 242     public MimeType getMimeType(String name, byte[] data) {
 243         // First, try to get the mime-type from the content
 244         MimeType mimeType = getMimeType(data);
 245
 246         // If no mime-type found, then try to get the mime-type from
 247         // the document name
 248         if (mimeType == null) {
 249             mimeType = getMimeType(name);
 250         }
 251
 252         return mimeType;
 253     }
 254
 255     /**
 256      * Returns the MIME type that best matches the given document name and
 257      * the first few bytes of the given document stream.
 258      *
 259      * @see #getMimeType(String, byte[])
 260      * @param name document name
 261      * @param stream document stream
 262      * @return matching MIME type, or <code>null</code> if no match is found
 263      * @throws IOException if the stream can not be read
 264      */
 265     public MimeType getMimeType(String name, InputStream stream)
 266             throws IOException {
 267         return getMimeType(name, readMagicHeader(stream));
 268     }
 269
 270     /**
 271      * Returns the registered media type with the given name (or alias).
 272      * The named media type is automatically registered (and returned) if
 273      * it doesn't already exist.
 274      *
 275      * @param name media type name (case-insensitive)
 276      * @return the registered media type with the given name or alias
 277      * @throws MimeTypeException if the given media type name is invalid
 278      */
 279     public synchronized MimeType forName(String name)
 280             throws MimeTypeException {
 281         if (MimeType.isValid(name)) {
 282             name = name.toLowerCase();
 283             MimeType type = types.get(name);
 284             if (type == null) {
 285                 type = new MimeType(this, name);
 286                 type.setSuperType(root);
 287                 types.put(name, type);
 288             }
 289             return type;
 290         } else {
 291             throw new MimeTypeException("Invalid media type name: " + name);
 292         }
 293     }
 294
 295     /**
 296      * Adds an alias for the given media type. This method should only
 297      * be called from {@link MimeType#addAlias(String)}.
 298      *
 299      * @param type media type
 300      * @param alias media type alias (normalized to lower case)
 301      * @throws MimeTypeException if the alias already exists
 302      */
 303     synchronized void addAlias(MimeType type, String alias)
 304             throws MimeTypeException {
 305         if (!types.containsKey(alias)) {
 306             types.put(alias, type);
 307         } else {
 308             throw new MimeTypeException(
 309                     "Media type alias already exists: " + alias);
 310         }
 311     }
 312
 313     /**
 314      * Adds a file name pattern for the given media type.
 315      *
 316      * @param type media type
 317      * @param pattern file name pattern
 318      * @throws MimeTypeException if the pattern conflicts with existing ones
 319      */
 320     public void addPattern(MimeType type, String pattern)
 321             throws MimeTypeException {
 322         patterns.add(pattern, type);
 323     }
 324
 325     /**
 326      * Return the minimum length of data to provide to analyzing methods based
 327      * on the document's content in order to check all the known MimeTypes.
 328      *
 329      * @return the minimum length of data to provide.
 330      * @see #getMimeType(byte[])
 331      * @see #getMimeType(String, byte[])
 332      */
 333     public int getMinLength() {
 334         return 1024;
 335         // return minLength;
 336     }
 337
 338     /**
 339      * Add the specified mime-type in the repository.
 340      *
 341      * @param type
 342      *            is the mime-type to add.
 343      */
 344     void add(MimeType type) {
 345         // Update the magics index...
 346         if (type.hasMagic()) {
 347             magics.addAll(Arrays.asList(type.getMagics()));
 348         }
 349
 350         // Update the xml (xmlRoot) index...
 351         if (type.hasRootXML()) {
 352             xmls.add(type);
 353         }
 354     }
 355
 356 }