Lib/mimetypes.py

   1 """Guess the MIME type of a file.
   2
   3 This module defines two useful functions:
   4
   5 guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
   6
   7 guess_extension(type, strict=1) -- guess the extension for a given MIME type.
   8
   9 It also contains the following, for tuning the behavior:
  10
  11 Data:
  12
  13 knownfiles -- list of files to parse
  14 inited -- flag set when init() has been called
  15 suffix_map -- dictionary mapping suffixes to suffixes
  16 encodings_map -- dictionary mapping suffixes to encodings
  17 types_map -- dictionary mapping suffixes to types
  18
  19 Functions:
  20
  21 init([files]) -- parse a list of files, default knownfiles
  22 read_mime_types(file) -- parse one file, return a dictionary or None
  23 """
  24
  25 import os
  26 import posixpath
  27 import urllib
  28
  29 __all__ = [
  30     "guess_type","guess_extension","guess_all_extensions",
  31     "add_type","read_mime_types","init"
  32 ]
  33
  34 knownfiles = [
  35     "/etc/mime.types",
  36     "/usr/local/etc/httpd/conf/mime.types",
  37     "/usr/local/lib/netscape/mime.types",
  38     "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
  39     "/usr/local/etc/mime.types",                # Apache 1.3
  40     ]
  41
  42 inited = False
  43
  44
  45 class MimeTypes:
  46     """MIME-types datastore.
  47
  48     This datastore can handle information from mime.types-style files
  49     and supports basic determination of MIME type from a filename or
  50     URL, and can guess a reasonable extension given a MIME type.
  51     """
  52
  53     def __init__(self, filenames=(), strict=True):
  54         if not inited:
  55             init()
  56         self.encodings_map = encodings_map.copy()
  57         self.suffix_map = suffix_map.copy()
  58         self.types_map = ({}, {}) # dict for (non-strict, strict)
  59         self.types_map_inv = ({}, {})
  60         for (ext, type) in types_map.items():
  61             self.add_type(type, ext, True)
  62         for (ext, type) in common_types.items():
  63             self.add_type(type, ext, False)
  64         for name in filenames:
  65             self.read(name, strict)
  66
  67     def add_type(self, type, ext, strict=True):
  68         """Add a mapping between a type and an extension.
  69
  70         When the extension is already known, the new
  71         type will replace the old one. When the type
  72         is already known the extension will be added
  73         to the list of known extensions.
  74
  75         If strict is true, information will be added to
  76         list of standard types, else to the list of non-standard
  77         types.
  78         """
  79         self.types_map[strict][ext] = type
  80         exts = self.types_map_inv[strict].setdefault(type, [])
  81         if ext not in exts:
  82             exts.append(ext)
  83
  84     def guess_type(self, url, strict=True):
  85         """Guess the type of a file based on its URL.
  86
  87         Return value is a tuple (type, encoding) where type is None if
  88         the type can't be guessed (no or unknown suffix) or a string
  89         of the form type/subtype, usable for a MIME Content-type
  90         header; and encoding is None for no encoding or the name of
  91         the program used to encode (e.g. compress or gzip).  The
  92         mappings are table driven.  Encoding suffixes are case
  93         sensitive; type suffixes are first tried case sensitive, then
  94         case insensitive.
  95
  96         The suffixes .tgz, .taz and .tz (case sensitive!) are all
  97         mapped to '.tar.gz'.  (This is table-driven too, using the
  98         dictionary suffix_map.)
  99
 100         Optional `strict' argument when False adds a bunch of commonly found,
 101         but non-standard types.
 102         """
 103         scheme, url = urllib.splittype(url)
 104         if scheme == 'data':
 105             # syntax of data URLs:
 106             # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
 107             # mediatype := [ type "/" subtype ] *( ";" parameter )
 108             # data      := *urlchar
 109             # parameter := attribute "=" value
 110             # type/subtype defaults to "text/plain"
 111             comma = url.find(',')
 112             if comma < 0:
 113                 # bad data URL
 114                 return None, None
 115             semi = url.find(';', 0, comma)
 116             if semi >= 0:
 117                 type = url[:semi]
 118             else:
 119                 type = url[:comma]
 120             if '=' in type or '/' not in type:
 121                 type = 'text/plain'
 122             return type, None           # never compressed, so encoding is None
 123         base, ext = posixpath.splitext(url)
 124         while ext in self.suffix_map:
 125             base, ext = posixpath.splitext(base + self.suffix_map[ext])
 126         if ext in self.encodings_map:
 127             encoding = self.encodings_map[ext]
 128             base, ext = posixpath.splitext(base)
 129         else:
 130             encoding = None
 131         types_map = self.types_map[True]
 132         if ext in types_map:
 133             return types_map[ext], encoding
 134         elif ext.lower() in types_map:
 135             return types_map[ext.lower()], encoding
 136         elif strict:
 137             return None, encoding
 138         types_map = self.types_map[False]
 139         if ext in types_map:
 140             return types_map[ext], encoding
 141         elif ext.lower() in types_map:
 142             return types_map[ext.lower()], encoding
 143         else:
 144             return None, encoding
 145
 146     def guess_all_extensions(self, type, strict=True):
 147         """Guess the extensions for a file based on its MIME type.
 148
 149         Return value is a list of strings giving the possible filename
 150         extensions, including the leading dot ('.').  The extension is not
 151         guaranteed to have been associated with any particular data stream,
 152         but would be mapped to the MIME type `type' by guess_type().
 153
 154         Optional `strict' argument when false adds a bunch of commonly found,
 155         but non-standard types.
 156         """
 157         type = type.lower()
 158         extensions = self.types_map_inv[True].get(type, [])
 159         if not strict:
 160             for ext in self.types_map_inv[False].get(type, []):
 161                 if ext not in extensions:
 162                     extensions.append(ext)
 163         return extensions
 164
 165     def guess_extension(self, type, strict=True):
 166         """Guess the extension for a file based on its MIME type.
 167
 168         Return value is a string giving a filename extension,
 169         including the leading dot ('.').  The extension is not
 170         guaranteed to have been associated with any particular data
 171         stream, but would be mapped to the MIME type `type' by
 172         guess_type().  If no extension can be guessed for `type', None
 173         is returned.
 174
 175         Optional `strict' argument when false adds a bunch of commonly found,
 176         but non-standard types.
 177         """
 178         extensions = self.guess_all_extensions(type, strict)
 179         if not extensions:
 180             return None
 181         return extensions[0]
 182
 183     def read(self, filename, strict=True):
 184         """
 185         Read a single mime.types-format file, specified by pathname.
 186
 187         If strict is true, information will be added to
 188         list of standard types, else to the list of non-standard
 189         types.
 190         """
 191         fp = open(filename)
 192         self.readfp(fp, strict)
 193         fp.close()
 194
 195     def readfp(self, fp, strict=True):
 196         """
 197         Read a single mime.types-format file.
 198
 199         If strict is true, information will be added to
 200         list of standard types, else to the list of non-standard
 201         types.
 202         """
 203         while 1:
 204             line = fp.readline()
 205             if not line:
 206                 break
 207             words = line.split()
 208             for i in range(len(words)):
 209                 if words[i][0] == '#':
 210                     del words[i:]
 211                     break
 212             if not words:
 213                 continue
 214             type, suffixes = words[0], words[1:]
 215             for suff in suffixes:
 216                 self.add_type(type, '.' + suff, strict)
 217
 218 def guess_type(url, strict=True):
 219     """Guess the type of a file based on its URL.
 220
 221     Return value is a tuple (type, encoding) where type is None if the
 222     type can't be guessed (no or unknown suffix) or a string of the
 223     form type/subtype, usable for a MIME Content-type header; and
 224     encoding is None for no encoding or the name of the program used
 225     to encode (e.g. compress or gzip).  The mappings are table
 226     driven.  Encoding suffixes are case sensitive; type suffixes are
 227     first tried case sensitive, then case insensitive.
 228
 229     The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
 230     to ".tar.gz".  (This is table-driven too, using the dictionary
 231     suffix_map).
 232
 233     Optional `strict' argument when false adds a bunch of commonly found, but
 234     non-standard types.
 235     """
 236     init()
 237     return guess_type(url, strict)
 238
 239
 240 def guess_all_extensions(type, strict=True):
 241     """Guess the extensions for a file based on its MIME type.
 242
 243     Return value is a list of strings giving the possible filename
 244     extensions, including the leading dot ('.').  The extension is not
 245     guaranteed to have been associated with any particular data
 246     stream, but would be mapped to the MIME type `type' by
 247     guess_type().  If no extension can be guessed for `type', None
 248     is returned.
 249
 250     Optional `strict' argument when false adds a bunch of commonly found,
 251     but non-standard types.
 252     """
 253     init()
 254     return guess_all_extensions(type, strict)
 255
 256 def guess_extension(type, strict=True):
 257     """Guess the extension for a file based on its MIME type.
 258
 259     Return value is a string giving a filename extension, including the
 260     leading dot ('.').  The extension is not guaranteed to have been
 261     associated with any particular data stream, but would be mapped to the
 262     MIME type `type' by guess_type().  If no extension can be guessed for
 263     `type', None is returned.
 264
 265     Optional `strict' argument when false adds a bunch of commonly found,
 266     but non-standard types.
 267     """
 268     init()
 269     return guess_extension(type, strict)
 270
 271 def add_type(type, ext, strict=True):
 272     """Add a mapping between a type and an extension.
 273
 274     When the extension is already known, the new
 275     type will replace the old one. When the type
 276     is already known the extension will be added
 277     to the list of known extensions.
 278
 279     If strict is true, information will be added to
 280     list of standard types, else to the list of non-standard
 281     types.
 282     """
 283     init()
 284     return add_type(type, ext, strict)
 285
 286
 287 def init(files=None):
 288     global guess_all_extensions, guess_extension, guess_type
 289     global suffix_map, types_map, encodings_map, common_types
 290     global add_type, inited
 291     inited = True
 292     db = MimeTypes()
 293     if files is None:
 294         files = knownfiles
 295     for file in files:
 296         if os.path.isfile(file):
 297             db.readfp(open(file))
 298     encodings_map = db.encodings_map
 299     suffix_map = db.suffix_map
 300     types_map = db.types_map[True]
 301     guess_all_extensions = db.guess_all_extensions
 302     guess_extension = db.guess_extension
 303     guess_type = db.guess_type
 304     add_type = db.add_type
 305     common_types = db.types_map[False]
 306
 307
 308 def read_mime_types(file):
 309     try:
 310         f = open(file)
 311     except IOError:
 312         return None
 313     db = MimeTypes()
 314     db.readfp(f, True)
 315     return db.types_map[True]
 316
 317
 318 suffix_map = {
 319     '.tgz': '.tar.gz',
 320     '.taz': '.tar.gz',
 321     '.tz': '.tar.gz',
 322     }
 323
 324 encodings_map = {
 325     '.gz': 'gzip',
 326     '.Z': 'compress',
 327     }
 328
 329 # Before adding new types, make sure they are either registered with IANA, at
 330 # http://www.isi.edu/in-notes/iana/assignments/media-types
 331 # or extensions, i.e. using the x- prefix
 332
 333 # If you add to these, please keep them sorted!
 334 types_map = {
 335     '.a'      : 'application/octet-stream',
 336     '.ai'     : 'application/postscript',
 337     '.aif'    : 'audio/x-aiff',
 338     '.aifc'   : 'audio/x-aiff',
 339     '.aiff'   : 'audio/x-aiff',
 340     '.au'     : 'audio/basic',
 341     '.avi'    : 'video/x-msvideo',
 342     '.bat'    : 'text/plain',
 343     '.bcpio'  : 'application/x-bcpio',
 344     '.bin'    : 'application/octet-stream',
 345     '.bmp'    : 'image/x-ms-bmp',
 346     '.c'      : 'text/plain',
 347     # Duplicates :(
 348     '.cdf'    : 'application/x-cdf',
 349     '.cdf'    : 'application/x-netcdf',
 350     '.cpio'   : 'application/x-cpio',
 351     '.csh'    : 'application/x-csh',
 352     '.css'    : 'text/css',
 353     '.dll'    : 'application/octet-stream',
 354     '.doc'    : 'application/msword',
 355     '.dot'    : 'application/msword',
 356     '.dvi'    : 'application/x-dvi',
 357     '.eml'    : 'message/rfc822',
 358     '.eps'    : 'application/postscript',
 359     '.etx'    : 'text/x-setext',
 360     '.exe'    : 'application/octet-stream',
 361     '.gif'    : 'image/gif',
 362     '.gtar'   : 'application/x-gtar',
 363     '.h'      : 'text/plain',
 364     '.hdf'    : 'application/x-hdf',
 365     '.htm'    : 'text/html',
 366     '.html'   : 'text/html',
 367     '.ief'    : 'image/ief',
 368     '.jpe'    : 'image/jpeg',
 369     '.jpeg'   : 'image/jpeg',
 370     '.jpg'    : 'image/jpeg',
 371     '.js'     : 'application/x-javascript',
 372     '.ksh'    : 'text/plain',
 373     '.latex'  : 'application/x-latex',
 374     '.m1v'    : 'video/mpeg',
 375     '.man'    : 'application/x-troff-man',
 376     '.me'     : 'application/x-troff-me',
 377     '.mht'    : 'message/rfc822',
 378     '.mhtml'  : 'message/rfc822',
 379     '.mif'    : 'application/x-mif',
 380     '.mov'    : 'video/quicktime',
 381     '.movie'  : 'video/x-sgi-movie',
 382     '.mp2'    : 'audio/mpeg',
 383     '.mp3'    : 'audio/mpeg',
 384     '.mpa'    : 'video/mpeg',
 385     '.mpe'    : 'video/mpeg',
 386     '.mpeg'   : 'video/mpeg',
 387     '.mpg'    : 'video/mpeg',
 388     '.ms'     : 'application/x-troff-ms',
 389     '.nc'     : 'application/x-netcdf',
 390     '.nws'    : 'message/rfc822',
 391     '.o'      : 'application/octet-stream',
 392     '.obj'    : 'application/octet-stream',
 393     '.oda'    : 'application/oda',
 394     '.p12'    : 'application/x-pkcs12',
 395     '.p7c'    : 'application/pkcs7-mime',
 396     '.pbm'    : 'image/x-portable-bitmap',
 397     '.pdf'    : 'application/pdf',
 398     '.pfx'    : 'application/x-pkcs12',
 399     '.pgm'    : 'image/x-portable-graymap',
 400     '.pl'     : 'text/plain',
 401     '.png'    : 'image/png',
 402     '.pnm'    : 'image/x-portable-anymap',
 403     '.pot'    : 'application/vnd.ms-powerpoint',
 404     '.ppa'    : 'application/vnd.ms-powerpoint',
 405     '.ppm'    : 'image/x-portable-pixmap',
 406     '.pps'    : 'application/vnd.ms-powerpoint',
 407     '.ppt'    : 'application/vnd.ms-powerpoint',
 408     '.ps'     : 'application/postscript',
 409     '.pwz'    : 'application/vnd.ms-powerpoint',
 410     '.py'     : 'text/x-python',
 411     '.pyc'    : 'application/x-python-code',
 412     '.pyo'    : 'application/x-python-code',
 413     '.qt'     : 'video/quicktime',
 414     '.ra'     : 'audio/x-pn-realaudio',
 415     '.ram'    : 'application/x-pn-realaudio',
 416     '.ras'    : 'image/x-cmu-raster',
 417     '.rdf'    : 'application/xml',
 418     '.rgb'    : 'image/x-rgb',
 419     '.roff'   : 'application/x-troff',
 420     '.rtx'    : 'text/richtext',
 421     '.sgm'    : 'text/x-sgml',
 422     '.sgml'   : 'text/x-sgml',
 423     '.sh'     : 'application/x-sh',
 424     '.shar'   : 'application/x-shar',
 425     '.snd'    : 'audio/basic',
 426     '.so'     : 'application/octet-stream',
 427     '.src'    : 'application/x-wais-source',
 428     '.sv4cpio': 'application/x-sv4cpio',
 429     '.sv4crc' : 'application/x-sv4crc',
 430     '.swf'    : 'application/x-shockwave-flash',
 431     '.t'      : 'application/x-troff',
 432     '.tar'    : 'application/x-tar',
 433     '.tcl'    : 'application/x-tcl',
 434     '.tex'    : 'application/x-tex',
 435     '.texi'   : 'application/x-texinfo',
 436     '.texinfo': 'application/x-texinfo',
 437     '.tif'    : 'image/tiff',
 438     '.tiff'   : 'image/tiff',
 439     '.tr'     : 'application/x-troff',
 440     '.tsv'    : 'text/tab-separated-values',
 441     '.txt'    : 'text/plain',
 442     '.ustar'  : 'application/x-ustar',
 443     '.vcf'    : 'text/x-vcard',
 444     '.wav'    : 'audio/x-wav',
 445     '.wiz'    : 'application/msword',
 446     '.wsdl'   : 'application/xml',
 447     '.xbm'    : 'image/x-xbitmap',
 448     '.xlb'    : 'application/vnd.ms-excel',
 449     # Duplicates :(
 450     '.xls'    : 'application/excel',
 451     '.xls'    : 'application/vnd.ms-excel',
 452     '.xml'    : 'text/xml',
 453     '.xpdl'   : 'application/xml',
 454     '.xpm'    : 'image/x-xpixmap',
 455     '.xsl'    : 'application/xml',
 456     '.xwd'    : 'image/x-xwindowdump',
 457     '.zip'    : 'application/zip',
 458     }
 459
 460 # These are non-standard types, commonly found in the wild.  They will only
 461 # match if strict=0 flag is given to the API methods.
 462
 463 # Please sort these too
 464 common_types = {
 465     '.jpg' : 'image/jpg',
 466     '.mid' : 'audio/midi',
 467     '.midi': 'audio/midi',
 468     '.pct' : 'image/pict',
 469     '.pic' : 'image/pict',
 470     '.pict': 'image/pict',
 471     '.rtf' : 'application/rtf',
 472     '.xul' : 'text/xul'
 473     }
 474
 475
 476 if __name__ == '__main__':
 477     import sys
 478     import getopt
 479
 480     USAGE = """\
 481 Usage: mimetypes.py [options] type
 482
 483 Options:
 484     --help / -h       -- print this message and exit
 485     --lenient / -l    -- additionally search of some common, but non-standard
 486                          types.
 487     --extension / -e  -- guess extension instead of type
 488
 489 More than one type argument may be given.
 490 """
 491
 492     def usage(code, msg=''):
 493         print USAGE
 494         if msg: print msg
 495         sys.exit(code)
 496
 497     try:
 498         opts, args = getopt.getopt(sys.argv[1:], 'hle',
 499                                    ['help', 'lenient', 'extension'])
 500     except getopt.error, msg:
 501         usage(1, msg)
 502
 503     strict = 1
 504     extension = 0
 505     for opt, arg in opts:
 506         if opt in ('-h', '--help'):
 507             usage(0)
 508         elif opt in ('-l', '--lenient'):
 509             strict = 0
 510         elif opt in ('-e', '--extension'):
 511             extension = 1
 512     for gtype in args:
 513         if extension:
 514             guess = guess_extension(gtype, strict)
 515             if not guess: print "I don't know anything about type", gtype
 516             else: print guess
 517         else:
 518             guess, encoding = guess_type(gtype, strict)
 519             if not guess: print "I don't know anything about type", gtype
 520             else: print 'type:', guess, 'encoding:', encoding