Lib/mimetypes.py

   1 """Guess the MIME type of a file.
   2
   3 This module defines two useful functions:
   4
   5 guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
   6
   7 guess_extension(type, strict=1) -- guess the extension for a given MIME type.
   8
   9 It also contains the following, for tuning the behavior:
  10
  11 Data:
  12
  13 knownfiles -- list of files to parse
  14 inited -- flag set when init() has been called
  15 suffix_map -- dictionary mapping suffixes to suffixes
  16 encodings_map -- dictionary mapping suffixes to encodings
  17 types_map -- dictionary mapping suffixes to types
  18
  19 Functions:
  20
  21 init([files]) -- parse a list of files, default knownfiles
  22 read_mime_types(file) -- parse one file, return a dictionary or None
  23 """
  24
  25 import os
  26 import posixpath
  27 import urllib
  28
  29 __all__ = [
  30     "guess_type","guess_extension","guess_all_extensions",
  31     "add_type","read_mime_types","init"
  32 ]
  33
  34 knownfiles = [
  35     "/etc/mime.types",
  36     "/etc/httpd/mime.types",                    # Mac OS X
  37     "/etc/httpd/conf/mime.types",               # Apache
  38     "/etc/apache/mime.types",                   # Apache 1
  39     "/etc/apache2/mime.types",                  # Apache 2
  40     "/usr/local/etc/httpd/conf/mime.types",
  41     "/usr/local/lib/netscape/mime.types",
  42     "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
  43     "/usr/local/etc/mime.types",                # Apache 1.3
  44     ]
  45
  46 inited = False
  47 _db = None
  48
  49
  50 class MimeTypes:
  51     """MIME-types datastore.
  52
  53     This datastore can handle information from mime.types-style files
  54     and supports basic determination of MIME type from a filename or
  55     URL, and can guess a reasonable extension given a MIME type.
  56     """
  57
  58     def __init__(self, filenames=(), strict=True):
  59         if not inited:
  60             init()
  61         self.encodings_map = encodings_map.copy()
  62         self.suffix_map = suffix_map.copy()
  63         self.types_map = ({}, {}) # dict for (non-strict, strict)
  64         self.types_map_inv = ({}, {})
  65         for (ext, type) in types_map.items():
  66             self.add_type(type, ext, True)
  67         for (ext, type) in common_types.items():
  68             self.add_type(type, ext, False)
  69         for name in filenames:
  70             self.read(name, strict)
  71
  72     def add_type(self, type, ext, strict=True):
  73         """Add a mapping between a type and an extension.
  74
  75         When the extension is already known, the new
  76         type will replace the old one. When the type
  77         is already known the extension will be added
  78         to the list of known extensions.
  79
  80         If strict is true, information will be added to
  81         list of standard types, else to the list of non-standard
  82         types.
  83         """
  84         self.types_map[strict][ext] = type
  85         exts = self.types_map_inv[strict].setdefault(type, [])
  86         if ext not in exts:
  87             exts.append(ext)
  88
  89     def guess_type(self, url, strict=True):
  90         """Guess the type of a file based on its URL.
  91
  92         Return value is a tuple (type, encoding) where type is None if
  93         the type can't be guessed (no or unknown suffix) or a string
  94         of the form type/subtype, usable for a MIME Content-type
  95         header; and encoding is None for no encoding or the name of
  96         the program used to encode (e.g. compress or gzip).  The
  97         mappings are table driven.  Encoding suffixes are case
  98         sensitive; type suffixes are first tried case sensitive, then
  99         case insensitive.
 100
 101         The suffixes .tgz, .taz and .tz (case sensitive!) are all
 102         mapped to '.tar.gz'.  (This is table-driven too, using the
 103         dictionary suffix_map.)
 104
 105         Optional `strict' argument when False adds a bunch of commonly found,
 106         but non-standard types.
 107         """
 108         scheme, url = urllib.splittype(url)
 109         if scheme == 'data':
 110             # syntax of data URLs:
 111             # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
 112             # mediatype := [ type "/" subtype ] *( ";" parameter )
 113             # data      := *urlchar
 114             # parameter := attribute "=" value
 115             # type/subtype defaults to "text/plain"
 116             comma = url.find(',')
 117             if comma < 0:
 118                 # bad data URL
 119                 return None, None
 120             semi = url.find(';', 0, comma)
 121             if semi >= 0:
 122                 type = url[:semi]
 123             else:
 124                 type = url[:comma]
 125             if '=' in type or '/' not in type:
 126                 type = 'text/plain'
 127             return type, None           # never compressed, so encoding is None
 128         base, ext = posixpath.splitext(url)
 129         while ext in self.suffix_map:
 130             base, ext = posixpath.splitext(base + self.suffix_map[ext])
 131         if ext in self.encodings_map:
 132             encoding = self.encodings_map[ext]
 133             base, ext = posixpath.splitext(base)
 134         else:
 135             encoding = None
 136         types_map = self.types_map[True]
 137         if ext in types_map:
 138             return types_map[ext], encoding
 139         elif ext.lower() in types_map:
 140             return types_map[ext.lower()], encoding
 141         elif strict:
 142             return None, encoding
 143         types_map = self.types_map[False]
 144         if ext in types_map:
 145             return types_map[ext], encoding
 146         elif ext.lower() in types_map:
 147             return types_map[ext.lower()], encoding
 148         else:
 149             return None, encoding
 150
 151     def guess_all_extensions(self, type, strict=True):
 152         """Guess the extensions for a file based on its MIME type.
 153
 154         Return value is a list of strings giving the possible filename
 155         extensions, including the leading dot ('.').  The extension is not
 156         guaranteed to have been associated with any particular data stream,
 157         but would be mapped to the MIME type `type' by guess_type().
 158
 159         Optional `strict' argument when false adds a bunch of commonly found,
 160         but non-standard types.
 161         """
 162         type = type.lower()
 163         extensions = self.types_map_inv[True].get(type, [])
 164         if not strict:
 165             for ext in self.types_map_inv[False].get(type, []):
 166                 if ext not in extensions:
 167                     extensions.append(ext)
 168         return extensions
 169
 170     def guess_extension(self, type, strict=True):
 171         """Guess the extension for a file based on its MIME type.
 172
 173         Return value is a string giving a filename extension,
 174         including the leading dot ('.').  The extension is not
 175         guaranteed to have been associated with any particular data
 176         stream, but would be mapped to the MIME type `type' by
 177         guess_type().  If no extension can be guessed for `type', None
 178         is returned.
 179
 180         Optional `strict' argument when false adds a bunch of commonly found,
 181         but non-standard types.
 182         """
 183         extensions = self.guess_all_extensions(type, strict)
 184         if not extensions:
 185             return None
 186         return extensions[0]
 187
 188     def read(self, filename, strict=True):
 189         """
 190         Read a single mime.types-format file, specified by pathname.
 191
 192         If strict is true, information will be added to
 193         list of standard types, else to the list of non-standard
 194         types.
 195         """
 196         fp = open(filename)
 197         self.readfp(fp, strict)
 198         fp.close()
 199
 200     def readfp(self, fp, strict=True):
 201         """
 202         Read a single mime.types-format file.
 203
 204         If strict is true, information will be added to
 205         list of standard types, else to the list of non-standard
 206         types.
 207         """
 208         while 1:
 209             line = fp.readline()
 210             if not line:
 211                 break
 212             words = line.split()
 213             for i in range(len(words)):
 214                 if words[i][0] == '#':
 215                     del words[i:]
 216                     break
 217             if not words:
 218                 continue
 219             type, suffixes = words[0], words[1:]
 220             for suff in suffixes:
 221                 self.add_type(type, '.' + suff, strict)
 222
 223 def guess_type(url, strict=True):
 224     """Guess the type of a file based on its URL.
 225
 226     Return value is a tuple (type, encoding) where type is None if the
 227     type can't be guessed (no or unknown suffix) or a string of the
 228     form type/subtype, usable for a MIME Content-type header; and
 229     encoding is None for no encoding or the name of the program used
 230     to encode (e.g. compress or gzip).  The mappings are table
 231     driven.  Encoding suffixes are case sensitive; type suffixes are
 232     first tried case sensitive, then case insensitive.
 233
 234     The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
 235     to ".tar.gz".  (This is table-driven too, using the dictionary
 236     suffix_map).
 237
 238     Optional `strict' argument when false adds a bunch of commonly found, but
 239     non-standard types.
 240     """
 241     if _db is None:
 242         init()
 243     return _db.guess_type(url, strict)
 244
 245
 246 def guess_all_extensions(type, strict=True):
 247     """Guess the extensions for a file based on its MIME type.
 248
 249     Return value is a list of strings giving the possible filename
 250     extensions, including the leading dot ('.').  The extension is not
 251     guaranteed to have been associated with any particular data
 252     stream, but would be mapped to the MIME type `type' by
 253     guess_type().  If no extension can be guessed for `type', None
 254     is returned.
 255
 256     Optional `strict' argument when false adds a bunch of commonly found,
 257     but non-standard types.
 258     """
 259     if _db is None:
 260         init()
 261     return _db.guess_all_extensions(type, strict)
 262
 263 def guess_extension(type, strict=True):
 264     """Guess the extension for a file based on its MIME type.
 265
 266     Return value is a string giving a filename extension, including the
 267     leading dot ('.').  The extension is not guaranteed to have been
 268     associated with any particular data stream, but would be mapped to the
 269     MIME type `type' by guess_type().  If no extension can be guessed for
 270     `type', None is returned.
 271
 272     Optional `strict' argument when false adds a bunch of commonly found,
 273     but non-standard types.
 274     """
 275     if _db is None:
 276         init()
 277     return _db.guess_extension(type, strict)
 278
 279 def add_type(type, ext, strict=True):
 280     """Add a mapping between a type and an extension.
 281
 282     When the extension is already known, the new
 283     type will replace the old one. When the type
 284     is already known the extension will be added
 285     to the list of known extensions.
 286
 287     If strict is true, information will be added to
 288     list of standard types, else to the list of non-standard
 289     types.
 290     """
 291     if _db is None:
 292         init()
 293     return _db.add_type(type, ext, strict)
 294
 295
 296 def init(files=None):
 297     global suffix_map, types_map, encodings_map, common_types
 298     global inited, _db
 299     inited = True    # so that MimeTypes.__init__() doesn't call us again
 300     db = MimeTypes()
 301     if files is None:
 302         files = knownfiles
 303     for file in files:
 304         if os.path.isfile(file):
 305             db.readfp(open(file))
 306     encodings_map = db.encodings_map
 307     suffix_map = db.suffix_map
 308     types_map = db.types_map[True]
 309     common_types = db.types_map[False]
 310     # Make the DB a global variable now that it is fully initialized
 311     _db = db
 312
 313
 314 def read_mime_types(file):
 315     try:
 316         f = open(file)
 317     except IOError:
 318         return None
 319     db = MimeTypes()
 320     db.readfp(f, True)
 321     return db.types_map[True]
 322
 323
 324 def _default_mime_types():
 325     global suffix_map
 326     global encodings_map
 327     global types_map
 328     global common_types
 329
 330     suffix_map = {
 331         '.tgz': '.tar.gz',
 332         '.taz': '.tar.gz',
 333         '.tz': '.tar.gz',
 334         '.tbz2': '.tar.bz2',
 335         }
 336
 337     encodings_map = {
 338         '.gz': 'gzip',
 339         '.Z': 'compress',
 340         '.bz2': 'bzip2',
 341         }
 342
 343     # Before adding new types, make sure they are either registered with IANA,
 344     # at http://www.isi.edu/in-notes/iana/assignments/media-types
 345     # or extensions, i.e. using the x- prefix
 346
 347     # If you add to these, please keep them sorted!
 348     types_map = {
 349         '.a'      : 'application/octet-stream',
 350         '.ai'     : 'application/postscript',
 351         '.aif'    : 'audio/x-aiff',
 352         '.aifc'   : 'audio/x-aiff',
 353         '.aiff'   : 'audio/x-aiff',
 354         '.au'     : 'audio/basic',
 355         '.avi'    : 'video/x-msvideo',
 356         '.bat'    : 'text/plain',
 357         '.bcpio'  : 'application/x-bcpio',
 358         '.bin'    : 'application/octet-stream',
 359         '.bmp'    : 'image/x-ms-bmp',
 360         '.c'      : 'text/plain',
 361         # Duplicates :(
 362         '.cdf'    : 'application/x-cdf',
 363         '.cdf'    : 'application/x-netcdf',
 364         '.cpio'   : 'application/x-cpio',
 365         '.csh'    : 'application/x-csh',
 366         '.css'    : 'text/css',
 367         '.dll'    : 'application/octet-stream',
 368         '.doc'    : 'application/msword',
 369         '.dot'    : 'application/msword',
 370         '.dvi'    : 'application/x-dvi',
 371         '.eml'    : 'message/rfc822',
 372         '.eps'    : 'application/postscript',
 373         '.etx'    : 'text/x-setext',
 374         '.exe'    : 'application/octet-stream',
 375         '.gif'    : 'image/gif',
 376         '.gtar'   : 'application/x-gtar',
 377         '.h'      : 'text/plain',
 378         '.hdf'    : 'application/x-hdf',
 379         '.htm'    : 'text/html',
 380         '.html'   : 'text/html',
 381         '.ief'    : 'image/ief',
 382         '.jpe'    : 'image/jpeg',
 383         '.jpeg'   : 'image/jpeg',
 384         '.jpg'    : 'image/jpeg',
 385         '.js'     : 'application/x-javascript',
 386         '.ksh'    : 'text/plain',
 387         '.latex'  : 'application/x-latex',
 388         '.m1v'    : 'video/mpeg',
 389         '.man'    : 'application/x-troff-man',
 390         '.me'     : 'application/x-troff-me',
 391         '.mht'    : 'message/rfc822',
 392         '.mhtml'  : 'message/rfc822',
 393         '.mif'    : 'application/x-mif',
 394         '.mov'    : 'video/quicktime',
 395         '.movie'  : 'video/x-sgi-movie',
 396         '.mp2'    : 'audio/mpeg',
 397         '.mp3'    : 'audio/mpeg',
 398         '.mp4'    : 'video/mp4',
 399         '.mpa'    : 'video/mpeg',
 400         '.mpe'    : 'video/mpeg',
 401         '.mpeg'   : 'video/mpeg',
 402         '.mpg'    : 'video/mpeg',
 403         '.ms'     : 'application/x-troff-ms',
 404         '.nc'     : 'application/x-netcdf',
 405         '.nws'    : 'message/rfc822',
 406         '.o'      : 'application/octet-stream',
 407         '.obj'    : 'application/octet-stream',
 408         '.oda'    : 'application/oda',
 409         '.p12'    : 'application/x-pkcs12',
 410         '.p7c'    : 'application/pkcs7-mime',
 411         '.pbm'    : 'image/x-portable-bitmap',
 412         '.pdf'    : 'application/pdf',
 413         '.pfx'    : 'application/x-pkcs12',
 414         '.pgm'    : 'image/x-portable-graymap',
 415         '.pl'     : 'text/plain',
 416         '.png'    : 'image/png',
 417         '.pnm'    : 'image/x-portable-anymap',
 418         '.pot'    : 'application/vnd.ms-powerpoint',
 419         '.ppa'    : 'application/vnd.ms-powerpoint',
 420         '.ppm'    : 'image/x-portable-pixmap',
 421         '.pps'    : 'application/vnd.ms-powerpoint',
 422         '.ppt'    : 'application/vnd.ms-powerpoint',
 423         '.ps'     : 'application/postscript',
 424         '.pwz'    : 'application/vnd.ms-powerpoint',
 425         '.py'     : 'text/x-python',
 426         '.pyc'    : 'application/x-python-code',
 427         '.pyo'    : 'application/x-python-code',
 428         '.qt'     : 'video/quicktime',
 429         '.ra'     : 'audio/x-pn-realaudio',
 430         '.ram'    : 'application/x-pn-realaudio',
 431         '.ras'    : 'image/x-cmu-raster',
 432         '.rdf'    : 'application/xml',
 433         '.rgb'    : 'image/x-rgb',
 434         '.roff'   : 'application/x-troff',
 435         '.rtx'    : 'text/richtext',
 436         '.sgm'    : 'text/x-sgml',
 437         '.sgml'   : 'text/x-sgml',
 438         '.sh'     : 'application/x-sh',
 439         '.shar'   : 'application/x-shar',
 440         '.snd'    : 'audio/basic',
 441         '.so'     : 'application/octet-stream',
 442         '.src'    : 'application/x-wais-source',
 443         '.sv4cpio': 'application/x-sv4cpio',
 444         '.sv4crc' : 'application/x-sv4crc',
 445         '.swf'    : 'application/x-shockwave-flash',
 446         '.t'      : 'application/x-troff',
 447         '.tar'    : 'application/x-tar',
 448         '.tcl'    : 'application/x-tcl',
 449         '.tex'    : 'application/x-tex',
 450         '.texi'   : 'application/x-texinfo',
 451         '.texinfo': 'application/x-texinfo',
 452         '.tif'    : 'image/tiff',
 453         '.tiff'   : 'image/tiff',
 454         '.tr'     : 'application/x-troff',
 455         '.tsv'    : 'text/tab-separated-values',
 456         '.txt'    : 'text/plain',
 457         '.ustar'  : 'application/x-ustar',
 458         '.vcf'    : 'text/x-vcard',
 459         '.wav'    : 'audio/x-wav',
 460         '.wiz'    : 'application/msword',
 461         '.wsdl'   : 'application/xml',
 462         '.xbm'    : 'image/x-xbitmap',
 463         '.xlb'    : 'application/vnd.ms-excel',
 464         # Duplicates :(
 465         '.xls'    : 'application/excel',
 466         '.xls'    : 'application/vnd.ms-excel',
 467         '.xml'    : 'text/xml',
 468         '.xpdl'   : 'application/xml',
 469         '.xpm'    : 'image/x-xpixmap',
 470         '.xsl'    : 'application/xml',
 471         '.xwd'    : 'image/x-xwindowdump',
 472         '.zip'    : 'application/zip',
 473         }
 474
 475     # These are non-standard types, commonly found in the wild.  They will
 476     # only match if strict=0 flag is given to the API methods.
 477
 478     # Please sort these too
 479     common_types = {
 480         '.jpg' : 'image/jpg',
 481         '.mid' : 'audio/midi',
 482         '.midi': 'audio/midi',
 483         '.pct' : 'image/pict',
 484         '.pic' : 'image/pict',
 485         '.pict': 'image/pict',
 486         '.rtf' : 'application/rtf',
 487         '.xul' : 'text/xul'
 488         }
 489
 490
 491 _default_mime_types()
 492
 493
 494 if __name__ == '__main__':
 495     import sys
 496     import getopt
 497
 498     USAGE = """\
 499 Usage: mimetypes.py [options] type
 500
 501 Options:
 502     --help / -h       -- print this message and exit
 503     --lenient / -l    -- additionally search of some common, but non-standard
 504                          types.
 505     --extension / -e  -- guess extension instead of type
 506
 507 More than one type argument may be given.
 508 """
 509
 510     def usage(code, msg=''):
 511         print USAGE
 512         if msg: print msg
 513         sys.exit(code)
 514
 515     try:
 516         opts, args = getopt.getopt(sys.argv[1:], 'hle',
 517                                    ['help', 'lenient', 'extension'])
 518     except getopt.error, msg:
 519         usage(1, msg)
 520
 521     strict = 1
 522     extension = 0
 523     for opt, arg in opts:
 524         if opt in ('-h', '--help'):
 525             usage(0)
 526         elif opt in ('-l', '--lenient'):
 527             strict = 0
 528         elif opt in ('-e', '--extension'):
 529             extension = 1
 530     for gtype in args:
 531         if extension:
 532             guess = guess_extension(gtype, strict)
 533             if not guess: print "I don't know anything about type", gtype
 534             else: print guess
 535         else:
 536             guess, encoding = guess_type(gtype, strict)
 537             if not guess: print "I don't know anything about type", gtype
 538             else: print 'type:', guess, 'encoding:', encoding