Lib/mimetypes.py

   1 """Guess the MIME type of a file.
   2
   3 This module defines two useful functions:
   4
   5 guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
   6
   7 guess_extension(type, strict=1) -- guess the extension for a given MIME type.
   8
   9 It also contains the following, for tuning the behavior:
  10
  11 Data:
  12
  13 knownfiles -- list of files to parse
  14 inited -- flag set when init() has been called
  15 suffix_map -- dictionary mapping suffixes to suffixes
  16 encodings_map -- dictionary mapping suffixes to encodings
  17 types_map -- dictionary mapping suffixes to types
  18
  19 Functions:
  20
  21 init([files]) -- parse a list of files, default knownfiles (on Windows, the
  22   default values are taken from the registry)
  23 read_mime_types(file) -- parse one file, return a dictionary or None
  24 """
  25
  26 import os
  27 import sys
  28 import posixpath
  29 import urllib
  30 try:
  31     import _winreg
  32 except ImportError:
  33     _winreg = None
  34
  35 __all__ = [
  36     "guess_type","guess_extension","guess_all_extensions",
  37     "add_type","read_mime_types","init"
  38 ]
  39
  40 knownfiles = [
  41     "/etc/mime.types",
  42     "/etc/httpd/mime.types",                    # Mac OS X
  43     "/etc/httpd/conf/mime.types",               # Apache
  44     "/etc/apache/mime.types",                   # Apache 1
  45     "/etc/apache2/mime.types",                  # Apache 2
  46     "/usr/local/etc/httpd/conf/mime.types",
  47     "/usr/local/lib/netscape/mime.types",
  48     "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
  49     "/usr/local/etc/mime.types",                # Apache 1.3
  50     ]
  51
  52 inited = False
  53 _db = None
  54
  55
  56 class MimeTypes:
  57     """MIME-types datastore.
  58
  59     This datastore can handle information from mime.types-style files
  60     and supports basic determination of MIME type from a filename or
  61     URL, and can guess a reasonable extension given a MIME type.
  62     """
  63
  64     def __init__(self, filenames=(), strict=True):
  65         if not inited:
  66             init()
  67         self.encodings_map = encodings_map.copy()
  68         self.suffix_map = suffix_map.copy()
  69         self.types_map = ({}, {}) # dict for (non-strict, strict)
  70         self.types_map_inv = ({}, {})
  71         for (ext, type) in types_map.items():
  72             self.add_type(type, ext, True)
  73         for (ext, type) in common_types.items():
  74             self.add_type(type, ext, False)
  75         for name in filenames:
  76             self.read(name, strict)
  77
  78     def add_type(self, type, ext, strict=True):
  79         """Add a mapping between a type and an extension.
  80
  81         When the extension is already known, the new
  82         type will replace the old one. When the type
  83         is already known the extension will be added
  84         to the list of known extensions.
  85
  86         If strict is true, information will be added to
  87         list of standard types, else to the list of non-standard
  88         types.
  89         """
  90         self.types_map[strict][ext] = type
  91         exts = self.types_map_inv[strict].setdefault(type, [])
  92         if ext not in exts:
  93             exts.append(ext)
  94
  95     def guess_type(self, url, strict=True):
  96         """Guess the type of a file based on its URL.
  97
  98         Return value is a tuple (type, encoding) where type is None if
  99         the type can't be guessed (no or unknown suffix) or a string
 100         of the form type/subtype, usable for a MIME Content-type
 101         header; and encoding is None for no encoding or the name of
 102         the program used to encode (e.g. compress or gzip).  The
 103         mappings are table driven.  Encoding suffixes are case
 104         sensitive; type suffixes are first tried case sensitive, then
 105         case insensitive.
 106
 107         The suffixes .tgz, .taz and .tz (case sensitive!) are all
 108         mapped to '.tar.gz'.  (This is table-driven too, using the
 109         dictionary suffix_map.)
 110
 111         Optional `strict' argument when False adds a bunch of commonly found,
 112         but non-standard types.
 113         """
 114         scheme, url = urllib.splittype(url)
 115         if scheme == 'data':
 116             # syntax of data URLs:
 117             # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
 118             # mediatype := [ type "/" subtype ] *( ";" parameter )
 119             # data      := *urlchar
 120             # parameter := attribute "=" value
 121             # type/subtype defaults to "text/plain"
 122             comma = url.find(',')
 123             if comma < 0:
 124                 # bad data URL
 125                 return None, None
 126             semi = url.find(';', 0, comma)
 127             if semi >= 0:
 128                 type = url[:semi]
 129             else:
 130                 type = url[:comma]
 131             if '=' in type or '/' not in type:
 132                 type = 'text/plain'
 133             return type, None           # never compressed, so encoding is None
 134         base, ext = posixpath.splitext(url)
 135         while ext in self.suffix_map:
 136             base, ext = posixpath.splitext(base + self.suffix_map[ext])
 137         if ext in self.encodings_map:
 138             encoding = self.encodings_map[ext]
 139             base, ext = posixpath.splitext(base)
 140         else:
 141             encoding = None
 142         types_map = self.types_map[True]
 143         if ext in types_map:
 144             return types_map[ext], encoding
 145         elif ext.lower() in types_map:
 146             return types_map[ext.lower()], encoding
 147         elif strict:
 148             return None, encoding
 149         types_map = self.types_map[False]
 150         if ext in types_map:
 151             return types_map[ext], encoding
 152         elif ext.lower() in types_map:
 153             return types_map[ext.lower()], encoding
 154         else:
 155             return None, encoding
 156
 157     def guess_all_extensions(self, type, strict=True):
 158         """Guess the extensions for a file based on its MIME type.
 159
 160         Return value is a list of strings giving the possible filename
 161         extensions, including the leading dot ('.').  The extension is not
 162         guaranteed to have been associated with any particular data stream,
 163         but would be mapped to the MIME type `type' by guess_type().
 164
 165         Optional `strict' argument when false adds a bunch of commonly found,
 166         but non-standard types.
 167         """
 168         type = type.lower()
 169         extensions = self.types_map_inv[True].get(type, [])
 170         if not strict:
 171             for ext in self.types_map_inv[False].get(type, []):
 172                 if ext not in extensions:
 173                     extensions.append(ext)
 174         return extensions
 175
 176     def guess_extension(self, type, strict=True):
 177         """Guess the extension for a file based on its MIME type.
 178
 179         Return value is a string giving a filename extension,
 180         including the leading dot ('.').  The extension is not
 181         guaranteed to have been associated with any particular data
 182         stream, but would be mapped to the MIME type `type' by
 183         guess_type().  If no extension can be guessed for `type', None
 184         is returned.
 185
 186         Optional `strict' argument when false adds a bunch of commonly found,
 187         but non-standard types.
 188         """
 189         extensions = self.guess_all_extensions(type, strict)
 190         if not extensions:
 191             return None
 192         return extensions[0]
 193
 194     def read(self, filename, strict=True):
 195         """
 196         Read a single mime.types-format file, specified by pathname.
 197
 198         If strict is true, information will be added to
 199         list of standard types, else to the list of non-standard
 200         types.
 201         """
 202         fp = open(filename)
 203         self.readfp(fp, strict)
 204         fp.close()
 205
 206     def readfp(self, fp, strict=True):
 207         """
 208         Read a single mime.types-format file.
 209
 210         If strict is true, information will be added to
 211         list of standard types, else to the list of non-standard
 212         types.
 213         """
 214         while 1:
 215             line = fp.readline()
 216             if not line:
 217                 break
 218             words = line.split()
 219             for i in range(len(words)):
 220                 if words[i][0] == '#':
 221                     del words[i:]
 222                     break
 223             if not words:
 224                 continue
 225             type, suffixes = words[0], words[1:]
 226             for suff in suffixes:
 227                 self.add_type(type, '.' + suff, strict)
 228
 229     def read_windows_registry(self, strict=True):
 230         """
 231         Load the MIME types database from Windows registry.
 232
 233         If strict is true, information will be added to
 234         list of standard types, else to the list of non-standard
 235         types.
 236         """
 237
 238         # Windows only
 239         if not _winreg:
 240             return
 241
 242         def enum_types(mimedb):
 243             i = 0
 244             while True:
 245                 try:
 246                     ctype = _winreg.EnumKey(mimedb, i)
 247                 except EnvironmentError:
 248                     break
 249                 try:
 250                     ctype = ctype.encode(default_encoding) # omit in 3.x!
 251                 except UnicodeEncodeError:
 252                     pass
 253                 else:
 254                     yield ctype
 255                 i += 1
 256
 257         default_encoding = sys.getdefaultencoding()
 258         with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT,
 259                              r'MIME\Database\Content Type') as mimedb:
 260             for ctype in enum_types(mimedb):
 261                 with _winreg.OpenKey(mimedb, ctype) as key:
 262                     try:
 263                         suffix, datatype = _winreg.QueryValueEx(key, 'Extension')
 264                     except EnvironmentError:
 265                         continue
 266                     if datatype != _winreg.REG_SZ:
 267                         continue
 268                     try:
 269                         suffix = suffix.encode(default_encoding) # omit in 3.x!
 270                     except UnicodeEncodeError:
 271                         continue
 272                     self.add_type(ctype, suffix, strict)
 273
 274
 275 def guess_type(url, strict=True):
 276     """Guess the type of a file based on its URL.
 277
 278     Return value is a tuple (type, encoding) where type is None if the
 279     type can't be guessed (no or unknown suffix) or a string of the
 280     form type/subtype, usable for a MIME Content-type header; and
 281     encoding is None for no encoding or the name of the program used
 282     to encode (e.g. compress or gzip).  The mappings are table
 283     driven.  Encoding suffixes are case sensitive; type suffixes are
 284     first tried case sensitive, then case insensitive.
 285
 286     The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
 287     to ".tar.gz".  (This is table-driven too, using the dictionary
 288     suffix_map).
 289
 290     Optional `strict' argument when false adds a bunch of commonly found, but
 291     non-standard types.
 292     """
 293     if _db is None:
 294         init()
 295     return _db.guess_type(url, strict)
 296
 297
 298 def guess_all_extensions(type, strict=True):
 299     """Guess the extensions for a file based on its MIME type.
 300
 301     Return value is a list of strings giving the possible filename
 302     extensions, including the leading dot ('.').  The extension is not
 303     guaranteed to have been associated with any particular data
 304     stream, but would be mapped to the MIME type `type' by
 305     guess_type().  If no extension can be guessed for `type', None
 306     is returned.
 307
 308     Optional `strict' argument when false adds a bunch of commonly found,
 309     but non-standard types.
 310     """
 311     if _db is None:
 312         init()
 313     return _db.guess_all_extensions(type, strict)
 314
 315 def guess_extension(type, strict=True):
 316     """Guess the extension for a file based on its MIME type.
 317
 318     Return value is a string giving a filename extension, including the
 319     leading dot ('.').  The extension is not guaranteed to have been
 320     associated with any particular data stream, but would be mapped to the
 321     MIME type `type' by guess_type().  If no extension can be guessed for
 322     `type', None is returned.
 323
 324     Optional `strict' argument when false adds a bunch of commonly found,
 325     but non-standard types.
 326     """
 327     if _db is None:
 328         init()
 329     return _db.guess_extension(type, strict)
 330
 331 def add_type(type, ext, strict=True):
 332     """Add a mapping between a type and an extension.
 333
 334     When the extension is already known, the new
 335     type will replace the old one. When the type
 336     is already known the extension will be added
 337     to the list of known extensions.
 338
 339     If strict is true, information will be added to
 340     list of standard types, else to the list of non-standard
 341     types.
 342     """
 343     if _db is None:
 344         init()
 345     return _db.add_type(type, ext, strict)
 346
 347
 348 def init(files=None):
 349     global suffix_map, types_map, encodings_map, common_types
 350     global inited, _db
 351     inited = True    # so that MimeTypes.__init__() doesn't call us again
 352     db = MimeTypes()
 353     if files is None:
 354         if _winreg:
 355             db.read_windows_registry()
 356         files = knownfiles
 357     for file in files:
 358         if os.path.isfile(file):
 359             db.readfp(open(file))
 360     encodings_map = db.encodings_map
 361     suffix_map = db.suffix_map
 362     types_map = db.types_map[True]
 363     common_types = db.types_map[False]
 364     # Make the DB a global variable now that it is fully initialized
 365     _db = db
 366
 367
 368 def read_mime_types(file):
 369     try:
 370         f = open(file)
 371     except IOError:
 372         return None
 373     db = MimeTypes()
 374     db.readfp(f, True)
 375     return db.types_map[True]
 376
 377
 378 def _default_mime_types():
 379     global suffix_map
 380     global encodings_map
 381     global types_map
 382     global common_types
 383
 384     suffix_map = {
 385         '.tgz': '.tar.gz',
 386         '.taz': '.tar.gz',
 387         '.tz': '.tar.gz',
 388         '.tbz2': '.tar.bz2',
 389         }
 390
 391     encodings_map = {
 392         '.gz': 'gzip',
 393         '.Z': 'compress',
 394         '.bz2': 'bzip2',
 395         }
 396
 397     # Before adding new types, make sure they are either registered with IANA,
 398     # at http://www.isi.edu/in-notes/iana/assignments/media-types
 399     # or extensions, i.e. using the x- prefix
 400
 401     # If you add to these, please keep them sorted!
 402     types_map = {
 403         '.a'      : 'application/octet-stream',
 404         '.ai'     : 'application/postscript',
 405         '.aif'    : 'audio/x-aiff',
 406         '.aifc'   : 'audio/x-aiff',
 407         '.aiff'   : 'audio/x-aiff',
 408         '.au'     : 'audio/basic',
 409         '.avi'    : 'video/x-msvideo',
 410         '.bat'    : 'text/plain',
 411         '.bcpio'  : 'application/x-bcpio',
 412         '.bin'    : 'application/octet-stream',
 413         '.bmp'    : 'image/x-ms-bmp',
 414         '.c'      : 'text/plain',
 415         # Duplicates :(
 416         '.cdf'    : 'application/x-cdf',
 417         '.cdf'    : 'application/x-netcdf',
 418         '.cpio'   : 'application/x-cpio',
 419         '.csh'    : 'application/x-csh',
 420         '.css'    : 'text/css',
 421         '.dll'    : 'application/octet-stream',
 422         '.doc'    : 'application/msword',
 423         '.dot'    : 'application/msword',
 424         '.dvi'    : 'application/x-dvi',
 425         '.eml'    : 'message/rfc822',
 426         '.eps'    : 'application/postscript',
 427         '.etx'    : 'text/x-setext',
 428         '.exe'    : 'application/octet-stream',
 429         '.gif'    : 'image/gif',
 430         '.gtar'   : 'application/x-gtar',
 431         '.h'      : 'text/plain',
 432         '.hdf'    : 'application/x-hdf',
 433         '.htm'    : 'text/html',
 434         '.html'   : 'text/html',
 435         '.ief'    : 'image/ief',
 436         '.jpe'    : 'image/jpeg',
 437         '.jpeg'   : 'image/jpeg',
 438         '.jpg'    : 'image/jpeg',
 439         '.js'     : 'application/x-javascript',
 440         '.ksh'    : 'text/plain',
 441         '.latex'  : 'application/x-latex',
 442         '.m1v'    : 'video/mpeg',
 443         '.man'    : 'application/x-troff-man',
 444         '.me'     : 'application/x-troff-me',
 445         '.mht'    : 'message/rfc822',
 446         '.mhtml'  : 'message/rfc822',
 447         '.mif'    : 'application/x-mif',
 448         '.mov'    : 'video/quicktime',
 449         '.movie'  : 'video/x-sgi-movie',
 450         '.mp2'    : 'audio/mpeg',
 451         '.mp3'    : 'audio/mpeg',
 452         '.mp4'    : 'video/mp4',
 453         '.mpa'    : 'video/mpeg',
 454         '.mpe'    : 'video/mpeg',
 455         '.mpeg'   : 'video/mpeg',
 456         '.mpg'    : 'video/mpeg',
 457         '.ms'     : 'application/x-troff-ms',
 458         '.nc'     : 'application/x-netcdf',
 459         '.nws'    : 'message/rfc822',
 460         '.o'      : 'application/octet-stream',
 461         '.obj'    : 'application/octet-stream',
 462         '.oda'    : 'application/oda',
 463         '.p12'    : 'application/x-pkcs12',
 464         '.p7c'    : 'application/pkcs7-mime',
 465         '.pbm'    : 'image/x-portable-bitmap',
 466         '.pdf'    : 'application/pdf',
 467         '.pfx'    : 'application/x-pkcs12',
 468         '.pgm'    : 'image/x-portable-graymap',
 469         '.pl'     : 'text/plain',
 470         '.png'    : 'image/png',
 471         '.pnm'    : 'image/x-portable-anymap',
 472         '.pot'    : 'application/vnd.ms-powerpoint',
 473         '.ppa'    : 'application/vnd.ms-powerpoint',
 474         '.ppm'    : 'image/x-portable-pixmap',
 475         '.pps'    : 'application/vnd.ms-powerpoint',
 476         '.ppt'    : 'application/vnd.ms-powerpoint',
 477         '.ps'     : 'application/postscript',
 478         '.pwz'    : 'application/vnd.ms-powerpoint',
 479         '.py'     : 'text/x-python',
 480         '.pyc'    : 'application/x-python-code',
 481         '.pyo'    : 'application/x-python-code',
 482         '.qt'     : 'video/quicktime',
 483         '.ra'     : 'audio/x-pn-realaudio',
 484         '.ram'    : 'application/x-pn-realaudio',
 485         '.ras'    : 'image/x-cmu-raster',
 486         '.rdf'    : 'application/xml',
 487         '.rgb'    : 'image/x-rgb',
 488         '.roff'   : 'application/x-troff',
 489         '.rtx'    : 'text/richtext',
 490         '.sgm'    : 'text/x-sgml',
 491         '.sgml'   : 'text/x-sgml',
 492         '.sh'     : 'application/x-sh',
 493         '.shar'   : 'application/x-shar',
 494         '.snd'    : 'audio/basic',
 495         '.so'     : 'application/octet-stream',
 496         '.src'    : 'application/x-wais-source',
 497         '.sv4cpio': 'application/x-sv4cpio',
 498         '.sv4crc' : 'application/x-sv4crc',
 499         '.swf'    : 'application/x-shockwave-flash',
 500         '.t'      : 'application/x-troff',
 501         '.tar'    : 'application/x-tar',
 502         '.tcl'    : 'application/x-tcl',
 503         '.tex'    : 'application/x-tex',
 504         '.texi'   : 'application/x-texinfo',
 505         '.texinfo': 'application/x-texinfo',
 506         '.tif'    : 'image/tiff',
 507         '.tiff'   : 'image/tiff',
 508         '.tr'     : 'application/x-troff',
 509         '.tsv'    : 'text/tab-separated-values',
 510         '.txt'    : 'text/plain',
 511         '.ustar'  : 'application/x-ustar',
 512         '.vcf'    : 'text/x-vcard',
 513         '.wav'    : 'audio/x-wav',
 514         '.wiz'    : 'application/msword',
 515         '.wsdl'   : 'application/xml',
 516         '.xbm'    : 'image/x-xbitmap',
 517         '.xlb'    : 'application/vnd.ms-excel',
 518         # Duplicates :(
 519         '.xls'    : 'application/excel',
 520         '.xls'    : 'application/vnd.ms-excel',
 521         '.xml'    : 'text/xml',
 522         '.xpdl'   : 'application/xml',
 523         '.xpm'    : 'image/x-xpixmap',
 524         '.xsl'    : 'application/xml',
 525         '.xwd'    : 'image/x-xwindowdump',
 526         '.zip'    : 'application/zip',
 527         }
 528
 529     # These are non-standard types, commonly found in the wild.  They will
 530     # only match if strict=0 flag is given to the API methods.
 531
 532     # Please sort these too
 533     common_types = {
 534         '.jpg' : 'image/jpg',
 535         '.mid' : 'audio/midi',
 536         '.midi': 'audio/midi',
 537         '.pct' : 'image/pict',
 538         '.pic' : 'image/pict',
 539         '.pict': 'image/pict',
 540         '.rtf' : 'application/rtf',
 541         '.xul' : 'text/xul'
 542         }
 543
 544
 545 _default_mime_types()
 546
 547
 548 if __name__ == '__main__':
 549     import sys
 550     import getopt
 551
 552     USAGE = """\
 553 Usage: mimetypes.py [options] type
 554
 555 Options:
 556     --help / -h       -- print this message and exit
 557     --lenient / -l    -- additionally search of some common, but non-standard
 558                          types.
 559     --extension / -e  -- guess extension instead of type
 560
 561 More than one type argument may be given.
 562 """
 563
 564     def usage(code, msg=''):
 565         print USAGE
 566         if msg: print msg
 567         sys.exit(code)
 568
 569     try:
 570         opts, args = getopt.getopt(sys.argv[1:], 'hle',
 571                                    ['help', 'lenient', 'extension'])
 572     except getopt.error, msg:
 573         usage(1, msg)
 574
 575     strict = 1
 576     extension = 0
 577     for opt, arg in opts:
 578         if opt in ('-h', '--help'):
 579             usage(0)
 580         elif opt in ('-l', '--lenient'):
 581             strict = 0
 582         elif opt in ('-e', '--extension'):
 583             extension = 1
 584     for gtype in args:
 585         if extension:
 586             guess = guess_extension(gtype, strict)
 587             if not guess: print "I don't know anything about type", gtype
 588             else: print guess
 589         else:
 590             guess, encoding = guess_type(gtype, strict)
 591             if not guess: print "I don't know anything about type", gtype
 592             else: print 'type:', guess, 'encoding:', encoding