1 """Guess the MIME type of a file.
3 This module defines two useful functions:
5 guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
7 guess_extension(type, strict=1) -- guess the extension for a given MIME type.
9 It also contains the following, for tuning the behavior:
13 knownfiles -- list of files to parse
14 inited -- flag set when init() has been called
15 suffix_map -- dictionary mapping suffixes to suffixes
16 encodings_map -- dictionary mapping suffixes to encodings
17 types_map -- dictionary mapping suffixes to types
21 init([files]) -- parse a list of files, default knownfiles (on Windows, the
22 default values are taken from the registry)
23 read_mime_types(file) -- parse one file, return a dictionary or None
36 "guess_type","guess_extension","guess_all_extensions",
37 "add_type","read_mime_types","init"
42 "/etc/httpd/mime.types", # Mac OS X
43 "/etc/httpd/conf/mime.types", # Apache
44 "/etc/apache/mime.types", # Apache 1
45 "/etc/apache2/mime.types", # Apache 2
46 "/usr/local/etc/httpd/conf/mime.types",
47 "/usr/local/lib/netscape/mime.types",
48 "/usr/local/etc/httpd/conf/mime.types", # Apache 1.2
49 "/usr/local/etc/mime.types", # Apache 1.3
57 """MIME-types datastore.
59 This datastore can handle information from mime.types-style files
60 and supports basic determination of MIME type from a filename or
61 URL, and can guess a reasonable extension given a MIME type.
def __init__(self, filenames=(), strict=True):
    """Create a datastore seeded from the module-level default tables.

    filenames -- iterable of mime.types-style file names; each one is
                 passed to read() in order, after the defaults are loaded
    strict -- forwarded to read() for every entry of filenames
    """
    # init() sets `inited` before it builds its own MimeTypes instance,
    # so this guard triggers the one-time module setup without recursing.
    if not inited:
        init()
    # Work on private copies so per-instance changes never leak back into
    # the module-level tables.
    self.encodings_map = encodings_map.copy()
    self.suffix_map = suffix_map.copy()
    self.types_map = ({}, {})  # dict for (non-strict, strict)
    self.types_map_inv = ({}, {})
    for (ext, type) in types_map.items():
        self.add_type(type, ext, True)
    for (ext, type) in common_types.items():
        self.add_type(type, ext, False)
    for name in filenames:
        self.read(name, strict)
def add_type(self, type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    # `strict` doubles as the tuple index: False/0 selects the
    # non-standard table, True/1 the standard one.
    self.types_map[strict][ext] = type
    exts = self.types_map_inv[strict].setdefault(type, [])
    # Keep the inverse list free of duplicates while preserving the
    # order in which extensions were first registered.
    if ext not in exts:
        exts.append(ext)
def guess_type(self, url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if
    the type can't be guessed (no or unknown suffix) or a string
    of the form type/subtype, usable for a MIME Content-type
    header; and encoding is None for no encoding or the name of
    the program used to encode (e.g. compress or gzip).  The
    mappings are table driven.  Encoding suffixes are case
    sensitive; type suffixes are first tried case sensitive, then
    case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all
    mapped to '.tar.gz'.  (This is table-driven too, using the
    dictionary suffix_map.)

    Optional `strict' argument when False adds a bunch of commonly found,
    but non-standard types.
    """
    scheme, url = urllib.splittype(url)
    if scheme == 'data':
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # parameter := attribute "=" value
        # type/subtype defaults to "text/plain"
        comma = url.find(',')
        if comma < 0:
            # bad data URL: no separator between media type and data
            return None, None
        semi = url.find(';', 0, comma)
        if semi >= 0:
            type = url[:semi]
        else:
            type = url[:comma]
        # A leading "attr=value" parameter, or anything without a slash,
        # means the type/subtype was omitted.
        if '=' in type or '/' not in type:
            type = 'text/plain'
        return type, None  # never compressed, so encoding is None

    base, ext = posixpath.splitext(url)
    # Expand shorthand suffixes first so the encoding suffix can then be
    # peeled off separately.
    while ext in self.suffix_map:
        base, ext = posixpath.splitext(base + self.suffix_map[ext])
    if ext in self.encodings_map:
        encoding = self.encodings_map[ext]
        base, ext = posixpath.splitext(base)
    else:
        encoding = None

    # Standard types always win; the non-standard table is consulted only
    # for lenient lookups.  Within each table the exact-case suffix is
    # tried before the lower-cased one.
    tables = [self.types_map[True]]
    if not strict:
        tables.append(self.types_map[False])
    for table in tables:
        for candidate in (ext, ext.lower()):
            if candidate in table:
                return table[candidate], encoding
    return None, encoding
def guess_all_extensions(self, type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data stream,
    but would be mapped to the MIME type `type' by guess_type().  The
    list is empty when nothing is known about `type'.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    type = type.lower()
    # Copy the list: appending the non-standard extensions to the stored
    # one would leak them into every later strict lookup.
    extensions = list(self.types_map_inv[True].get(type, []))
    if not strict:
        for ext in self.types_map_inv[False].get(type, []):
            if ext not in extensions:
                extensions.append(ext)
    return extensions
def guess_extension(self, type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension,
    including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', None
    is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    extensions = self.guess_all_extensions(type, strict)
    if not extensions:
        return None
    # The first entry is the extension that was registered first.
    return extensions[0]
def read(self, filename, strict=True):
    """
    Read a single mime.types-format file, specified by pathname.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    # The context manager closes the file even if parsing raises.
    with open(filename) as fp:
        self.readfp(fp, strict)
def readfp(self, fp, strict=True):
    """
    Read a single mime.types-format file.

    fp must provide readline().  Each non-empty line has the form
    "type ext1 ext2 ..."; a word starting with '#' begins a comment
    that runs to the end of the line.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    while True:
        line = fp.readline()
        if not line:
            break
        words = line.split()
        # Drop the first comment word and everything after it.
        for i, word in enumerate(words):
            if word.startswith('#'):
                del words[i:]
                break
        if not words:
            continue
        type, suffixes = words[0], words[1:]
        for suff in suffixes:
            self.add_type(type, '.' + suff, strict)
def read_windows_registry(self, strict=True):
    """
    Load the MIME types database from Windows registry.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    # NOTE(review): assumes the module binds _winreg to a false value when
    # the registry API is unavailable -- confirm against the import block.
    if not _winreg:
        return

    def enum_types(mimedb):
        # Yield every sub-key name of mimedb that can be encoded with the
        # default encoding; EnumKey raising marks the end of the listing.
        i = 0
        while True:
            try:
                ctype = _winreg.EnumKey(mimedb, i)
            except EnvironmentError:
                break
            try:
                ctype = ctype.encode(default_encoding)  # omit in 3.x!
            except UnicodeEncodeError:
                pass
            else:
                yield ctype
            i += 1

    default_encoding = sys.getdefaultencoding()
    with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT,
                         r'MIME\Database\Content Type') as mimedb:
        for ctype in enum_types(mimedb):
            with _winreg.OpenKey(mimedb, ctype) as key:
                try:
                    suffix, datatype = _winreg.QueryValueEx(key,
                                                            'Extension')
                except EnvironmentError:
                    # This content type has no 'Extension' value.
                    continue
                if datatype != _winreg.REG_SZ:
                    continue
                try:
                    suffix = suffix.encode(default_encoding)  # omit in 3.x!
                except UnicodeEncodeError:
                    continue
                self.add_type(ctype, suffix, strict)
def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    # The shared database is built on first use.
    if _db is None:
        init()
    return _db.guess_type(url, strict)
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', an
    empty list is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # The shared database is built on first use.
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)
def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # The shared database is built on first use.
    if _db is None:
        init()
    return _db.guess_extension(type, strict)
def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    # The shared database is built on first use.
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)
def init(files=None):
    """Build the shared database and publish its tables at module level.

    files -- list of mime.types-style files to parse; entries that are
             not existing regular files are skipped.  When None, the
             Windows registry is consulted (if available) and the
             module-level knownfiles list is used.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again
    db = MimeTypes()
    if files is None:
        # NOTE(review): assumes _winreg is false off Windows -- confirm
        # against the import block.
        if _winreg:
            db.read_windows_registry()
        files = knownfiles
    for path in files:
        if os.path.isfile(path):
            # Close each file deterministically instead of leaving the
            # handle to the garbage collector.
            with open(path) as fp:
                db.readfp(fp)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db
def read_mime_types(file):
    """Parse one mime.types-style file.

    Returns the standard-types dictionary (extension -> type) of a fresh
    MimeTypes instance after reading `file' into it, or None when the
    file cannot be opened.
    """
    try:
        f = open(file)
    except IOError:
        return None
    try:
        db = MimeTypes()
        db.readfp(f, True)
    finally:
        # Do not leave the handle open once parsing is done or has failed.
        f.close()
    return db.types_map[True]
378 def _default_mime_types():
397 # Before adding new types, make sure they are either registered with IANA,
398 # at http://www.isi.edu/in-notes/iana/assignments/media-types
399 # or extensions, i.e. using the x- prefix
401 # If you add to these, please keep them sorted!
403 '.a' : 'application/octet-stream',
404 '.ai' : 'application/postscript',
405 '.aif' : 'audio/x-aiff',
406 '.aifc' : 'audio/x-aiff',
407 '.aiff' : 'audio/x-aiff',
408 '.au' : 'audio/basic',
409 '.avi' : 'video/x-msvideo',
410 '.bat' : 'text/plain',
411 '.bcpio' : 'application/x-bcpio',
412 '.bin' : 'application/octet-stream',
413 '.bmp' : 'image/x-ms-bmp',
416 '.cdf' : 'application/x-cdf',
417 '.cdf' : 'application/x-netcdf',
418 '.cpio' : 'application/x-cpio',
419 '.csh' : 'application/x-csh',
421 '.dll' : 'application/octet-stream',
422 '.doc' : 'application/msword',
423 '.dot' : 'application/msword',
424 '.dvi' : 'application/x-dvi',
425 '.eml' : 'message/rfc822',
426 '.eps' : 'application/postscript',
427 '.etx' : 'text/x-setext',
428 '.exe' : 'application/octet-stream',
429 '.gif' : 'image/gif',
430 '.gtar' : 'application/x-gtar',
432 '.hdf' : 'application/x-hdf',
433 '.htm' : 'text/html',
434 '.html' : 'text/html',
435 '.ief' : 'image/ief',
436 '.jpe' : 'image/jpeg',
437 '.jpeg' : 'image/jpeg',
438 '.jpg' : 'image/jpeg',
439 '.js' : 'application/x-javascript',
440 '.ksh' : 'text/plain',
441 '.latex' : 'application/x-latex',
442 '.m1v' : 'video/mpeg',
443 '.man' : 'application/x-troff-man',
444 '.me' : 'application/x-troff-me',
445 '.mht' : 'message/rfc822',
446 '.mhtml' : 'message/rfc822',
447 '.mif' : 'application/x-mif',
448 '.mov' : 'video/quicktime',
449 '.movie' : 'video/x-sgi-movie',
450 '.mp2' : 'audio/mpeg',
451 '.mp3' : 'audio/mpeg',
452 '.mp4' : 'video/mp4',
453 '.mpa' : 'video/mpeg',
454 '.mpe' : 'video/mpeg',
455 '.mpeg' : 'video/mpeg',
456 '.mpg' : 'video/mpeg',
457 '.ms' : 'application/x-troff-ms',
458 '.nc' : 'application/x-netcdf',
459 '.nws' : 'message/rfc822',
460 '.o' : 'application/octet-stream',
461 '.obj' : 'application/octet-stream',
462 '.oda' : 'application/oda',
463 '.p12' : 'application/x-pkcs12',
464 '.p7c' : 'application/pkcs7-mime',
465 '.pbm' : 'image/x-portable-bitmap',
466 '.pdf' : 'application/pdf',
467 '.pfx' : 'application/x-pkcs12',
468 '.pgm' : 'image/x-portable-graymap',
469 '.pl' : 'text/plain',
470 '.png' : 'image/png',
471 '.pnm' : 'image/x-portable-anymap',
472 '.pot' : 'application/vnd.ms-powerpoint',
473 '.ppa' : 'application/vnd.ms-powerpoint',
474 '.ppm' : 'image/x-portable-pixmap',
475 '.pps' : 'application/vnd.ms-powerpoint',
476 '.ppt' : 'application/vnd.ms-powerpoint',
477 '.ps' : 'application/postscript',
478 '.pwz' : 'application/vnd.ms-powerpoint',
479 '.py' : 'text/x-python',
480 '.pyc' : 'application/x-python-code',
481 '.pyo' : 'application/x-python-code',
482 '.qt' : 'video/quicktime',
483 '.ra' : 'audio/x-pn-realaudio',
484 '.ram' : 'application/x-pn-realaudio',
485 '.ras' : 'image/x-cmu-raster',
486 '.rdf' : 'application/xml',
487 '.rgb' : 'image/x-rgb',
488 '.roff' : 'application/x-troff',
489 '.rtx' : 'text/richtext',
490 '.sgm' : 'text/x-sgml',
491 '.sgml' : 'text/x-sgml',
492 '.sh' : 'application/x-sh',
493 '.shar' : 'application/x-shar',
494 '.snd' : 'audio/basic',
495 '.so' : 'application/octet-stream',
496 '.src' : 'application/x-wais-source',
497 '.sv4cpio': 'application/x-sv4cpio',
498 '.sv4crc' : 'application/x-sv4crc',
499 '.swf' : 'application/x-shockwave-flash',
500 '.t' : 'application/x-troff',
501 '.tar' : 'application/x-tar',
502 '.tcl' : 'application/x-tcl',
503 '.tex' : 'application/x-tex',
504 '.texi' : 'application/x-texinfo',
505 '.texinfo': 'application/x-texinfo',
506 '.tif' : 'image/tiff',
507 '.tiff' : 'image/tiff',
508 '.tr' : 'application/x-troff',
509 '.tsv' : 'text/tab-separated-values',
510 '.txt' : 'text/plain',
511 '.ustar' : 'application/x-ustar',
512 '.vcf' : 'text/x-vcard',
513 '.wav' : 'audio/x-wav',
514 '.wiz' : 'application/msword',
515 '.wsdl' : 'application/xml',
516 '.xbm' : 'image/x-xbitmap',
517 '.xlb' : 'application/vnd.ms-excel',
519 '.xls' : 'application/excel',
520 '.xls' : 'application/vnd.ms-excel',
522 '.xpdl' : 'application/xml',
523 '.xpm' : 'image/x-xpixmap',
524 '.xsl' : 'application/xml',
525 '.xwd' : 'image/x-xwindowdump',
526 '.zip' : 'application/zip',
529 # These are non-standard types, commonly found in the wild. They will
530 # only match if strict=0 flag is given to the API methods.
532 # Please sort these too
534 '.jpg' : 'image/jpg',
535 '.mid' : 'audio/midi',
536 '.midi': 'audio/midi',
537 '.pct' : 'image/pict',
538 '.pic' : 'image/pict',
539 '.pict': 'image/pict',
540 '.rtf' : 'application/rtf',
545 _default_mime_types()
if __name__ == '__main__':
    # Command-line front end: guess a type (or, with -e, an extension)
    # for every positional argument.
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        # Print the help text, plus an optional error message, then exit
        # with the given status code.
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % gtype)
            else:
                print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % gtype)
            else:
                print('type: %s encoding: %s' % (guess, encoding))