Patch by Jeremy Katz (SF #1609407)
[python.git] / Lib / mimetypes.py
blobb0d2f181728d105e12f322c7e8bcd7eb1bc27df5
1 """Guess the MIME type of a file.
3 This module defines two useful functions:
5 guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
7 guess_extension(type, strict=1) -- guess the extension for a given MIME type.
9 It also contains the following, for tuning the behavior:
11 Data:
13 knownfiles -- list of files to parse
14 inited -- flag set when init() has been called
15 suffix_map -- dictionary mapping suffixes to suffixes
16 encodings_map -- dictionary mapping suffixes to encodings
17 types_map -- dictionary mapping suffixes to types
19 Functions:
21 init([files]) -- parse a list of files, default knownfiles
22 read_mime_types(file) -- parse one file, return a dictionary or None
23 """
25 import os
26 import posixpath
27 import urllib
# Public names exported by ``from mimetypes import *``.
__all__ = [
    "guess_type",
    "guess_extension",
    "guess_all_extensions",
    "add_type",
    "read_mime_types",
    "init",
]
# Candidate mime.types-format files.  init() walks this list in order and
# parses every entry that exists as a regular file.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2 (same path as two lines up)
    "/usr/local/etc/mime.types",                # Apache 1.3
]

# Flipped to True by init().
inited = False
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dictionary mapping suffixes to encodings
    suffix_map    -- dictionary mapping suffixes to suffixes
    types_map     -- pair of suffix -> type dictionaries, indexed by the
                     strict flag (False: non-standard, True: standard)
    types_map_inv -- pair of type -> list-of-suffixes dictionaries,
                     indexed the same way
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded from the module-level tables.

        The module tables are initialised first if init() has not run
        yet.  Every file named in `filenames' is then parsed with
        read(), using the given `strict' flag.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, mime_type) in types_map.items():
            self.add_type(mime_type, ext, True)
        for (ext, mime_type) in common_types.items():
            self.add_type(mime_type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mediatype = url[:semi]
            else:
                mediatype = url[:comma]
            if '=' in mediatype or '/' not in mediatype:
                mediatype = 'text/plain'
            return mediatype, None  # never compressed, so encoding is None

        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # trailing suffix is no longer a key of suffix_map.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            # The type is decided by the suffix in front of the encoding one.
            base, ext = posixpath.splitext(base)
        else:
            encoding = None

        lowered = ext.lower()
        standard = self.types_map[True]
        if ext in standard:
            return standard[ext], encoding
        if lowered in standard:
            return standard[lowered], encoding
        if strict:
            return None, encoding
        common = self.types_map[False]
        if ext in common:
            return common[ext], encoding
        if lowered in common:
            return common[lowered], encoding
        return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        The list is empty when nothing is known about `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Work on a copy: appending to the list stored in types_map_inv[True]
        # would leak non-standard suffixes into every later strict lookup.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        fp = open(filename)
        try:
            self.readfp(fp, strict)
        finally:
            # Release the handle even when parsing fails.
            fp.close()

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method.  Each line holds a type
        followed by zero or more suffixes (without the leading dot); a
        word starting with '#' begins a comment that runs to the end of
        the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first word that starts a comment, and everything after.
            for i, word in enumerate(words):
                if word.startswith('#'):
                    del words[i:]
                    break
            if not words:
                continue
            mime_type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(mime_type, '.' + suff, strict)
def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    The result is a (type, encoding) tuple.  `type' is a string of the
    form type/subtype, usable for a MIME Content-type header, or None
    when it can't be guessed (no or unknown suffix).  `encoding' is the
    name of the program used to encode (e.g. compress or gzip), or None
    for no encoding.

    The mappings are table driven.  Encoding suffixes are case
    sensitive; type suffixes are first tried case sensitive, then case
    insensitive.  The suffixes .tgz, .taz and .tz (case sensitive!) are
    all mapped to ".tar.gz", again through a table (the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    # init() rebinds the module-level name guess_type to a method of the
    # MimeTypes instance it creates, so the call below goes to that
    # instance rather than back into this function.
    init()
    return guess_type(url, strict)
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', the list
    is empty.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # init() rebinds the module-level name guess_all_extensions to a method
    # of the MimeTypes instance it creates, so the call below goes to that
    # instance rather than back into this function.
    init()
    return guess_all_extensions(type, strict)
def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    The result is a single filename extension, leading dot ('.')
    included, or None when no extension can be guessed for `type'.
    The extension is not guaranteed to have been associated with any
    particular data stream, but would be mapped to the MIME type `type'
    by guess_type().

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    # init() rebinds the module-level name guess_extension to a method of
    # the MimeTypes instance it creates, so the call below goes to that
    # instance rather than back into this function.
    init()
    return guess_extension(type, strict)
def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    An extension that is already known gets the new type in place of
    the old one.  A type that is already known gets the extension added
    to its list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    # init() rebinds the module-level name add_type to a method of the
    # MimeTypes instance it creates, so the call below goes to that
    # instance rather than back into this function.
    init()
    return add_type(type, ext, strict)
def init(files=None):
    """Build the shared MimeTypes instance and publish it module-wide.

    `files' is a list of mime.types-format pathnames; when it is None,
    knownfiles is used.  Entries that are not regular files are skipped.

    On completion the module-level tables (encodings_map, suffix_map,
    types_map, common_types) and functions (guess_type, guess_extension,
    guess_all_extensions, add_type) all refer to the new instance.
    """
    global guess_all_extensions, guess_extension, guess_type
    global suffix_map, types_map, encodings_map, common_types
    global add_type, inited
    # Must be set before MimeTypes() is constructed: its __init__ calls
    # init() whenever the flag is still false.
    inited = True
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for path in files:
        if os.path.isfile(path):
            fp = open(path)
            try:
                db.readfp(fp)
            finally:
                # Don't leave one open handle behind per parsed file.
                fp.close()
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    guess_all_extensions = db.guess_all_extensions
    guess_extension = db.guess_extension
    guess_type = db.guess_type
    add_type = db.add_type
    common_types = db.types_map[False]
def read_mime_types(file):
    """Parse one mime.types-format file, given by pathname.

    Returns None when the file cannot be opened.  Otherwise returns the
    standard (strict) suffix -> type dictionary of a fresh MimeTypes
    instance after the file has been read into it, so the result holds
    the instance's initial standard entries as well as the file's.
    """
    try:
        f = open(file)
    except IOError:
        return None
    try:
        db = MimeTypes()
        db.readfp(f, True)
    finally:
        # Close the handle whether or not parsing succeeded.
        f.close()
    return db.types_map[True]
def _default_mime_types():
    """(Re)create the built-in module-level lookup tables.

    Assigns fresh dictionaries to the globals suffix_map, encodings_map,
    types_map and common_types.
    """
    global suffix_map
    global encodings_map
    global types_map
    global common_types

    suffix_map = {
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
    }

    encodings_map = {
        '.gz': 'gzip',
        '.Z': 'compress',
    }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.isi.edu/in-notes/iana/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted!
    types_map = {
        '.a': 'application/octet-stream',
        '.ai': 'application/postscript',
        '.aif': 'audio/x-aiff',
        '.aifc': 'audio/x-aiff',
        '.aiff': 'audio/x-aiff',
        '.au': 'audio/basic',
        '.avi': 'video/x-msvideo',
        '.bat': 'text/plain',
        '.bcpio': 'application/x-bcpio',
        '.bin': 'application/octet-stream',
        '.bmp': 'image/x-ms-bmp',
        '.c': 'text/plain',
        # '.cdf' is also claimed by 'application/x-cdf'.  A dictionary keeps
        # one value per key, so only the mapping below is in effect.
        '.cdf': 'application/x-netcdf',
        '.cpio': 'application/x-cpio',
        '.csh': 'application/x-csh',
        '.css': 'text/css',
        '.dll': 'application/octet-stream',
        '.doc': 'application/msword',
        '.dot': 'application/msword',
        '.dvi': 'application/x-dvi',
        '.eml': 'message/rfc822',
        '.eps': 'application/postscript',
        '.etx': 'text/x-setext',
        '.exe': 'application/octet-stream',
        '.gif': 'image/gif',
        '.gtar': 'application/x-gtar',
        '.h': 'text/plain',
        '.hdf': 'application/x-hdf',
        '.htm': 'text/html',
        '.html': 'text/html',
        '.ief': 'image/ief',
        '.jpe': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.jpg': 'image/jpeg',
        '.js': 'application/x-javascript',
        '.ksh': 'text/plain',
        '.latex': 'application/x-latex',
        '.m1v': 'video/mpeg',
        '.man': 'application/x-troff-man',
        '.me': 'application/x-troff-me',
        '.mht': 'message/rfc822',
        '.mhtml': 'message/rfc822',
        '.mif': 'application/x-mif',
        '.mov': 'video/quicktime',
        '.movie': 'video/x-sgi-movie',
        '.mp2': 'audio/mpeg',
        '.mp3': 'audio/mpeg',
        '.mpa': 'video/mpeg',
        '.mpe': 'video/mpeg',
        '.mpeg': 'video/mpeg',
        '.mpg': 'video/mpeg',
        '.ms': 'application/x-troff-ms',
        '.nc': 'application/x-netcdf',
        '.nws': 'message/rfc822',
        '.o': 'application/octet-stream',
        '.obj': 'application/octet-stream',
        '.oda': 'application/oda',
        '.p12': 'application/x-pkcs12',
        '.p7c': 'application/pkcs7-mime',
        '.pbm': 'image/x-portable-bitmap',
        '.pdf': 'application/pdf',
        '.pfx': 'application/x-pkcs12',
        '.pgm': 'image/x-portable-graymap',
        '.pl': 'text/plain',
        '.png': 'image/png',
        '.pnm': 'image/x-portable-anymap',
        '.pot': 'application/vnd.ms-powerpoint',
        '.ppa': 'application/vnd.ms-powerpoint',
        '.ppm': 'image/x-portable-pixmap',
        '.pps': 'application/vnd.ms-powerpoint',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.ps': 'application/postscript',
        '.pwz': 'application/vnd.ms-powerpoint',
        '.py': 'text/x-python',
        '.pyc': 'application/x-python-code',
        '.pyo': 'application/x-python-code',
        '.qt': 'video/quicktime',
        '.ra': 'audio/x-pn-realaudio',
        '.ram': 'application/x-pn-realaudio',
        '.ras': 'image/x-cmu-raster',
        '.rdf': 'application/xml',
        '.rgb': 'image/x-rgb',
        '.roff': 'application/x-troff',
        '.rtx': 'text/richtext',
        '.sgm': 'text/x-sgml',
        '.sgml': 'text/x-sgml',
        '.sh': 'application/x-sh',
        '.shar': 'application/x-shar',
        '.snd': 'audio/basic',
        '.so': 'application/octet-stream',
        '.src': 'application/x-wais-source',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc': 'application/x-sv4crc',
        '.swf': 'application/x-shockwave-flash',
        '.t': 'application/x-troff',
        '.tar': 'application/x-tar',
        '.tcl': 'application/x-tcl',
        '.tex': 'application/x-tex',
        '.texi': 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.tif': 'image/tiff',
        '.tiff': 'image/tiff',
        '.tr': 'application/x-troff',
        '.tsv': 'text/tab-separated-values',
        '.txt': 'text/plain',
        '.ustar': 'application/x-ustar',
        '.vcf': 'text/x-vcard',
        '.wav': 'audio/x-wav',
        '.wiz': 'application/msword',
        '.wsdl': 'application/xml',
        '.xbm': 'image/x-xbitmap',
        '.xlb': 'application/vnd.ms-excel',
        # '.xls' is also claimed by 'application/excel'.  A dictionary keeps
        # one value per key, so only the mapping below is in effect.
        '.xls': 'application/vnd.ms-excel',
        '.xml': 'text/xml',
        '.xpdl': 'application/xml',
        '.xpm': 'image/x-xpixmap',
        '.xsl': 'application/xml',
        '.xwd': 'image/x-xwindowdump',
        '.zip': 'application/zip',
    }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = {
        '.jpg': 'image/jpg',
        '.mid': 'audio/midi',
        '.midi': 'audio/midi',
        '.pct': 'image/pict',
        '.pic': 'image/pict',
        '.pict': 'image/pict',
        '.rtf': 'application/rtf',
        '.xul': 'text/xul',
    }


# Populate the module-level tables at import time.
_default_mime_types()
if __name__ == '__main__':
    # Command-line front end: guess a type (or, with -e, an extension) for
    # each argument and print the result.
    import sys
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        """Print the usage text, then `msg' if non-empty, and exit with `code'."""
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error:
        # Hand the exception object itself to usage(), which prints it.
        usage(1, sys.exc_info()[1])

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1

    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if guess:
                print(guess)
            else:
                print(' '.join(["I don't know anything about type", str(gtype)]))
        else:
            guess, encoding = guess_type(gtype, strict)
            if guess:
                print(' '.join(['type:', str(guess), 'encoding:', str(encoding)]))
            else:
                print(' '.join(["I don't know anything about type", str(gtype)]))