Merged revisions 83951 via svnmerge from
[python/dscho.git] / Lib / mimetypes.py
blobf0da453da05e8ea7cd5f915dc8edf31cf42bd627
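"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=True) -- guess the extension for a given MIME type.

Two related helpers are also provided:

guess_all_extensions(type, strict=True) -- list every known extension for a
    given MIME type.

add_type(type, ext, strict=True) -- register a mapping between a MIME type
    and an extension.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types
common_types -- dictionary mapping suffixes to non-standard types

Classes:

MimeTypes -- datastore holding the mappings; the functions above delegate
    to a shared module-level instance.

Functions:

init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None
"""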
25 import os
26 import posixpath
27 import urllib.parse
# Names exported by ``from mimetypes import *``.  Besides the helper
# functions this lists the MimeTypes class and the tuning data that the
# module docstring documents as public, so star-imports can reach them too.
__all__ = [
    "guess_type",
    "guess_extension",
    "guess_all_extensions",
    "add_type",
    "read_mime_types",
    "init",
    "MimeTypes",
    "knownfiles",
    "inited",
    "suffix_map",
    "encodings_map",
    "types_map",
    "common_types",
]
# Candidate mime.types files, parsed in order by init(); only the ones that
# exist on disk are read.
# NOTE(review): "/usr/local/etc/httpd/conf/mime.types" is listed twice.  The
# duplicate is kept on purpose so the parse order (and therefore the order in
# which extensions are recorded) stays exactly as before.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
]

# Set to True by init(); MimeTypes.__init__() checks it to decide whether
# init() still has to run.
inited = False

# Shared MimeTypes instance used by the module-level helper functions;
# assigned by init().
_db = None
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dict mapping suffixes to encoding names
    suffix_map -- dict mapping suffixes to replacement suffixes
    types_map -- pair of dicts (non-strict, strict) mapping suffix -> type
    types_map_inv -- pair of dicts (non-strict, strict) mapping
        type -> list of suffixes
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded from the module-level tables.

        The module-level suffix/encoding maps are copied, the module-level
        types_map and common_types are loaded as strict and non-strict
        entries respectively, and then every file named in `filenames' is
        parsed with read() using the given `strict' flag.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for ext, mime_type in types_map.items():
            self.add_type(mime_type, ext, True)
        for ext, mime_type in common_types.items():
            self.add_type(mime_type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # `strict' doubles as the index into the (non-strict, strict) pairs.
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        # Split off a leading "scheme:" prefix.  A scheme is the non-empty
        # text before the first ':' provided it contains no '/'.  This
        # replaces the deprecated urllib.parse.splittype() helper.
        head, colon, rest = url.partition(':')
        if colon and head and '/' not in head:
            scheme, url = head.lower(), rest
        else:
            scheme = None

        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None  # never compressed, so encoding is None

        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # trailing suffix is no longer an alias.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None

        # The strict table is always consulted first; the non-strict table
        # is only a fallback when the caller asked for lenient matching.
        tables = [self.types_map[True]]
        if not strict:
            tables.append(self.types_map[False])
        for table in tables:
            if ext in table:
                return table[ext], encoding
            lowered = ext.lower()
            if lowered in table:
                return table[lowered], encoding
        return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().  The
        list is empty when nothing is known about `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Work on a copy: appending the non-strict extensions to the stored
        # list would leak them into every later strict lookup, and callers
        # must not be able to mutate the datastore through the result.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # The context manager closes the file even if parsing raises.
        with open(filename) as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method.  Each line holds a MIME type
        followed by zero or more suffixes (without the leading dot); a word
        starting with '#' begins a comment that runs to the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first comment word and everything after it.
            for index, word in enumerate(words):
                if word[0] == '#':
                    del words[index:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)
def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    The result is a (type, encoding) tuple.  `type' is a string of the
    form type/subtype, usable for a MIME Content-type header, or None
    when it can't be guessed (no or unknown suffix).  `encoding' is the
    name of the program used to encode (e.g. compress or gzip), or None
    for no encoding.  The mappings are table driven.  Encoding suffixes
    are case sensitive; type suffixes are first tried case sensitive,
    then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.

    The lookup is delegated to the shared module-level database, which is
    created through init() on first use.
    """
    if _db is None:
        init()
    # Read the global again: init() rebinds it.
    database = _db
    return database.guess_type(url, strict)
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', the
    list is empty (use guess_extension() to get None instead).

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.

    The lookup is delegated to the shared module-level database, which is
    created through init() on first use.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)
def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    The result is a single filename extension including the leading
    dot ('.'), or None if no extension can be guessed for `type'.  The
    extension is not guaranteed to have been associated with any
    particular data stream, but would be mapped to the MIME type `type'
    by guess_type().

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.

    The lookup is delegated to the shared module-level database, which is
    created through init() on first use.
    """
    if _db is None:
        init()
    # Read the global again: init() rebinds it.
    database = _db
    return database.guess_extension(type, strict)
def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    A known extension gets its type replaced by the new one; a known
    type gets the extension appended to its list of extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.

    The mapping is stored in the shared module-level database, which is
    created through init() on first use.
    """
    if _db is None:
        init()
    # Read the global again: init() rebinds it.
    database = _db
    return database.add_type(type, ext, strict)
def init(files=None):
    """(Re)build the module-level MIME database.

    A fresh MimeTypes instance is created, every entry of `files' that
    exists as a regular file is parsed into it (default: knownfiles), and
    its maps are then published as the module globals encodings_map,
    suffix_map, types_map and common_types.  The instance itself becomes
    the shared database used by the module-level helper functions.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for file in files:
        if os.path.isfile(file):
            # Close each file as soon as it has been parsed instead of
            # leaving the handle for the garbage collector.
            with open(file) as fp:
                db.readfp(fp)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db
def read_mime_types(file):
    """Parse one mime.types-format file and return its strict type map.

    `file' is the pathname to open.  Returns None when the file cannot be
    opened; otherwise returns the strict suffix -> type dictionary of a
    fresh MimeTypes instance after the file has been parsed into it.
    """
    try:
        f = open(file)
    except IOError:
        return None
    # Ensure the handle is closed once parsing finishes or fails.
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]
def _default_mime_types():
    """Install the built-in default tables as module globals.

    Sets suffix_map, encodings_map, types_map and common_types to fresh
    dictionaries holding the defaults listed below.
    """
    global suffix_map
    global encodings_map
    global types_map
    global common_types

    # Shorthand suffixes and the compound suffix each one stands for.
    suffix_map = {
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
    }

    # Suffixes that name an encoding rather than a type.
    encodings_map = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
    }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.isi.edu/in-notes/iana/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted!
    types_map = {
        '.a': 'application/octet-stream',
        '.ai': 'application/postscript',
        '.aif': 'audio/x-aiff',
        '.aifc': 'audio/x-aiff',
        '.aiff': 'audio/x-aiff',
        '.au': 'audio/basic',
        '.avi': 'video/x-msvideo',
        '.bat': 'text/plain',
        '.bcpio': 'application/x-bcpio',
        '.bin': 'application/octet-stream',
        '.bmp': 'image/x-ms-bmp',
        '.c': 'text/plain',
        # '.cdf' used to be listed twice; the earlier 'application/x-cdf'
        # entry was always overwritten by this one, so only it is kept.
        '.cdf': 'application/x-netcdf',
        '.cpio': 'application/x-cpio',
        '.csh': 'application/x-csh',
        '.css': 'text/css',
        '.dll': 'application/octet-stream',
        '.doc': 'application/msword',
        '.dot': 'application/msword',
        '.dvi': 'application/x-dvi',
        '.eml': 'message/rfc822',
        '.eps': 'application/postscript',
        '.etx': 'text/x-setext',
        '.exe': 'application/octet-stream',
        '.gif': 'image/gif',
        '.gtar': 'application/x-gtar',
        '.h': 'text/plain',
        '.hdf': 'application/x-hdf',
        '.htm': 'text/html',
        '.html': 'text/html',
        '.ief': 'image/ief',
        '.jpe': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.jpg': 'image/jpeg',
        '.js': 'application/x-javascript',
        '.ksh': 'text/plain',
        '.latex': 'application/x-latex',
        '.m1v': 'video/mpeg',
        '.man': 'application/x-troff-man',
        '.me': 'application/x-troff-me',
        '.mht': 'message/rfc822',
        '.mhtml': 'message/rfc822',
        '.mif': 'application/x-mif',
        '.mov': 'video/quicktime',
        '.movie': 'video/x-sgi-movie',
        '.mp2': 'audio/mpeg',
        '.mp3': 'audio/mpeg',
        '.mp4': 'video/mp4',
        '.mpa': 'video/mpeg',
        '.mpe': 'video/mpeg',
        '.mpeg': 'video/mpeg',
        '.mpg': 'video/mpeg',
        '.ms': 'application/x-troff-ms',
        '.nc': 'application/x-netcdf',
        '.nws': 'message/rfc822',
        '.o': 'application/octet-stream',
        '.obj': 'application/octet-stream',
        '.oda': 'application/oda',
        '.p12': 'application/x-pkcs12',
        '.p7c': 'application/pkcs7-mime',
        '.pbm': 'image/x-portable-bitmap',
        '.pdf': 'application/pdf',
        '.pfx': 'application/x-pkcs12',
        '.pgm': 'image/x-portable-graymap',
        '.pl': 'text/plain',
        '.png': 'image/png',
        '.pnm': 'image/x-portable-anymap',
        '.pot': 'application/vnd.ms-powerpoint',
        '.ppa': 'application/vnd.ms-powerpoint',
        '.ppm': 'image/x-portable-pixmap',
        '.pps': 'application/vnd.ms-powerpoint',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.ps': 'application/postscript',
        '.pwz': 'application/vnd.ms-powerpoint',
        '.py': 'text/x-python',
        '.pyc': 'application/x-python-code',
        '.pyo': 'application/x-python-code',
        '.qt': 'video/quicktime',
        '.ra': 'audio/x-pn-realaudio',
        '.ram': 'application/x-pn-realaudio',
        '.ras': 'image/x-cmu-raster',
        '.rdf': 'application/xml',
        '.rgb': 'image/x-rgb',
        '.roff': 'application/x-troff',
        '.rtx': 'text/richtext',
        '.sgm': 'text/x-sgml',
        '.sgml': 'text/x-sgml',
        '.sh': 'application/x-sh',
        '.shar': 'application/x-shar',
        '.snd': 'audio/basic',
        '.so': 'application/octet-stream',
        '.src': 'application/x-wais-source',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc': 'application/x-sv4crc',
        '.swf': 'application/x-shockwave-flash',
        '.t': 'application/x-troff',
        '.tar': 'application/x-tar',
        '.tcl': 'application/x-tcl',
        '.tex': 'application/x-tex',
        '.texi': 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.tif': 'image/tiff',
        '.tiff': 'image/tiff',
        '.tr': 'application/x-troff',
        '.tsv': 'text/tab-separated-values',
        '.txt': 'text/plain',
        '.ustar': 'application/x-ustar',
        '.vcf': 'text/x-vcard',
        '.wav': 'audio/x-wav',
        '.wiz': 'application/msword',
        '.wsdl': 'application/xml',
        '.xbm': 'image/x-xbitmap',
        '.xlb': 'application/vnd.ms-excel',
        # '.xls' used to be listed twice; the earlier 'application/excel'
        # entry was always overwritten by this one, so only it is kept.
        '.xls': 'application/vnd.ms-excel',
        '.xml': 'text/xml',
        '.xpdl': 'application/xml',
        '.xpm': 'image/x-xpixmap',
        '.xsl': 'application/xml',
        '.xwd': 'image/x-xwindowdump',
        '.zip': 'application/zip',
    }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = {
        '.jpg': 'image/jpg',
        '.mid': 'audio/midi',
        '.midi': 'audio/midi',
        '.pct': 'image/pict',
        '.pic': 'image/pict',
        '.pict': 'image/pict',
        '.rtf': 'application/rtf',
        '.xul': 'text/xul',
    }


# Populate the module-level tables at import time.
_default_mime_types()
if __name__ == '__main__':
    # Command-line interface: guess the type (or, with -e, the extension)
    # for every positional argument.
    import sys
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        """Print the usage text, then `msg' if given, and exit with `code'."""
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'hle',
            ['help', 'lenient', 'extension'],
        )
    except getopt.error as msg:
        usage(1, msg)

    # Defaults: strict matching, guess the type rather than the extension.
    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1

    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if guess:
                print(guess)
            else:
                print("I don't know anything about type", gtype)
        else:
            guess, encoding = guess_type(gtype, strict)
            if guess:
                print('type:', guess, 'encoding:', encoding)
            else:
                print("I don't know anything about type", gtype)