# Lib/urllib2.py (git blob 6bfbc29772cee4b4fe65339beee4f9cdecaf98aa)
# NOTE: the changelog line "Added new optional credentials argument to
# SMTPHandler.__init__, and smtp.login(...)" belongs to an unrelated
# commit and is not part of this module.
"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib.  pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of URL.  Raises a URLError (subclass of
IOError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the argument is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:
OpenerDirector --

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

exceptions:
URLError -- A subclass of IOError, individual protocols have their own
specific subclass.

HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
as an exceptional event or valid response.

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib2

# set up authentication info
authinfo = urllib2.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)

# install it
urllib2.install_opener(opener)

f = urllib2.urlopen('http://www.python.org/')
"""
# XXX issues:
# If an authentication error handler that tries to perform
# authentication for some reason but fails, how should the error be
# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows that the problem was, e.g., that it didn't know
# the hash algorithm that was requested in the challenge, it would be
# good to pass that information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener
import base64
import bisect
import hashlib
import httplib
import mimetools
import os
import posixpath
import random
import re
import socket
import sys
import time
import urlparse

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from urllib import (unwrap, unquote, splittype, splithost, quote,
     addinfourl, splitport, splitgophertype, splitquery,
     splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)

# support for FileHandler, proxies via environment variables
from urllib import localhost, url2pathname, getproxies

# used in User-Agent header sent
__version__ = sys.version[:3]
119 _opener = None
120 def urlopen(url, data=None):
121 global _opener
122 if _opener is None:
123 _opener = build_opener()
124 return _opener.open(url, data)
126 def install_opener(opener):
127 global _opener
128 _opener = opener
130 # do these error classes make sense?
131 # make sure all of the IOError stuff is overridden. we just want to be
132 # subtypes.
134 class URLError(IOError):
135 # URLError is a sub-type of IOError, but it doesn't share any of
136 # the implementation. need to override __init__ and __str__.
137 # It sets self.args for compatibility with other EnvironmentError
138 # subclasses, but args doesn't have the typical format with errno in
139 # slot 0 and strerror in slot 1. This may be better than nothing.
140 def __init__(self, reason):
141 self.args = reason,
142 self.reason = reason
144 def __str__(self):
145 return '<urlopen error %s>' % self.reason
147 class HTTPError(URLError, addinfourl):
148 """Raised when HTTP error occurs, but also acts like non-error return"""
149 __super_init = addinfourl.__init__
151 def __init__(self, url, code, msg, hdrs, fp):
152 self.code = code
153 self.msg = msg
154 self.hdrs = hdrs
155 self.fp = fp
156 self.filename = url
157 # The addinfourl classes depend on fp being a valid file
158 # object. In some cases, the HTTPError may not have a valid
159 # file object. If this happens, the simplest workaround is to
160 # not initialize the base classes.
161 if fp is not None:
162 self.__super_init(fp, hdrs, url)
164 def __str__(self):
165 return 'HTTP Error %s: %s' % (self.code, self.msg)
167 class GopherError(URLError):
168 pass
170 # copied from cookielib.py
171 _cut_port_re = re.compile(r":\d+$")
172 def request_host(request):
173 """Return request-host, as defined by RFC 2965.
175 Variation from RFC: returned value is lowercased, for convenient
176 comparison.
179 url = request.get_full_url()
180 host = urlparse.urlparse(url)[1]
181 if host == "":
182 host = request.get_header("Host", "")
184 # remove port, if present
185 host = _cut_port_re.sub("", host, 1)
186 return host.lower()
188 class Request:
190 def __init__(self, url, data=None, headers={},
191 origin_req_host=None, unverifiable=False):
192 # unwrap('<URL:type://host/path>') --> 'type://host/path'
193 self.__original = unwrap(url)
194 self.type = None
195 # self.__r_type is what's left after doing the splittype
196 self.host = None
197 self.port = None
198 self.data = data
199 self.headers = {}
200 for key, value in headers.items():
201 self.add_header(key, value)
202 self.unredirected_hdrs = {}
203 if origin_req_host is None:
204 origin_req_host = request_host(self)
205 self.origin_req_host = origin_req_host
206 self.unverifiable = unverifiable
208 def __getattr__(self, attr):
209 # XXX this is a fallback mechanism to guard against these
210 # methods getting called in a non-standard order. this may be
211 # too complicated and/or unnecessary.
212 # XXX should the __r_XXX attributes be public?
213 if attr[:12] == '_Request__r_':
214 name = attr[12:]
215 if hasattr(Request, 'get_' + name):
216 getattr(self, 'get_' + name)()
217 return getattr(self, attr)
218 raise AttributeError, attr
220 def get_method(self):
221 if self.has_data():
222 return "POST"
223 else:
224 return "GET"
226 # XXX these helper methods are lame
228 def add_data(self, data):
229 self.data = data
231 def has_data(self):
232 return self.data is not None
234 def get_data(self):
235 return self.data
237 def get_full_url(self):
238 return self.__original
240 def get_type(self):
241 if self.type is None:
242 self.type, self.__r_type = splittype(self.__original)
243 if self.type is None:
244 raise ValueError, "unknown url type: %s" % self.__original
245 return self.type
247 def get_host(self):
248 if self.host is None:
249 self.host, self.__r_host = splithost(self.__r_type)
250 if self.host:
251 self.host = unquote(self.host)
252 return self.host
254 def get_selector(self):
255 return self.__r_host
257 def set_proxy(self, host, type):
258 self.host, self.type = host, type
259 self.__r_host = self.__original
261 def get_origin_req_host(self):
262 return self.origin_req_host
264 def is_unverifiable(self):
265 return self.unverifiable
267 def add_header(self, key, val):
268 # useful for something like authentication
269 self.headers[key.capitalize()] = val
271 def add_unredirected_header(self, key, val):
272 # will not be added to a redirected request
273 self.unredirected_hdrs[key.capitalize()] = val
275 def has_header(self, header_name):
276 return (header_name in self.headers or
277 header_name in self.unredirected_hdrs)
279 def get_header(self, header_name, default=None):
280 return self.headers.get(
281 header_name,
282 self.unredirected_hdrs.get(header_name, default))
284 def header_items(self):
285 hdrs = self.unredirected_hdrs.copy()
286 hdrs.update(self.headers)
287 return hdrs.items()
289 class OpenerDirector:
290 def __init__(self):
291 client_version = "Python-urllib/%s" % __version__
292 self.addheaders = [('User-agent', client_version)]
293 # manage the individual handlers
294 self.handlers = []
295 self.handle_open = {}
296 self.handle_error = {}
297 self.process_response = {}
298 self.process_request = {}
300 def add_handler(self, handler):
301 added = False
302 for meth in dir(handler):
303 if meth in ["redirect_request", "do_open", "proxy_open"]:
304 # oops, coincidental match
305 continue
307 i = meth.find("_")
308 protocol = meth[:i]
309 condition = meth[i+1:]
311 if condition.startswith("error"):
312 j = condition.find("_") + i + 1
313 kind = meth[j+1:]
314 try:
315 kind = int(kind)
316 except ValueError:
317 pass
318 lookup = self.handle_error.get(protocol, {})
319 self.handle_error[protocol] = lookup
320 elif condition == "open":
321 kind = protocol
322 lookup = self.handle_open
323 elif condition == "response":
324 kind = protocol
325 lookup = self.process_response
326 elif condition == "request":
327 kind = protocol
328 lookup = self.process_request
329 else:
330 continue
332 handlers = lookup.setdefault(kind, [])
333 if handlers:
334 bisect.insort(handlers, handler)
335 else:
336 handlers.append(handler)
337 added = True
339 if added:
340 # the handlers must work in an specific order, the order
341 # is specified in a Handler attribute
342 bisect.insort(self.handlers, handler)
343 handler.add_parent(self)
345 def close(self):
346 # Only exists for backwards compatibility.
347 pass
349 def _call_chain(self, chain, kind, meth_name, *args):
350 # Handlers raise an exception if no one else should try to handle
351 # the request, or return None if they can't but another handler
352 # could. Otherwise, they return the response.
353 handlers = chain.get(kind, ())
354 for handler in handlers:
355 func = getattr(handler, meth_name)
357 result = func(*args)
358 if result is not None:
359 return result
361 def open(self, fullurl, data=None):
362 # accept a URL or a Request object
363 if isinstance(fullurl, basestring):
364 req = Request(fullurl, data)
365 else:
366 req = fullurl
367 if data is not None:
368 req.add_data(data)
370 protocol = req.get_type()
372 # pre-process request
373 meth_name = protocol+"_request"
374 for processor in self.process_request.get(protocol, []):
375 meth = getattr(processor, meth_name)
376 req = meth(req)
378 response = self._open(req, data)
380 # post-process response
381 meth_name = protocol+"_response"
382 for processor in self.process_response.get(protocol, []):
383 meth = getattr(processor, meth_name)
384 response = meth(req, response)
386 return response
388 def _open(self, req, data=None):
389 result = self._call_chain(self.handle_open, 'default',
390 'default_open', req)
391 if result:
392 return result
394 protocol = req.get_type()
395 result = self._call_chain(self.handle_open, protocol, protocol +
396 '_open', req)
397 if result:
398 return result
400 return self._call_chain(self.handle_open, 'unknown',
401 'unknown_open', req)
403 def error(self, proto, *args):
404 if proto in ('http', 'https'):
405 # XXX http[s] protocols are special-cased
406 dict = self.handle_error['http'] # https is not different than http
407 proto = args[2] # YUCK!
408 meth_name = 'http_error_%s' % proto
409 http_err = 1
410 orig_args = args
411 else:
412 dict = self.handle_error
413 meth_name = proto + '_error'
414 http_err = 0
415 args = (dict, proto, meth_name) + args
416 result = self._call_chain(*args)
417 if result:
418 return result
420 if http_err:
421 args = (dict, 'default', 'http_error_default') + orig_args
422 return self._call_chain(*args)
424 # XXX probably also want an abstract factory that knows when it makes
425 # sense to skip a superclass in favor of a subclass and when it might
426 # make sense to include both
428 def build_opener(*handlers):
429 """Create an opener object from a list of handlers.
431 The opener will use several default handlers, including support
432 for HTTP and FTP.
434 If any of the handlers passed as arguments are subclasses of the
435 default handlers, the default handlers will not be used.
437 import types
438 def isclass(obj):
439 return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__")
441 opener = OpenerDirector()
442 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
443 HTTPDefaultErrorHandler, HTTPRedirectHandler,
444 FTPHandler, FileHandler, HTTPErrorProcessor]
445 if hasattr(httplib, 'HTTPS'):
446 default_classes.append(HTTPSHandler)
447 skip = []
448 for klass in default_classes:
449 for check in handlers:
450 if isclass(check):
451 if issubclass(check, klass):
452 skip.append(klass)
453 elif isinstance(check, klass):
454 skip.append(klass)
455 for klass in skip:
456 default_classes.remove(klass)
458 for klass in default_classes:
459 opener.add_handler(klass())
461 for h in handlers:
462 if isclass(h):
463 h = h()
464 opener.add_handler(h)
465 return opener
467 class BaseHandler:
468 handler_order = 500
470 def add_parent(self, parent):
471 self.parent = parent
473 def close(self):
474 # Only exists for backwards compatibility
475 pass
477 def __lt__(self, other):
478 if not hasattr(other, "handler_order"):
479 # Try to preserve the old behavior of having custom classes
480 # inserted after default ones (works only for custom user
481 # classes which are not aware of handler_order).
482 return True
483 return self.handler_order < other.handler_order
486 class HTTPErrorProcessor(BaseHandler):
487 """Process HTTP error responses."""
488 handler_order = 1000 # after all other processing
490 def http_response(self, request, response):
491 code, msg, hdrs = response.code, response.msg, response.info()
493 # According to RFC 2616, "2xx" code indicates that the client's
494 # request was successfully received, understood, and accepted.
495 if not (200 <= code < 300):
496 response = self.parent.error(
497 'http', request, response, code, msg, hdrs)
499 return response
501 https_response = http_response
503 class HTTPDefaultErrorHandler(BaseHandler):
504 def http_error_default(self, req, fp, code, msg, hdrs):
505 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
507 class HTTPRedirectHandler(BaseHandler):
508 # maximum number of redirections to any single URL
509 # this is needed because of the state that cookies introduce
510 max_repeats = 4
511 # maximum total number of redirections (regardless of URL) before
512 # assuming we're in a loop
513 max_redirections = 10
515 def redirect_request(self, req, fp, code, msg, headers, newurl):
516 """Return a Request or None in response to a redirect.
518 This is called by the http_error_30x methods when a
519 redirection response is received. If a redirection should
520 take place, return a new Request to allow http_error_30x to
521 perform the redirect. Otherwise, raise HTTPError if no-one
522 else should try to handle this url. Return None if you can't
523 but another Handler might.
525 m = req.get_method()
526 if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
527 or code in (301, 302, 303) and m == "POST"):
528 # Strictly (according to RFC 2616), 301 or 302 in response
529 # to a POST MUST NOT cause a redirection without confirmation
530 # from the user (of urllib2, in this case). In practice,
531 # essentially all clients do redirect in this case, so we
532 # do the same.
533 # be conciliant with URIs containing a space
534 newurl = newurl.replace(' ', '%20')
535 return Request(newurl,
536 headers=req.headers,
537 origin_req_host=req.get_origin_req_host(),
538 unverifiable=True)
539 else:
540 raise HTTPError(req.get_full_url(), code, msg, headers, fp)
542 # Implementation note: To avoid the server sending us into an
543 # infinite loop, the request object needs to track what URLs we
544 # have already seen. Do this by adding a handler-specific
545 # attribute to the Request object.
546 def http_error_302(self, req, fp, code, msg, headers):
547 # Some servers (incorrectly) return multiple Location headers
548 # (so probably same goes for URI). Use first header.
549 if 'location' in headers:
550 newurl = headers.getheaders('location')[0]
551 elif 'uri' in headers:
552 newurl = headers.getheaders('uri')[0]
553 else:
554 return
555 newurl = urlparse.urljoin(req.get_full_url(), newurl)
557 # XXX Probably want to forget about the state of the current
558 # request, although that might interact poorly with other
559 # handlers that also use handler-specific request attributes
560 new = self.redirect_request(req, fp, code, msg, headers, newurl)
561 if new is None:
562 return
564 # loop detection
565 # .redirect_dict has a key url if url was previously visited.
566 if hasattr(req, 'redirect_dict'):
567 visited = new.redirect_dict = req.redirect_dict
568 if (visited.get(newurl, 0) >= self.max_repeats or
569 len(visited) >= self.max_redirections):
570 raise HTTPError(req.get_full_url(), code,
571 self.inf_msg + msg, headers, fp)
572 else:
573 visited = new.redirect_dict = req.redirect_dict = {}
574 visited[newurl] = visited.get(newurl, 0) + 1
576 # Don't close the fp until we are sure that we won't use it
577 # with HTTPError.
578 fp.read()
579 fp.close()
581 return self.parent.open(new)
583 http_error_301 = http_error_303 = http_error_307 = http_error_302
585 inf_msg = "The HTTP server returned a redirect error that would " \
586 "lead to an infinite loop.\n" \
587 "The last 30x error message was:\n"
590 def _parse_proxy(proxy):
591 """Return (scheme, user, password, host/port) given a URL or an authority.
593 If a URL is supplied, it must have an authority (host:port) component.
594 According to RFC 3986, having an authority component means the URL must
595 have two slashes after the scheme:
597 >>> _parse_proxy('file:/ftp.example.com/')
598 Traceback (most recent call last):
599 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
601 The first three items of the returned tuple may be None.
603 Examples of authority parsing:
605 >>> _parse_proxy('proxy.example.com')
606 (None, None, None, 'proxy.example.com')
607 >>> _parse_proxy('proxy.example.com:3128')
608 (None, None, None, 'proxy.example.com:3128')
610 The authority component may optionally include userinfo (assumed to be
611 username:password):
613 >>> _parse_proxy('joe:password@proxy.example.com')
614 (None, 'joe', 'password', 'proxy.example.com')
615 >>> _parse_proxy('joe:password@proxy.example.com:3128')
616 (None, 'joe', 'password', 'proxy.example.com:3128')
618 Same examples, but with URLs instead:
620 >>> _parse_proxy('http://proxy.example.com/')
621 ('http', None, None, 'proxy.example.com')
622 >>> _parse_proxy('http://proxy.example.com:3128/')
623 ('http', None, None, 'proxy.example.com:3128')
624 >>> _parse_proxy('http://joe:password@proxy.example.com/')
625 ('http', 'joe', 'password', 'proxy.example.com')
626 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
627 ('http', 'joe', 'password', 'proxy.example.com:3128')
629 Everything after the authority is ignored:
631 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
632 ('ftp', 'joe', 'password', 'proxy.example.com')
634 Test for no trailing '/' case:
636 >>> _parse_proxy('http://joe:password@proxy.example.com')
637 ('http', 'joe', 'password', 'proxy.example.com')
640 scheme, r_scheme = splittype(proxy)
641 if not r_scheme.startswith("/"):
642 # authority
643 scheme = None
644 authority = proxy
645 else:
646 # URL
647 if not r_scheme.startswith("//"):
648 raise ValueError("proxy URL with no authority: %r" % proxy)
649 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
650 # and 3.3.), path is empty or starts with '/'
651 end = r_scheme.find("/", 2)
652 if end == -1:
653 end = None
654 authority = r_scheme[2:end]
655 userinfo, hostport = splituser(authority)
656 if userinfo is not None:
657 user, password = splitpasswd(userinfo)
658 else:
659 user = password = None
660 return scheme, user, password, hostport
662 class ProxyHandler(BaseHandler):
663 # Proxies must be in front
664 handler_order = 100
666 def __init__(self, proxies=None):
667 if proxies is None:
668 proxies = getproxies()
669 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
670 self.proxies = proxies
671 for type, url in proxies.items():
672 setattr(self, '%s_open' % type,
673 lambda r, proxy=url, type=type, meth=self.proxy_open: \
674 meth(r, proxy, type))
676 def proxy_open(self, req, proxy, type):
677 orig_type = req.get_type()
678 proxy_type, user, password, hostport = _parse_proxy(proxy)
679 if proxy_type is None:
680 proxy_type = orig_type
681 if user and password:
682 user_pass = '%s:%s' % (unquote(user), unquote(password))
683 creds = base64.b64encode(user_pass).strip()
684 req.add_header('Proxy-authorization', 'Basic ' + creds)
685 hostport = unquote(hostport)
686 req.set_proxy(hostport, proxy_type)
687 if orig_type == proxy_type:
688 # let other handlers take care of it
689 return None
690 else:
691 # need to start over, because the other handlers don't
692 # grok the proxy's URL type
693 # e.g. if we have a constructor arg proxies like so:
694 # {'http': 'ftp://proxy.example.com'}, we may end up turning
695 # a request for http://acme.example.com/a into one for
696 # ftp://proxy.example.com/a
697 return self.parent.open(req)
699 class HTTPPasswordMgr:
701 def __init__(self):
702 self.passwd = {}
704 def add_password(self, realm, uri, user, passwd):
705 # uri could be a single URI or a sequence
706 if isinstance(uri, basestring):
707 uri = [uri]
708 if not realm in self.passwd:
709 self.passwd[realm] = {}
710 for default_port in True, False:
711 reduced_uri = tuple(
712 [self.reduce_uri(u, default_port) for u in uri])
713 self.passwd[realm][reduced_uri] = (user, passwd)
715 def find_user_password(self, realm, authuri):
716 domains = self.passwd.get(realm, {})
717 for default_port in True, False:
718 reduced_authuri = self.reduce_uri(authuri, default_port)
719 for uris, authinfo in domains.iteritems():
720 for uri in uris:
721 if self.is_suburi(uri, reduced_authuri):
722 return authinfo
723 return None, None
725 def reduce_uri(self, uri, default_port=True):
726 """Accept authority or URI and extract only the authority and path."""
727 # note HTTP URLs do not have a userinfo component
728 parts = urlparse.urlsplit(uri)
729 if parts[1]:
730 # URI
731 scheme = parts[0]
732 authority = parts[1]
733 path = parts[2] or '/'
734 else:
735 # host or host:port
736 scheme = None
737 authority = uri
738 path = '/'
739 host, port = splitport(authority)
740 if default_port and port is None and scheme is not None:
741 dport = {"http": 80,
742 "https": 443,
743 }.get(scheme)
744 if dport is not None:
745 authority = "%s:%d" % (host, dport)
746 return authority, path
748 def is_suburi(self, base, test):
749 """Check if test is below base in a URI tree
751 Both args must be URIs in reduced form.
753 if base == test:
754 return True
755 if base[0] != test[0]:
756 return False
757 common = posixpath.commonprefix((base[1], test[1]))
758 if len(common) == len(base[1]):
759 return True
760 return False
763 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
765 def find_user_password(self, realm, authuri):
766 user, password = HTTPPasswordMgr.find_user_password(self, realm,
767 authuri)
768 if user is not None:
769 return user, password
770 return HTTPPasswordMgr.find_user_password(self, None, authuri)
773 class AbstractBasicAuthHandler:
775 # XXX this allows for multiple auth-schemes, but will stupidly pick
776 # the last one with a realm specified.
778 rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
780 # XXX could pre-emptively send auth info already accepted (RFC 2617,
781 # end of section 2, and section 1.2 immediately after "credentials"
782 # production).
784 def __init__(self, password_mgr=None):
785 if password_mgr is None:
786 password_mgr = HTTPPasswordMgr()
787 self.passwd = password_mgr
788 self.add_password = self.passwd.add_password
790 def http_error_auth_reqed(self, authreq, host, req, headers):
791 # host may be an authority (without userinfo) or a URL with an
792 # authority
793 # XXX could be multiple headers
794 authreq = headers.get(authreq, None)
795 if authreq:
796 mo = AbstractBasicAuthHandler.rx.search(authreq)
797 if mo:
798 scheme, realm = mo.groups()
799 if scheme.lower() == 'basic':
800 return self.retry_http_basic_auth(host, req, realm)
802 def retry_http_basic_auth(self, host, req, realm):
803 user, pw = self.passwd.find_user_password(realm, host)
804 if pw is not None:
805 raw = "%s:%s" % (user, pw)
806 auth = 'Basic %s' % base64.b64encode(raw).strip()
807 if req.headers.get(self.auth_header, None) == auth:
808 return None
809 req.add_header(self.auth_header, auth)
810 return self.parent.open(req)
811 else:
812 return None
815 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
817 auth_header = 'Authorization'
819 def http_error_401(self, req, fp, code, msg, headers):
820 url = req.get_full_url()
821 return self.http_error_auth_reqed('www-authenticate',
822 url, req, headers)
825 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
827 auth_header = 'Proxy-authorization'
829 def http_error_407(self, req, fp, code, msg, headers):
830 # http_error_auth_reqed requires that there is no userinfo component in
831 # authority. Assume there isn't one, since urllib2 does not (and
832 # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
833 # userinfo.
834 authority = req.get_host()
835 return self.http_error_auth_reqed('proxy-authenticate',
836 authority, req, headers)
839 def randombytes(n):
840 """Return n random bytes."""
841 # Use /dev/urandom if it is available. Fall back to random module
842 # if not. It might be worthwhile to extend this function to use
843 # other platform-specific mechanisms for getting random bytes.
844 if os.path.exists("/dev/urandom"):
845 f = open("/dev/urandom")
846 s = f.read(n)
847 f.close()
848 return s
849 else:
850 L = [chr(random.randrange(0, 256)) for i in range(n)]
851 return "".join(L)
853 class AbstractDigestAuthHandler:
854 # Digest authentication is specified in RFC 2617.
856 # XXX The client does not inspect the Authentication-Info header
857 # in a successful response.
859 # XXX It should be possible to test this implementation against
860 # a mock server that just generates a static set of challenges.
862 # XXX qop="auth-int" supports is shaky
864 def __init__(self, passwd=None):
865 if passwd is None:
866 passwd = HTTPPasswordMgr()
867 self.passwd = passwd
868 self.add_password = self.passwd.add_password
869 self.retried = 0
870 self.nonce_count = 0
872 def reset_retry_count(self):
873 self.retried = 0
875 def http_error_auth_reqed(self, auth_header, host, req, headers):
876 authreq = headers.get(auth_header, None)
877 if self.retried > 5:
878 # Don't fail endlessly - if we failed once, we'll probably
879 # fail a second time. Hm. Unless the Password Manager is
880 # prompting for the information. Crap. This isn't great
881 # but it's better than the current 'repeat until recursion
882 # depth exceeded' approach <wink>
883 raise HTTPError(req.get_full_url(), 401, "digest auth failed",
884 headers, None)
885 else:
886 self.retried += 1
887 if authreq:
888 scheme = authreq.split()[0]
889 if scheme.lower() == 'digest':
890 return self.retry_http_digest_auth(req, authreq)
892 def retry_http_digest_auth(self, req, auth):
893 token, challenge = auth.split(' ', 1)
894 chal = parse_keqv_list(parse_http_list(challenge))
895 auth = self.get_authorization(req, chal)
896 if auth:
897 auth_val = 'Digest %s' % auth
898 if req.headers.get(self.auth_header, None) == auth_val:
899 return None
900 req.add_unredirected_header(self.auth_header, auth_val)
901 resp = self.parent.open(req)
902 return resp
904 def get_cnonce(self, nonce):
905 # The cnonce-value is an opaque
906 # quoted string value provided by the client and used by both client
907 # and server to avoid chosen plaintext attacks, to provide mutual
908 # authentication, and to provide some message integrity protection.
909 # This isn't a fabulous effort, but it's probably Good Enough.
910 dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
911 randombytes(8))).hexdigest()
912 return dig[:16]
914 def get_authorization(self, req, chal):
915 try:
916 realm = chal['realm']
917 nonce = chal['nonce']
918 qop = chal.get('qop')
919 algorithm = chal.get('algorithm', 'MD5')
920 # mod_digest doesn't send an opaque, even though it isn't
921 # supposed to be optional
922 opaque = chal.get('opaque', None)
923 except KeyError:
924 return None
926 H, KD = self.get_algorithm_impls(algorithm)
927 if H is None:
928 return None
930 user, pw = self.passwd.find_user_password(realm, req.get_full_url())
931 if user is None:
932 return None
934 # XXX not implemented yet
935 if req.has_data():
936 entdig = self.get_entity_digest(req.get_data(), chal)
937 else:
938 entdig = None
940 A1 = "%s:%s:%s" % (user, realm, pw)
941 A2 = "%s:%s" % (req.get_method(),
942 # XXX selector: what about proxies and full urls
943 req.get_selector())
944 if qop == 'auth':
945 self.nonce_count += 1
946 ncvalue = '%08x' % self.nonce_count
947 cnonce = self.get_cnonce(nonce)
948 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
949 respdig = KD(H(A1), noncebit)
950 elif qop is None:
951 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
952 else:
953 # XXX handle auth-int.
954 pass
956 # XXX should the partial digests be encoded too?
958 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
959 'response="%s"' % (user, realm, nonce, req.get_selector(),
960 respdig)
961 if opaque:
962 base += ', opaque="%s"' % opaque
963 if entdig:
964 base += ', digest="%s"' % entdig
965 base += ', algorithm="%s"' % algorithm
966 if qop:
967 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
968 return base
970 def get_algorithm_impls(self, algorithm):
971 # lambdas assume digest modules are imported at the top level
972 if algorithm == 'MD5':
973 H = lambda x: hashlib.md5(x).hexdigest()
974 elif algorithm == 'SHA':
975 H = lambda x: hashlib.sha1(x).hexdigest()
976 # XXX MD5-sess
977 KD = lambda s, d: H("%s:%s" % (s, d))
978 return H, KD
980 def get_entity_digest(self, data, chal):
981 # XXX not implemented yet
982 return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    # Request header carrying the credentials back to the origin server.
    auth_header = 'Authorization'
    handler_order = 490 # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The challenge is keyed on the netloc of the request URL.
        host = urlparse.urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        # Reset the retry counter so a later 401 gets a fresh attempt.
        self.reset_retry_count()
        return retry
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against a proxy (handles HTTP 407)."""

    # Proxies read credentials from a different header than origin servers.
    auth_header = 'Proxy-Authorization'
    handler_order = 490 # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # Retry the request with proxy credentials, then clear the retry
        # bookkeeping so a future challenge starts from scratch.
        proxy_host = req.get_host()
        result = self.http_error_auth_reqed('proxy-authenticate',
                                            proxy_host, req, headers)
        self.reset_retry_count()
        return result
class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for HTTPHandler and HTTPSHandler.

    Subclasses supply the httplib connection class; this class prepares
    the request headers and turns the httplib response into the
    file-like addinfourl object urllib2 callers expect.
    """

    def __init__(self, debuglevel=0):
        # Debug level is forwarded to the httplib connection in do_open().
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers (Content-type/length, Host, parent's
        addheaders) on *request* and return it.  Raises URLError if the
        request has no host."""
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            # Only add defaults; never clobber headers the caller set.
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        # Prefer the host from the selector (full URL, e.g. via a proxy)
        # over the request host when both are available.
        scheme, sel = splittype(request.get_selector())
        sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host) # will parse host:port
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over normal ones.
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize header capitalization (e.g. "content-type" ->
        # "Content-Type") so duplicates differing only in case collapse.
        headers = dict(
            (name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err: # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
class HTTPHandler(AbstractHTTPHandler):
    """Opener for plain ``http`` URLs."""

    def http_open(self, req):
        # All the real work happens in AbstractHTTPHandler.do_open();
        # we only choose the connection class.
        connection_class = httplib.HTTPConnection
        return self.do_open(connection_class, req)

    http_request = AbstractHTTPHandler.do_request_
# HTTPS support is optional: it exists only when Python was built with SSL.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Opener for ``https`` URLs (defined only with SSL support)."""

        def https_open(self, req):
            # Identical machinery to HTTP, over an SSL connection.
            connection_class = httplib.HTTPSConnection
            return self.do_open(connection_class, req)

        https_request = AbstractHTTPHandler.do_request_
class HTTPCookieProcessor(BaseHandler):
    """Handler that stores cookies from responses and replays them on
    later requests, using a cookielib CookieJar."""

    def __init__(self, cookiejar=None):
        # cookielib is imported lazily, at construction time only.
        import cookielib
        self.cookiejar = cookielib.CookieJar() if cookiejar is None else cookiejar

    def http_request(self, request):
        # Attach any matching stored cookies before the request is sent.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Harvest cookies the server set on the way back.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
class UnknownHandler(BaseHandler):
    """Last-resort handler: rejects URL schemes nobody else claimed."""

    def unknown_open(self, req):
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Surrounding double quotes are stripped from values.  An empty value
    (e.g. 'k=') is kept as '' -- the previous v[0] test raised
    IndexError on it; the slice test below is safe for any length.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1151 def parse_http_list(s):
1152 """Parse lists as described by RFC 2068 Section 2.
1154 In particular, parse comma-separated lists where the elements of
1155 the list may include quoted-strings. A quoted-string could
1156 contain a comma. A non-quoted string could have quotes in the
1157 middle. Neither commas nor quotes count if they are escaped.
1158 Only double-quotes count, not single-quotes.
1160 res = []
1161 part = ''
1163 escape = quote = False
1164 for cur in s:
1165 if escape:
1166 part += cur
1167 escape = False
1168 continue
1169 if quote:
1170 if cur == '\\':
1171 escape = True
1172 continue
1173 elif cur == '"':
1174 quote = False
1175 part += cur
1176 continue
1178 if cur == ',':
1179 res.append(part)
1180 part = ''
1181 continue
1183 if cur == '"':
1184 quote = True
1186 part += cur
1188 # append last part
1189 if part:
1190 res.append(part)
1192 return [part.strip() for part in res]
class FileHandler(BaseHandler):
    """Open ``file:`` URLs, dispatching ``file://host/...`` forms with a
    non-local host to the FTP handler."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        # '//host/...' (but not '///path') names a remote host: re-route
        # the request through the opener as FTP.
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    # Class-level cache of addresses considered "local"; resolved once.
    names = None

    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                # Own hostname did not resolve; fall back to localhost only.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for a local file, with Content-type,
        Content-length and Last-modified headers synthesized from the
        filesystem.  Raises URLError if the host is not local or the
        file cannot be stat'ed."""
        import email.utils
        import mimetypes
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = mimetools.Message(StringIO(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified)))
            if host:
                host, port = splitport(host)
            # Serve the file only when no host was given, or the host
            # (without an explicit port) resolves to a local address.
            if not host or \
                (not port and socket.gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'),
                                  headers, 'file:'+file)
        except OSError, msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
class FTPHandler(BaseHandler):
    """Open ``ftp`` URLs, honoring user:password@ credentials and the
    ;type= attribute for the transfer mode."""

    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
        # Split the path into directory components plus final filename;
        # an empty filename means a directory listing.
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = map(unquote, dirs)
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs)
            # 'I' (binary) for files, 'D' (directory listing) otherwise,
            # unless overridden by a ';type=' URL attribute below.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors, msg:
            # Re-raise as IOError but keep the original traceback.
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def connect_ftp(self, user, passwd, host, port, dirs):
        # Hook point: CacheFTPHandler overrides this to reuse connections.
        fw = ftpwrapper(user, passwd, host, port, dirs)
##        fw.ftp.set_debuglevel(1)
        return fw
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps a bounded cache of live FTP connections.

    Connections are keyed by (user, host, port, path) and expire
    ``delay`` seconds after last use; at most ``max_conns`` are kept.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}      # key -> ftpwrapper connection
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached entries
        self.delay = 60      # seconds an idle connection stays cached
        self.max_conns = 16  # cap on simultaneously cached connections

    def setTimeout(self, t):
        """Set the idle lifetime (seconds) of cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return a cached connection for the target, creating one and
        pruning the cache as needed."""
        key = user, host, port, '/'.join(dirs)
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close and evict expired connections, then enforce max_conns."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # iterate over a copy: entries are deleted while scanning
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # min() on an empty sequence raises ValueError; fall back to
            # 0 when every connection expired (old code crashed here).
            self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # close before dropping so the socket isn't leaked
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0
class GopherHandler(BaseHandler):
    """Open ``gopher`` URLs via the deprecated gopherlib module."""

    def gopher_open(self, req):
        # XXX can raise socket.error
        import gopherlib  # this raises DeprecationWarning in 2.5
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        # Break the selector into gopher item type, path and query.
        gtype, rest = splitgophertype(req.get_selector())
        rest, query = splitquery(rest)
        rest = unquote(rest)
        if query:
            fp = gopherlib.send_query(rest, unquote(query), host)
        else:
            fp = gopherlib.send_selector(rest, host)
        return addinfourl(fp, noheaders(), req.get_full_url())