1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15 deals with digest authentication.
17 urlopen(url, data=None) -- Basic usage is the same as original
18 urllib. pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- Function that creates a new OpenerDirector instance.
25 Will install the default handlers. Accepts one or more Handlers as
26 arguments, either instances or Handler classes that it will
27 instantiate. If one of the argument is a subclass of the default
28 handler, the argument will be installed instead of the default.
30 install_opener -- Installs a new opener as the default opener.
35 Request -- An object that encapsulates the state of a request. The
36 state can be as simple as the URL. It can also include extra HTTP
37 headers, e.g. a User-Agent.
42 URLError -- A subclass of IOError, individual protocols have their own
45 HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response.
49 BaseHandler and parent
50 _call_chain conventions
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password(realm='PDQ Application',
59 uri='https://mahler:8092/site-updates.py',
61 passwd='geheim$parole')
63 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
65 # build a new opener that adds authentication and caching FTP handlers
66 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
69 urllib2.install_opener(opener)
71 f = urllib2.urlopen('http://www.python.org/')
77 # If an authentication error handler that tries to perform
78 # authentication for some reason but fails, how should the error be
79 # signalled? The client needs to know the HTTP error code. But if
80 # the handler knows that the problem was, e.g., that it didn't know
81 # that hash algo that requested in the challenge, it would be good to
82 # pass that information along to the client, too.
83 # ftp errors aren't handled cleanly
84 # check digest against correct (i.e. non-apache) implementation
86 # Possible extensions:
87 # complex proxies XXX not sure what exactly was meant by this
88 # abstract factory for opener
import base64
import hashlib
import httplib
import mimetools
import os
import posixpath
import random
import re
import socket
import sys
import time
import urlparse
import bisect

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from urllib import (unwrap, unquote, splittype, splithost, quote,
     addinfourl, splitport, splitgophertype, splitquery,
     splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)

# support for FileHandler, proxies via environment variables
from urllib import localhost, url2pathname, getproxies

# used in User-Agent header sent
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None):
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data)

def install_opener(opener):
    global _opener
    _opener = opener

# do these error classes make sense?
# make sure all of the IOError stuff is overridden.  we just want to be
# subtypes.

class URLError(IOError):
    # URLError is a sub-type of IOError, but it doesn't share any of
    # the implementation.  need to override __init__ and __str__.
    # It sets self.args for compatibility with other EnvironmentError
    # subclasses, but args doesn't have the typical format with errno in
    # slot 0 and strerror in slot 1.  This may be better than nothing.
    def __init__(self, reason):
        self.args = reason,
        self.reason = reason

    def __str__(self):
        return '<urlopen error %s>' % self.reason

class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is not None:
            self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

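# Illustrative sketch (not part of the original module): because HTTPError is
# also a valid response object, a caller can catch it and still read the error
# body.  The URL below is hypothetical.
#
#   import urllib2
#   try:
#       f = urllib2.urlopen('http://www.example.com/does-not-exist')
#   except urllib2.HTTPError, e:
#       print e.code        # numeric status, e.g. 404
#       print e.info()      # the response headers (a mimetools.Message)
#       body = e.read()     # the error page itself
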
class GopherError(URLError):
    pass


# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")

def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.get_full_url()
    host = urlparse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.data = data
        self.headers = {}
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError, attr

    def get_method(self):
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.__original

    def get_type(self):
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError, "unknown url type: %s" % self.__original
        return self.type

    def get_host(self):
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        return self.__r_host

    def set_proxy(self, host, type):
        self.host, self.type = host, type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return hdrs.items()

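# Illustrative sketch (not part of the original module): building a Request by
# hand.  The URL, payload and header values below are made up.
#
#   req = Request('http://www.example.com/cgi-bin/query',
#                 data='q=urllib2',                       # data makes this a POST
#                 headers={'User-agent': 'my-client/1.0'})
#   req.add_unredirected_header('X-Debug', '1')           # dropped on redirects
#   assert req.get_method() == 'POST'
#   assert req.has_header('User-agent')
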
class OpenerDirector:
    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in a specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    import types
    def isclass(obj):
        return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)
    skip = []
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.append(klass)
            elif isinstance(check, klass):
                skip.append(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener

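# Illustrative sketch (not part of the original module): passing a handler
# subclass to build_opener() replaces the corresponding default handler.  The
# subclass name, debug level and proxy address below are made up.
#
#   class VerboseHTTPHandler(HTTPHandler):
#       def __init__(self):
#           HTTPHandler.__init__(self, debuglevel=1)
#
#   opener = build_opener(VerboseHTTPHandler,
#                         ProxyHandler({'http': 'http://proxy.example.com:3128'}))
#   f = opener.open('http://www.python.org/')
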
class BaseHandler:
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order

class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # be conciliant with URIs containing a space
            newurl = newurl.replace(' ', '%20')
            return Request(newurl,
                           headers=req.headers,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2.
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass).strip()
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)

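# Illustrative sketch (not part of the original module): an explicit proxy
# mapping overrides the environment-variable defaults from getproxies().  The
# proxy host and credentials below are made up; userinfo in the proxy URL is
# parsed by _parse_proxy() and sent as a Proxy-authorization header.
#
#   proxy_handler = ProxyHandler(
#       {'http': 'http://joe:password@proxy.example.com:3128/'})
#   opener = build_opener(proxy_handler)
#   f = opener.open('http://www.python.org/')
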
class HTTPPasswordMgr:

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.iteritems():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False

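# Illustrative sketch (not part of the original module): the password manager
# matches stored URI prefixes against the URI being authenticated.  The realm,
# host and credentials below are made up.
#
#   mgr = HTTPPasswordMgr()
#   mgr.add_password('PDQ Application',
#                    ['http://mahler:8092/site-updates.py'],
#                    'klem', 'geheim$parole')
#   user, passwd = mgr.find_user_password('PDQ Application',
#                                         'http://mahler:8092/site-updates.py')
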
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)

class AbstractBasicAuthHandler:

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.

    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = 'Basic %s' % base64.b64encode(raw).strip()
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_header(self.auth_header, auth)
            return self.parent.open(req)
        else:
            return None

class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.get_full_url()
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)

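# Illustrative sketch (not part of the original module): wiring the handler to
# a password manager so 401 responses are retried with credentials.  Host,
# realm and credentials below are made up.
#
#   auth_handler = HTTPBasicAuthHandler(HTTPPasswordMgrWithDefaultRealm())
#   auth_handler.add_password(None, 'http://www.example.com/',
#                             'klem', 'geheim$parole')
#   opener = build_opener(auth_handler)
#   f = opener.open('http://www.example.com/protected/')
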
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib2 does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)

840 """Return n random bytes."""
841 # Use /dev/urandom if it is available. Fall back to random module
842 # if not. It might be worthwhile to extend this function to use
843 # other platform-specific mechanisms for getting random bytes.
844 if os
.path
.exists("/dev/urandom"):
845 f
= open("/dev/urandom")
850 L
= [chr(random
.randrange(0, 256)) for i
in range(n
)]
class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
                                            randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            pass

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x).hexdigest()
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None

class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse.urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry

class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.get_host()
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry

class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        scheme, sel = splittype(request.get_selector())
        sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host)  # will parse host:port
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err:  # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        return resp

class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(httplib.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(httplib.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
        import cookielib
        if cookiejar is None:
            cookiejar = cookielib.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

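# Illustrative sketch (not part of the original module): sharing one CookieJar
# across requests so cookies set by the server are sent back automatically.
# The URLs below are made up.
#
#   import cookielib
#   jar = cookielib.CookieJar()
#   opener = build_opener(HTTPCookieProcessor(jar))
#   opener.open('http://www.example.com/login')    # server sets a cookie
#   opener.open('http://www.example.com/account')  # cookie is sent back
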
class UnknownHandler(BaseHandler):
    def unknown_open(self, req):
        type = req.get_type()
        raise URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    res = []
    part = ''

    escape = quote = False
    for cur in s:
        if escape:
            part += cur
            escape = False
            continue
        if quote:
            if cur == '\\':
                escape = True
                continue
            elif cur == '"':
                quote = False
            part += cur
            continue

        if cur == ',':
            res.append(part)
            part = ''
            continue

        if cur == '"':
            quote = True

        part += cur

    # append last part
    if part:
        res.append(part)

    return [part.strip() for part in res]

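# Illustrative sketch (not part of the original module): this is how the digest
# handler turns a challenge into a dict (the challenge text below is made up).
#
#   challenge = 'realm="test", nonce="abc123", qop="auth", algorithm=MD5'
#   chal = parse_keqv_list(parse_http_list(challenge))
#   # chal == {'realm': 'test', 'nonce': 'abc123', 'qop': 'auth',
#   #          'algorithm': 'MD5'}
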
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None

    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                     socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(file)[0]
            headers = mimetools.Message(StringIO(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified)))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and socket.gethostbyname(host) in self.get_names()):
                return addinfourl(open(localfile, 'rb'),
                                  headers, 'file:'+file)
        except OSError, msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = map(unquote, dirs)
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs)
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors, msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def connect_ftp(self, user, passwd, host, port, dirs):
        fw = ftpwrapper(user, passwd, host, port, dirs)
##        fw.ftp.set_debuglevel(1)
        return fw

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}
        self.timeout = {}
        self.soonest = 0
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        key = user, host, port, '/'.join(dirs)
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in self.timeout.items():
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in self.timeout.items():
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values())

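# Illustrative sketch (not part of the original module): reusing FTP
# connections via the cache; the limits and URL below are made up.
#
#   ftp_handler = CacheFTPHandler()
#   ftp_handler.setTimeout(30)   # seconds before an idle connection is dropped
#   ftp_handler.setMaxConns(5)
#   opener = build_opener(ftp_handler)
#   f = opener.open('ftp://ftp.example.com/pub/README')
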
class GopherHandler(BaseHandler):
    def gopher_open(self, req):
        # XXX can raise socket.error
        import gopherlib  # this raises DeprecationWarning in 2.5
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        selector = req.get_selector()
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), req.get_full_url())