1 """HTTP related handlers.
Note that some other HTTP handlers live in more specific modules: _auth.py, etc.
7 Copyright 2002-2006 John J Lee <jjl@pobox.com>
9 This code is free software; you can redistribute it and/or modify it
10 under the terms of the BSD or ZPL 2.1 licenses (see the file
11 COPYING.txt included with the distribution).
15 import copy
, time
, tempfile
, htmlentitydefs
, re
, logging
, socket
, \
16 urllib2
, urllib
, httplib
, sgmllib
17 from urllib2
import URLError
, HTTPError
, BaseHandler
18 from cStringIO
import StringIO
20 from _request
import Request
21 from _util
import isstringlike
22 from _response
import closeable_response
, response_seek_wrapper
23 from _html
import unescape
, unescape_charref
24 from _headersutil
import is_html
25 from _clientcookie
import CookieJar
, request_host
# Shortcut for logging debug messages through the shared "mechanize" logger.
debug = logging.getLogger("mechanize").debug

# monkeypatch urllib2.HTTPError to show URL
## def urllib2_str(self):
##     return 'HTTP Error %s: %s (%s)' % (
##         self.code, self.msg, self.geturl())
## urllib2.HTTPError.__str__ = urllib2_str

CHUNK = 1024  # size of chunks fed to HTML HEAD parser, in bytes
DEFAULT_ENCODING = 'latin-1'  # fallback charset when unescaping HTML entities
# This adds "refresh" to the list of redirectables and provides a redirection
# algorithm that doesn't go into a loop in the presence of cookies
# (Python 2.4 has this new algorithm, 2.3 doesn't).
class HTTPRedirectHandler(BaseHandler):
    """Follow 30x redirections (and "refresh" pseudo-redirections).

    NOTE(review): http_error_302 reads self.max_repeats and
    self.max_redirections, but their class-attribute definitions are not
    visible in this copy -- presumably defined where the two comment pairs
    below sit; confirm against upstream.
    """
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce

    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop

    # Implementation notes:

    # To avoid the server sending us into an infinite loop, the request
    # object needs to track what URLs we have already seen.  Do this by
    # adding a handler-specific attribute to the Request object.  The value
    # of the dict is used to count the number of times the same URL has
    # been visited.  This is needed because visiting the same URL twice
    # does not necessarily imply a loop, thanks to state introduced by
    # cookies (NOTE(review): tail of this sentence truncated in this copy).

    # Always unhandled redirection codes:
    # 300 Multiple Choices: should not handle this here.
    # 304 Not Modified: no need to handle here: only of interest to caches
    #     that do conditional GETs
    # 305 Use Proxy: probably not worth dealing with here
    # 306 Unused: what was this for in the previous versions of protocol??

    def redirect_request(self, newurl, req, fp, code, msg, headers):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a
        new Request to allow http_error_30x to perform the redirect;
        otherwise, return None to indicate that an HTTPError should be
        raised.
        """
        if code in (301, 302, 303, "refresh") or \
           (code == 307 and not req.has_data()):
            # Strictly (according to RFC 2616), 301 or 302 in response to
            # a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we do
            # the same.

            # XXX really refresh redirections should be visiting; tricky to
            # fix, so this will wait until post-stable release

            # NOTE(review): the construction of the new Request is
            # truncated in this copy -- only the keyword argument below
            # survives of the "new = Request(newurl, ...)" call.
            origin_req_host=req.get_origin_req_host(),
            # Remember the original request of the chain, so later handlers
            # (e.g. robots.txt processing) can identify redirected fetches.
            new._origin_req = getattr(req, "_origin_req", req)
        # NOTE(review): presumably "return new" and an else: branch were
        # here in the original; as copied, the raise is unconditional.
        raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    def http_error_302(self, req, fp, code, msg, headers):
        """Perform the actual redirect, with redirect-loop detection."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if headers.has_key('location'):
            newurl = headers.getheaders('location')[0]
        elif headers.has_key('uri'):
            newurl = headers.getheaders('uri')[0]
        # NOTE(review): no fallback branch visible for responses carrying
        # neither header; confirm against upstream.
        newurl = _rfc3986.clean_url(newurl, "latin-1")
        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(newurl, req, fp, code, msg, headers)

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            # carry the visit counts over to the new request
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        # NOTE(review): presumably an else: branch in the original --
        # first redirect of a chain starts a fresh visit-count dict.
        visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # again (the new request may reuse it).
        return self.parent.open(new)

    # These redirect codes all share the same algorithm.
    http_error_301 = http_error_303 = http_error_307 = http_error_302
    http_error_refresh = http_error_302

    # Message prefixed to the 30x error message when a loop is detected.
    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
# XXX would self.reset() work, instead of raising this exception?
class EndOfHeadError(Exception):
    """Signals that parsing has moved past the end of the document HEAD."""
    pass
class AbstractHeadParser:
    """Base for parsers that extract http-equiv data from an HTML HEAD.

    Subclasses mix this class with a concrete HTML/SGML parser.  Collected
    (http-equiv, content) pairs are appended to self.http_equiv.

    NOTE(review): __init__ (which presumably creates self.http_equiv) is
    not visible in this copy.
    """
    # only these elements are allowed in or before HEAD of document
    # NOTE(review): list appears truncated ("title", "base", ... missing?)
    head_elems = ("html", "head",
                  "script", "style", "meta", "link", "object")
    # entity-name -> codepoint mapping used when unescaping references
    _entitydefs = htmlentitydefs.name2codepoint
    # encoding used when decoding character references
    _encoding = DEFAULT_ENCODING

    def start_meta(self, attrs):
        # Collect (http-equiv, content) pairs from <meta> tags.
        http_equiv = content = None
        for key, value in attrs:
            if key == "http-equiv":
                http_equiv = self.unescape_attr_if_required(value)
            elif key == "content":
                content = self.unescape_attr_if_required(value)
        if http_equiv is not None and content is not None:
            self.http_equiv.append((http_equiv, content))
        # NOTE(review): the raise below appears to be the body of a missing
        # end-of-head handler (e.g. "def end_head(self):") whose def line
        # was truncated from this copy; it does not belong to start_meta.
        raise EndOfHeadError()

    def handle_entityref(self, name):
        # Named entity (&amp; etc.): decode it and pass through as data.
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        # Numeric character reference (&#NN;): decode and pass through.
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        # Decode entity/character references in an attribute value.
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        # Return attributes with values unescaped.
        # NOTE(review): initialisation of escaped_attrs and the return
        # statement are not visible in this copy.
        for key, val in attrs.items():
            escaped_attrs[key] = self.unescape_attr(val)

    def unknown_entityref(self, ref):
        # Unknown entity: keep the raw source text.
        self.handle_data("&%s;" % ref)

    def unknown_charref(self, ref):
        # Unknown character reference: keep the raw source text.
        self.handle_data("&#%s;" % ref)
class XHTMLCompatibleHeadParser(AbstractHeadParser,
                                HTMLParser.HTMLParser):
    """HEAD parser built on HTMLParser.HTMLParser (strict, XHTML-capable).

    NOTE(review): the "def __init__(self):" line is missing from this
    copy; the two base-class __init__ calls below are its body.
    """
        HTMLParser.HTMLParser.__init__(self)
        AbstractHeadParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Stop parsing as soon as an element not allowed in/before HEAD
        # appears -- everything we care about has been seen by then.
        if tag not in self.head_elems:
            raise EndOfHeadError()
        # NOTE(review): the try: lines wrapping these lookups are missing
        # from this copy -- intent is to dispatch to a start_<tag> handler,
        # falling back to do_<tag>, then to nothing.
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            method = getattr(self, 'do_' + tag)
        except AttributeError:

    def handle_endtag(self, tag):
        if tag not in self.head_elems:
            raise EndOfHeadError()
        # NOTE(review): try: and handler-invocation lines missing here too.
            method = getattr(self, 'end_' + tag)
        except AttributeError:

    def unescape(self, name):
        # Use the entitydefs passed into constructor, not
        # HTMLParser.HTMLParser's entitydefs.
        return self.unescape_attr(name)

    def unescape_attr_if_required(self, name):
        return name  # HTMLParser.HTMLParser already did it
class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
    """HEAD parser built on sgmllib.SGMLParser (tolerant of bad HTML)."""

    def _not_called(self):
        # Placeholder handler passed to handle_starttag; must never run.
        # NOTE(review): its body (presumably an assert) is missing from
        # this copy, as is the "def __init__(self):" line that the two
        # base-class __init__ calls below belong to.
        sgmllib.SGMLParser.__init__(self)
        AbstractHeadParser.__init__(self)

    def handle_starttag(self, tag, method, attrs):
        # Stop as soon as an element not allowed in/before HEAD appears.
        if tag not in self.head_elems:
            raise EndOfHeadError()
        # NOTE(review): the dispatch to the actual tag handler is missing
        # from this copy.

    def unknown_starttag(self, tag, attrs):
        # Route unknown tags through handle_starttag so the head_elems
        # check still applies; the method argument is never invoked.
        self.handle_starttag(tag, self._not_called, attrs)

    def handle_endtag(self, tag, method):
        if tag in self.head_elems:
            # NOTE(review): the original presumably invoked the handler
            # here and raised only in an else: branch; those lines are
            # missing from this copy, inverting the visible logic.
            raise EndOfHeadError()

    def unescape_attr_if_required(self, name):
        # sgmllib does not unescape attribute values for us, so do it here.
        return self.unescape_attr(name)
def parse_head(fileobj, parser):
    """Return a list of key, value pairs.

    Feeds CHUNK-sized pieces of fileobj to parser until the parser raises
    EndOfHeadError (or input runs out), then returns the collected
    http-equiv pairs from the parser.

    NOTE(review): the read loop (while/try/feed/break lines) is truncated
    in this copy; only the fragments below survive.
    """
        data = fileobj.read(CHUNK)
    except EndOfHeadError:
        if len(data) != CHUNK:
            # this should only happen if there is no HTML body, or if
            # (NOTE(review): remainder of this comment truncated)
    return parser.http_equiv
class HTTPEquivProcessor(BaseHandler):
    """Append META HTTP-EQUIV headers to regular HTTP headers."""

    handler_order = 300  # before handlers that look at HTTP headers

    def __init__(self, head_parser_class=HeadParser,
                 i_want_broken_xhtml_support=False,
                 # NOTE(review): rest of the parameter list / closing
                 # paren is truncated in this copy.
        self.head_parser_class = head_parser_class  # factory for HEAD parsers
        self._allow_xhtml = i_want_broken_xhtml_support

    def http_response(self, request, response):
        # If the response is HTML, parse its HEAD and merge any http-equiv
        # pseudo-headers into the real HTTP message headers.
        if not hasattr(response, "seek"):
            # we consume part of the body to find the HEAD, so the
            # response must be seekable for downstream consumers
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            # NOTE(review): try: line and the response.seek(0) calls
            # around parsing are truncated in this copy.
                html_headers = parse_head(response, self.head_parser_class())
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
            # NOTE(review): except-body truncated (presumably pass).
            for hdr, val in html_headers:
                # add a header to the parsed HTTP message object
                http_message.dict[hdr.lower()] = val
                # re-add raw header lines so str() round-trips correctly
                text = hdr + ": " + val
                for line in text.split("\n"):
                    http_message.headers.append(line + "\n")
        # NOTE(review): "return response" presumably truncated.

    https_response = http_response
class HTTPCookieProcessor(BaseHandler):
    """Handle HTTP cookies.

    Stores cookies from responses in a CookieJar, and adds the
    appropriate Cookie headers to subsequent requests.

    Public attributes:

    cookiejar: CookieJar instance

    """
    def __init__(self, cookiejar=None):
        # Default to a fresh, empty CookieJar when none is supplied.
        if cookiejar is None:
            cookiejar = CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        # Attach any applicable Cookie header to the outgoing request.
        self.cookiejar.add_cookie_header(request)
        # urllib2's processor protocol requires request processors to
        # return the (possibly modified) request for the handler chain.
        return request

    def http_response(self, request, response):
        # Remember any cookies set by this response.
        self.cookiejar.extract_cookies(response, request)
        # Response processors must likewise return the response.
        return response

    https_request = http_request
    https_response = http_response
class MechanizeRobotFileParser(robotparser.RobotFileParser):
    """RobotFileParser that fetches robots.txt through a mechanize opener.

    NOTE(review): this copy is heavily truncated; several statements below
    have lost their surrounding control flow (see inline notes).
    """

    def __init__(self, url='', opener=None):
        robotparser.RobotFileParser.__init__(self, url)
        self._opener = opener  # opener used to fetch robots.txt (may be None)

    def set_opener(self, opener=None):
        # NOTE(review): presumably guarded by "if opener is None:" --
        # default to a bare OpenerDirector when no opener is given.
        opener = _opener.OpenerDirector()
        self._opener = opener

    # NOTE(review): the "def read(self):" line is missing from this copy;
    # the statements below down to the status handling are its body.
        """Reads the robots.txt URL and feeds it to the parser."""
        if self._opener is None:
            # NOTE(review): body truncated (presumably self.set_opener()).
        # unverifiable/visit flags: robots.txt fetches are automatic,
        # not user-visible navigation
        req = Request(self.url, unverifiable=True, visit=False)
        # NOTE(review): try: line truncated before the open call.
            f = self._opener.open(req)
        except (IOError, socket.error, OSError), exc:
            robotparser._debug("ignoring error opening %r: %s" %
                               # NOTE(review): format arguments truncated
        lines.append(line.strip())  # NOTE(review): enclosing loop truncated
        # Classify the fetch result by HTTP status:
        if status == 401 or status == 403:
            # access to robots.txt denied: disallow everything
            self.disallow_all = True
            robotparser._debug("disallow all")
            # NOTE(review): elif branch header truncated (presumably
            # status >= 400: no usable robots.txt, allow everything)
            self.allow_all = True
            robotparser._debug("allow all")
        elif status == 200 and lines:
            robotparser._debug("parse lines")
            # NOTE(review): self.parse(lines) presumably truncated.
class RobotExclusionError(urllib2.HTTPError):
    """HTTPError raised when robots.txt forbids fetching a URL.

    The offending request is exposed as the .request attribute; the
    remaining positional args are passed straight to HTTPError.__init__
    (url, code, msg, hdrs, fp).
    """
    def __init__(self, request, *args):
        # Direct call instead of the long-deprecated apply() builtin;
        # semantics are identical.
        urllib2.HTTPError.__init__(self, *args)
        self.request = request
class HTTPRobotRulesProcessor(BaseHandler):
    """Refuse to send requests that robots.txt disallows."""
    # before redirections, after everything else
    # NOTE(review): the handler_order assignment itself is truncated.

    # Pick a response-message class for the synthesized error response.
    # NOTE(review): the try/except ImportError wrapping these imports is
    # truncated in this copy.
        from httplib import HTTPMessage
        from mimetools import Message
        http_response_class = Message
        http_response_class = HTTPMessage

    def __init__(self, rfp_class=MechanizeRobotFileParser):
        self.rfp_class = rfp_class  # factory for robots.txt parsers
        # NOTE(review): initialisation of self.rfp and self._host is
        # truncated from this copy; both are read in http_request.

    def http_request(self, request):
        scheme = request.get_type()
        if scheme not in ["http", "https"]:
            # robots exclusion only applies to HTTP
            # NOTE(review): "return request" truncated.
        if request.get_selector() == "/robots.txt":
            # /robots.txt is always OK to fetch
            # NOTE(review): "return request" truncated.
        host = request.get_host()

        # robots.txt requests don't need to be allowed by robots.txt :-)
        origin_req = getattr(request, "_origin_req", None)
        if (origin_req is not None and
            origin_req.get_selector() == "/robots.txt" and
            origin_req.get_host() == host
            # NOTE(review): closing paren and "return request" truncated.

        if host != self._host:
            # new host: fetch and cache its robots.txt rules
            self.rfp = self.rfp_class()
            # NOTE(review): try: line truncated before set_opener.
            self.rfp.set_opener(self.parent)
        except AttributeError:
            debug("%r instance does not support set_opener" %
                  # NOTE(review): format argument truncated
            self.rfp.set_url(scheme+"://"+host+"/robots.txt")
            # NOTE(review): self.rfp.read() and "self._host = host"
            # presumably truncated here.

        ua = request.get_header("User-agent", "")
        if self.rfp.can_fetch(ua, request.get_full_url()):
            # fetch allowed -- NOTE(review): "return request" truncated.
        # XXX This should really have raised URLError.  Too late now...
        msg = "request disallowed by robots.txt"
        raise RobotExclusionError(
            request.get_full_url(),
            # NOTE(review): code and msg arguments truncated here.
            self.http_response_class(StringIO()), StringIO(msg))

    https_request = http_request
class HTTPRefererProcessor(BaseHandler):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests only (so, for example, if you use a single
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page, this will break).

    There's a proper implementation of this in mechanize.Browser.

    """
    def __init__(self):
        # URL of the most recently seen response; http_request reads this
        # attribute, so it must exist before the first request is processed.
        self.referer = None

    def http_request(self, request):
        # Attach the remembered referer, unless the caller set one already.
        if ((self.referer is not None) and
            not request.has_header("Referer")):
            request.add_unredirected_header("Referer", self.referer)
        # urllib2's processor protocol requires the request to be returned.
        return request

    def http_response(self, request, response):
        # Remember this page's URL as the referer for the next request.
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response
def clean_refresh_url(url):
    """Strip surrounding quotes from a Refresh-header URL, then clean it.

    e.g. Firefox 1.5 does (something like) this -- it quotes the URL.
    """
    if ((url.startswith('"') and url.endswith('"')) or
        (url.startswith("'") and url.endswith("'"))):
        # drop the matching surrounding quotes
        url = url[1:-1]
    return _rfc3986.clean_url(url, "latin-1")  # XXX encoding
def parse_refresh_header(refresh):
    """Parse an HTTP Refresh header value into a (pause, url) pair.

    url is None when the header carries no URL (refresh current page).
    Raises ValueError when the pause is not a number or the part after
    ";" is not a url=... specification.

    >>> parse_refresh_header("1; url=http://example.com/")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1; url='http://example.com/'")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1")
    (1.0, None)
    >>> parse_refresh_header("blah")
    Traceback (most recent call last):
    ValueError: invalid literal for float(): blah

    """
    ii = refresh.find(";")
    if ii != -1:
        # "pause; url=..." form
        pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
        jj = newurl_spec.find("=")
        key = None
        if jj != -1:
            key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
            newurl = clean_refresh_url(newurl)
        if key is None or key.strip().lower() != "url":
            # the part after ";" was not a url=... specification
            raise ValueError()
    else:
        # bare pause with no URL: refresh the current page
        pause, newurl = float(refresh), None
    return pause, newurl
class HTTPRefreshProcessor(BaseHandler):
    """Perform HTTP Refresh redirections.

    Note that if a non-200 HTTP code has occurred (for example, a 30x
    redirect), this processor will do nothing.

    By default, only zero-time Refresh headers are redirected.  Use the
    max_time attribute / constructor argument to allow Refresh with longer
    pauses.  Use the honor_time attribute / constructor argument to control
    whether the requested pause is honoured (with a time.sleep()) or
    skipped in favour of immediate redirection.

    Public attributes:

    max_time: see above
    honor_time: see above
    """
    def __init__(self, max_time=0, honor_time=True):
        self.max_time = max_time      # longest pause (seconds) we follow
        self.honor_time = honor_time  # whether to actually sleep the pause

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code == 200 and hdrs.has_key("refresh"):
            refresh = hdrs.getheaders("refresh")[0]
            # NOTE(review): the try:/except ValueError wrapping the parse
            # is truncated in this copy; the debug call below is the
            # except body.
                pause, newurl = parse_refresh_header(refresh)
                debug("bad Refresh header: %r" % refresh)
                # NOTE(review): "if newurl is None:" guard truncated -- a
                # Refresh without a URL reloads the current page.
                newurl = response.geturl()
            if (self.max_time is None) or (pause <= self.max_time):
                if pause > 1E-3 and self.honor_time:
                    # NOTE(review): time.sleep(pause) presumably truncated.
                # Turn the Refresh into a redirect by synthesizing a
                # Location header and routing through the error machinery.
                hdrs["location"] = newurl
                # hardcoded http is NOT a bug
                response = self.parent.error(
                    "http", request, response,
                    "refresh", msg, hdrs)
        # NOTE(review): "return response" presumably truncated.

    https_response = http_response
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses.

    The purpose of this handler is to allow other response processors a
    look-in by removing the call to parent.error() from the HTTP handler.

    For non-200 error codes, this just passes the job on to the
    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
    method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
    HTTPError if no other handler handles the error.

    """
    handler_order = 1000  # after all other processors

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # Per the class docstring, only non-200 responses are routed into
        # the error machinery; successful responses pass straight through.
        if code != 200:
            # hardcoded http is NOT a bug
            response = self.parent.error(
                "http", request, response, code, msg, hdrs)

        return response

    https_response = http_response
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: turn any unhandled HTTP error into HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # why these error methods took the code, msg, headers args in the first
        # place rather than a response object, I don't know, but to avoid
        # multiple wrapping, we're discarding them
        if isinstance(fp, urllib2.HTTPError):
            # already an HTTPError: re-raise it rather than wrapping again
            response = fp
        else:
            response = urllib2.HTTPError(
                req.get_full_url(), code, msg, hdrs, fp)
        # sanity-check that nothing was lost by discarding the args
        assert code == response.code
        assert msg == response.msg
        assert hdrs == response.hdrs
        raise response
class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for the HTTP and HTTPS handlers.

    NOTE(review): several statements in this copy have lost their
    surrounding control flow to truncation; see inline notes.
    """

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel  # forwarded to httplib connections

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        # Pre-process an outgoing request: fill in default headers.
        host = request.get_host()
        # NOTE(review): "if not host:" guard truncated before this raise.
        raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    # NOTE(review): the 'Content-type' header-name argument
                    # is truncated before this value.
                    'application/x-www-form-urlencoded')
            # NOTE(review): the Content-length default is presumably
            # truncated here.

        scheme, sel = urllib.splittype(request.get_selector())
        sel_host, sel_path = urllib.splithost(sel)
        if not request.has_header('Host'):
            # prefer the host embedded in the selector (absolute URL form)
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)
        # NOTE(review): "return request" presumably truncated.

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        # NOTE(review): "if not host:" guard truncated before this raise.
        raise URLError('no host given')

        h = http_class(host)  # will parse host:port
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # NOTE(review): the "headers = dict(" title-casing wrapper is
        # truncated before this list comprehension.
            [(name.title(), val) for name, val in headers.items()])
        # NOTE(review): try: line truncated before the request call.
            h.request(req.get_method(), req.get_selector(), req.data, headers)
        except socket.error, err:  # XXX what error?
            # NOTE(review): except body (raise URLError(err)) and the
            # "r = h.getresponse()" call are truncated.

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.
        # NOTE(review): "r.recv = r.read" presumably truncated.
        fp = socket._fileobject(r)

        resp = closeable_response(fp, r.msg, req.get_full_url(),
                                  # NOTE(review): status/reason arguments
                                  # and "return resp" are truncated.
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs using httplib."""

    def http_open(self, req):
        # Delegate to the shared do_open machinery, using a plain
        # (non-SSL) connection class.
        connection_class = httplib.HTTPConnection
        return self.do_open(connection_class, req)

    # Request preprocessing (default headers etc.) is identical for
    # HTTP and HTTPS, so reuse the shared implementation directly.
    http_request = AbstractHTTPHandler.do_request_
# HTTPS support only exists when Python's httplib was built with SSL.
if hasattr(httplib, 'HTTPS'):

    class HTTPSConnectionFactory:
        """Factory producing HTTPSConnections with a fixed client key/cert."""
        def __init__(self, key_file, cert_file):
            self._key_file = key_file
            self._cert_file = cert_file
        def __call__(self, hostport):
            # NOTE(review): the hostport positional argument appears to be
            # truncated from this call in this copy.
            return httplib.HTTPSConnection(
                key_file=self._key_file, cert_file=self._cert_file)

    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs, with optional client-certificate support."""
        def __init__(self, client_cert_manager=None):
            AbstractHTTPHandler.__init__(self)
            # object exposing .find_key_cert(...) -> (key_file, cert_file);
            # NOTE(review): interface inferred from use below -- confirm.
            self.client_cert_manager = client_cert_manager

        def https_open(self, req):
            if self.client_cert_manager is not None:
                # look up the client certificate for this request
                key_file, cert_file = self.client_cert_manager.find_key_cert(
                    # NOTE(review): the argument (presumably the request
                    # URL) is truncated here.
                conn_factory = HTTPSConnectionFactory(key_file, cert_file)
            # NOTE(review): else: line truncated before the default below.
                conn_factory = httplib.HTTPSConnection
            return self.do_open(conn_factory, req)

        # shared request preprocessing, identical to plain HTTP
        https_request = AbstractHTTPHandler.do_request_