1 """HTTP related handlers.
3 Note that some other HTTP handlers live in more specific modules: _auth.py,
4 _gzip.py, etc.
7 Copyright 2002-2006 John J Lee <jjl@pobox.com>
9 This code is free software; you can redistribute it and/or modify it
10 under the terms of the BSD or ZPL 2.1 licenses (see the file
11 COPYING.txt included with the distribution).
13 """
15 import copy, time, tempfile, htmlentitydefs, re, logging, socket, \
16 urllib2, urllib, httplib, sgmllib
17 from urllib2 import URLError, HTTPError, BaseHandler
18 from cStringIO import StringIO
20 from _request import Request
21 from _util import isstringlike
22 from _response import closeable_response, response_seek_wrapper
23 from _html import unescape, unescape_charref
24 from _headersutil import is_html
25 from _clientcookie import CookieJar, request_host
26 import _rfc3986
28 debug = logging.getLogger("mechanize").debug
30 # monkeypatch urllib2.HTTPError to show URL
31 ## def urllib2_str(self):
32 ## return 'HTTP Error %s: %s (%s)' % (
33 ## self.code, self.msg, self.geturl())
34 ## urllib2.HTTPError.__str__ = urllib2_str

CHUNK = 1024  # size of chunks fed to HTML HEAD parser, in bytes
DEFAULT_ENCODING = 'latin-1'


# This adds "refresh" to the list of redirectables and provides a redirection
# algorithm that doesn't go into a loop in the presence of cookies
# (Python 2.4 has this new algorithm, 2.3 doesn't).
class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    # Implementation notes:

    # To avoid the server sending us into an infinite loop, the request
    # object needs to track what URLs we have already seen.  Do this by
    # adding a handler-specific attribute (a dict) to the Request object.
    # The value of the dict is used to count the number of times the same
    # URL has been visited.  This is needed because visiting the same URL
    # twice does not necessarily imply a loop, thanks to state introduced
    # by cookies.

    # Always unhandled redirection codes:
    # 300 Multiple Choices: should not handle this here.
    # 304 Not Modified: no need to handle here: only of interest to caches
    #     that do conditional GETs
    # 305 Use Proxy: probably not worth dealing with here
    # 306 Unused: what was this for in previous versions of the protocol?

    def redirect_request(self, newurl, req, fp, code, msg, headers):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a
        new Request to allow http_error_30x to perform the redirect;
        otherwise, return None to indicate that an HTTPError should be
        raised.

        """
        if code in (301, 302, 303, "refresh") or \
               (code == 307 and not req.has_data()):
            # Strictly (according to RFC 2616), 301 or 302 in response to
            # a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we do
            # the same.
            # XXX really refresh redirections should be visiting; tricky to
            # fix, so this will wait until post-stable release
            new = Request(newurl,
                          headers=req.headers,
                          origin_req_host=req.get_origin_req_host(),
                          unverifiable=True,
                          visit=False,
                          )
            new._origin_req = getattr(req, "_origin_req", req)
            return new
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if headers.has_key('location'):
            newurl = headers.getheaders('location')[0]
        elif headers.has_key('uri'):
            newurl = headers.getheaders('uri')[0]
        else:
            return
        newurl = _rfc3986.clean_url(newurl, "latin-1")
        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(newurl, req, fp, code, msg, headers)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
    http_error_refresh = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

# XXX would self.reset() work, instead of raising this exception?
class EndOfHeadError(Exception): pass
class AbstractHeadParser:
    # only these elements are allowed in or before HEAD of document
    head_elems = ("html", "head",
                  "title", "base",
                  "script", "style", "meta", "link", "object")
    _entitydefs = htmlentitydefs.name2codepoint
    _encoding = DEFAULT_ENCODING

    def __init__(self):
        self.http_equiv = []

    def start_meta(self, attrs):
        http_equiv = content = None
        for key, value in attrs:
            if key == "http-equiv":
                http_equiv = self.unescape_attr_if_required(value)
            elif key == "content":
                content = self.unescape_attr_if_required(value)
        if http_equiv is not None and content is not None:
            self.http_equiv.append((http_equiv, content))

    def end_head(self):
        raise EndOfHeadError()

    def handle_entityref(self, name):
        #debug("%s", name)
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        #debug("%s", name)
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        #debug("%s", name)
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        #debug("%s", attrs)
        escaped_attrs = {}
        for key, val in attrs.items():
            escaped_attrs[key] = self.unescape_attr(val)
        return escaped_attrs

    def unknown_entityref(self, ref):
        self.handle_data("&%s;" % ref)

    def unknown_charref(self, ref):
        self.handle_data("&#%s;" % ref)

try:
    import HTMLParser
except ImportError:
    pass
else:
    class XHTMLCompatibleHeadParser(AbstractHeadParser,
                                    HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            AbstractHeadParser.__init__(self)

        def handle_starttag(self, tag, attrs):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'start_' + tag)
            except AttributeError:
                try:
                    method = getattr(self, 'do_' + tag)
                except AttributeError:
                    pass  # unknown tag
                else:
                    method(attrs)
            else:
                method(attrs)

        def handle_endtag(self, tag):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                pass  # unknown tag
            else:
                method()

        def unescape(self, name):
            # Use the entitydefs passed into constructor, not
            # HTMLParser.HTMLParser's entitydefs.
            return self.unescape_attr(name)

        def unescape_attr_if_required(self, name):
            return name  # HTMLParser.HTMLParser already did it

class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):

    def _not_called(self):
        assert False

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        AbstractHeadParser.__init__(self)

    def handle_starttag(self, tag, method, attrs):
        if tag not in self.head_elems:
            raise EndOfHeadError()
        if tag == "meta":
            method(attrs)

    def unknown_starttag(self, tag, attrs):
        self.handle_starttag(tag, self._not_called, attrs)

    def handle_endtag(self, tag, method):
        if tag in self.head_elems:
            method()
        else:
            raise EndOfHeadError()

    def unescape_attr_if_required(self, name):
        return self.unescape_attr(name)

def parse_head(fileobj, parser):
    """Return a list of key, value pairs."""
    while 1:
        data = fileobj.read(CHUNK)
        try:
            parser.feed(data)
        except EndOfHeadError:
            break
        if len(data) != CHUNK:
            # this should only happen if there is no HTML body, or if
            # CHUNK is big
            break
    return parser.http_equiv
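
# Usage sketch (illustrative, not part of the original source): feed an HTML
# document to parse_head() through a file-like object and collect the
# http-equiv/content pairs declared in its <head>:
## from cStringIO import StringIO
## html = StringIO('<html><head>'
##                 '<meta http-equiv="Content-Type" content="text/html">'
##                 '</head><body>hello</body></html>')
## print parse_head(html, HeadParser())  # [('Content-Type', 'text/html')]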

class HTTPEquivProcessor(BaseHandler):
    """Append META HTTP-EQUIV headers to regular HTTP headers."""

    handler_order = 300  # before handlers that look at HTTP headers

    def __init__(self, head_parser_class=HeadParser,
                 i_want_broken_xhtml_support=False,
                 ):
        self.head_parser_class = head_parser_class
        self._allow_xhtml = i_want_broken_xhtml_support

    def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    response.seek(0)
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response

    https_response = http_response
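
# Usage sketch (illustrative, not part of the original source): with the
# processor installed, <meta http-equiv=...> declarations are visible as
# ordinary response headers.  The URL is hypothetical.
## import mechanize
## opener = mechanize.build_opener(HTTPEquivProcessor())
## response = opener.open("http://example.com/")
## print response.info().getheaders("content-type")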

class HTTPCookieProcessor(BaseHandler):
    """Handle HTTP cookies.

    Public attributes:

    cookiejar: CookieJar instance

    """
    def __init__(self, cookiejar=None):
        if cookiejar is None:
            cookiejar = CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
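
# Usage sketch (illustrative, not part of the original source): share one
# CookieJar across requests so cookies set by one response are returned
# with later requests.  The URLs are hypothetical.
## import mechanize
## cj = CookieJar()
## opener = mechanize.build_opener(HTTPCookieProcessor(cj))
## opener.open("http://example.com/login")
## opener.open("http://example.com/private")  # sends cookies from the jar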

try:
    import robotparser
except ImportError:
    pass
else:
    class MechanizeRobotFileParser(robotparser.RobotFileParser):

        def __init__(self, url='', opener=None):
            import _opener
            robotparser.RobotFileParser.__init__(self, url)
            self._opener = opener

        def set_opener(self, opener=None):
            # local import (as in __init__) to avoid a circular import at
            # module load time; without it this method raises NameError
            import _opener
            if opener is None:
                opener = _opener.OpenerDirector()
            self._opener = opener

        def read(self):
            """Reads the robots.txt URL and feeds it to the parser."""
            if self._opener is None:
                self.set_opener()
            req = Request(self.url, unverifiable=True, visit=False)
            try:
                f = self._opener.open(req)
            except HTTPError, f:
                pass
            except (IOError, socket.error, OSError), exc:
                robotparser._debug("ignoring error opening %r: %s" %
                                   (self.url, exc))
                return
            lines = []
            line = f.readline()
            while line:
                lines.append(line.strip())
                line = f.readline()
            status = f.code
            if status == 401 or status == 403:
                self.disallow_all = True
                robotparser._debug("disallow all")
            elif status >= 400:
                self.allow_all = True
                robotparser._debug("allow all")
            elif status == 200 and lines:
                robotparser._debug("parse lines")
                self.parse(lines)
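
# Usage sketch (illustrative, not part of the original source): fetch and
# query a site's robots.txt directly.  The URL and user-agent are
# hypothetical.
## rfp = MechanizeRobotFileParser("http://example.com/robots.txt")
## rfp.read()
## print rfp.can_fetch("my-user-agent", "http://example.com/some/page")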

class RobotExclusionError(urllib2.HTTPError):
    def __init__(self, request, *args):
        urllib2.HTTPError.__init__(self, *args)
        self.request = request

class HTTPRobotRulesProcessor(BaseHandler):
    # before redirections, after everything else
    handler_order = 800

    try:
        from httplib import HTTPMessage
    except ImportError:
        from mimetools import Message
        http_response_class = Message
    else:
        http_response_class = HTTPMessage

    def __init__(self, rfp_class=MechanizeRobotFileParser):
        self.rfp_class = rfp_class
        self.rfp = None
        self._host = None

    def http_request(self, request):
        scheme = request.get_type()
        if scheme not in ["http", "https"]:
            # robots exclusion only applies to HTTP
            return request

        if request.get_selector() == "/robots.txt":
            # /robots.txt is always OK to fetch
            return request

        host = request.get_host()

        # robots.txt requests don't need to be allowed by robots.txt :-)
        origin_req = getattr(request, "_origin_req", None)
        if (origin_req is not None and
            origin_req.get_selector() == "/robots.txt" and
            origin_req.get_host() == host
            ):
            return request

        if host != self._host:
            self.rfp = self.rfp_class()
            try:
                self.rfp.set_opener(self.parent)
            except AttributeError:
                debug("%r instance does not support set_opener" %
                      self.rfp.__class__)
            self.rfp.set_url(scheme+"://"+host+"/robots.txt")
            self.rfp.read()
            self._host = host

        ua = request.get_header("User-agent", "")
        if self.rfp.can_fetch(ua, request.get_full_url()):
            return request
        else:
            # XXX This should really have raised URLError.  Too late now...
            msg = "request disallowed by robots.txt"
            raise RobotExclusionError(
                request,
                request.get_full_url(),
                403, msg,
                self.http_response_class(StringIO()), StringIO(msg))

    https_request = http_request
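
# Usage sketch (illustrative, not part of the original source): with the
# processor installed, fetching a URL disallowed by the site's robots.txt
# raises RobotExclusionError.  The URL is hypothetical.
## import mechanize
## opener = mechanize.build_opener(HTTPRobotRulesProcessor())
## try:
##     opener.open("http://example.com/disallowed-page")
## except RobotExclusionError, exc:
##     print "blocked:", exc.request.get_full_url()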

class HTTPRefererProcessor(BaseHandler):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests only (so, for example, if you use a single
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page, this will break).

    There's a proper implementation of this in mechanize.Browser.

    """
    def __init__(self):
        self.referer = None

    def http_request(self, request):
        if ((self.referer is not None) and
            not request.has_header("Referer")):
            request.add_unredirected_header("Referer", self.referer)
        return request

    def http_response(self, request, response):
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response
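
# Usage sketch (illustrative, not part of the original source): each
# response URL becomes the Referer of the next request in the chain.
# The URLs are hypothetical.
## import mechanize
## opener = mechanize.build_opener(HTTPRefererProcessor())
## opener.open("http://example.com/page1")
## opener.open("http://example.com/page2")  # sent with Referer: .../page1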

def clean_refresh_url(url):
    # e.g. Firefox 1.5 does (something like) this
    if ((url.startswith('"') and url.endswith('"')) or
        (url.startswith("'") and url.endswith("'"))):
        url = url[1:-1]
    return _rfc3986.clean_url(url, "latin-1")  # XXX encoding

def parse_refresh_header(refresh):
    """
    >>> parse_refresh_header("1; url=http://example.com/")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1; url='http://example.com/'")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1")
    (1.0, None)
    >>> parse_refresh_header("blah")
    Traceback (most recent call last):
    ValueError: invalid literal for float(): blah

    """
    ii = refresh.find(";")
    if ii != -1:
        pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
        jj = newurl_spec.find("=")
        key = None
        if jj != -1:
            key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
            newurl = clean_refresh_url(newurl)
        if key is None or key.strip().lower() != "url":
            raise ValueError()
    else:
        pause, newurl = float(refresh), None
    return pause, newurl

class HTTPRefreshProcessor(BaseHandler):
    """Perform HTTP Refresh redirections.

    Note that if a non-200 HTTP code has occurred (for example, a 30x
    redirect), this processor will do nothing.

    By default, only zero-time Refresh headers are redirected.  Use the
    max_time attribute / constructor argument to allow Refresh with longer
    pauses.  Use the honor_time attribute / constructor argument to control
    whether the requested pause is honoured (with a time.sleep()) or
    skipped in favour of immediate redirection.

    Public attributes:

    max_time: see above
    honor_time: see above

    """
    handler_order = 1000

    def __init__(self, max_time=0, honor_time=True):
        self.max_time = max_time
        self.honor_time = honor_time

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code == 200 and hdrs.has_key("refresh"):
            refresh = hdrs.getheaders("refresh")[0]
            try:
                pause, newurl = parse_refresh_header(refresh)
            except ValueError:
                debug("bad Refresh header: %r" % refresh)
                return response
            if newurl is None:
                newurl = response.geturl()
            if (self.max_time is None) or (pause <= self.max_time):
                if pause > 1E-3 and self.honor_time:
                    time.sleep(pause)
                hdrs["location"] = newurl
                # hardcoded http is NOT a bug
                response = self.parent.error(
                    "http", request, response,
                    "refresh", msg, hdrs)

        return response

    https_response = http_response
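
# Usage sketch (illustrative, not part of the original source): follow
# Refresh headers that request pauses of up to 30 seconds, without
# actually sleeping.  The URL is hypothetical.
## import mechanize
## opener = mechanize.build_opener(
##     HTTPRefreshProcessor(max_time=30, honor_time=False))
## response = opener.open("http://example.com/")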

class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses.

    The purpose of this handler is to allow other response processors a
    look-in by removing the call to parent.error() from
    AbstractHTTPHandler.

    For non-200 error codes, this just passes the job on to the
    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
    method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
    HTTPError if no other handler handles the error.

    """
    handler_order = 1000  # after all other processors

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code != 200:
            # hardcoded http is NOT a bug
            response = self.parent.error(
                "http", request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        # why these error methods took the code, msg, headers args in the
        # first place rather than a response object, I don't know, but to
        # avoid multiple wrapping, we're discarding them

        if isinstance(fp, urllib2.HTTPError):
            response = fp
        else:
            response = urllib2.HTTPError(
                req.get_full_url(), code, msg, hdrs, fp)
            assert code == response.code
            assert msg == response.msg
            assert hdrs == response.hdrs
        raise response

class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')

        scheme, sel = urllib.splittype(request.get_selector())
        sel_host, sel_path = urllib.splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code

        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host)  # will parse host:port
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            [(name.title(), val) for name, val in headers.items()])
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err:  # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r)

        resp = closeable_response(fp, r.msg, req.get_full_url(),
                                  r.status, r.reason)
        return resp

class HTTPHandler(AbstractHTTPHandler):
    def http_open(self, req):
        return self.do_open(httplib.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(httplib, 'HTTPS'):

    class HTTPSConnectionFactory:
        def __init__(self, key_file, cert_file):
            self._key_file = key_file
            self._cert_file = cert_file
        def __call__(self, hostport):
            return httplib.HTTPSConnection(
                hostport,
                key_file=self._key_file, cert_file=self._cert_file)

    class HTTPSHandler(AbstractHTTPHandler):
        def __init__(self, client_cert_manager=None):
            AbstractHTTPHandler.__init__(self)
            self.client_cert_manager = client_cert_manager

        def https_open(self, req):
            if self.client_cert_manager is not None:
                key_file, cert_file = self.client_cert_manager.find_key_cert(
                    req.get_full_url())
                conn_factory = HTTPSConnectionFactory(key_file, cert_file)
            else:
                conn_factory = httplib.HTTPSConnection
            return self.do_open(conn_factory, req)

        https_request = AbstractHTTPHandler.do_request_
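
    # Usage sketch (illustrative, not part of the original source): any
    # object with a find_key_cert(url) -> (key_file, cert_file) method can
    # act as a client_cert_manager.  FixedCertManager and the file names
    # below are hypothetical.
    ## class FixedCertManager:
    ##     def find_key_cert(self, url):
    ##         # same key/cert pair for every URL
    ##         return "client.key", "client.crt"
    ## handler = HTTPSHandler(client_cert_manager=FixedCertManager())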