Implemented "mark all as read".
[straw.git] / straw / httplib2 / __init__.py
blob982bf8a02af7a87cb260b63a99792492853ef316
1 from __future__ import generators
2 """
3 httplib2
5 A caching http interface that supports ETags and gzip
6 to conserve bandwidth.
8 Requires Python 2.3 or later
10 Changelog:
11 2007-08-18, Rick: Modified so it's able to use a socks proxy if needed.
13 """
15 __author__ = "Joe Gregorio (joe@bitworking.org)"
16 __copyright__ = "Copyright 2006, Joe Gregorio"
17 __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
18 "James Antill",
19 "Xavier Verges Farrero",
20 "Jonathan Feinberg",
21 "Blair Zajac",
22 "Sam Ruby",
23 "Louis Nyffenegger"]
24 __license__ = "MIT"
25 __version__ = "$Rev: 259 $"
27 import re
28 import sys
29 import md5
30 import email
31 import email.Utils
32 import email.Message
33 import StringIO
34 import gzip
35 import zlib
36 import httplib
37 import urlparse
38 import base64
39 import os
40 import copy
41 import calendar
42 import time
43 import random
44 import sha
45 import hmac
46 from gettext import gettext as _
47 import socket
49 try:
50 import socks
51 except ImportError:
52 socks = None
# IRI->URI conversion: Python 2.3+ uses the real implementation from the
# bundled iri2uri module; older versions fall back to an identity function.
if sys.version_info >= (2,3):
    from iri2uri import iri2uri
else:
    def iri2uri(uri):
        # Pre-2.3 fallback: pass the IRI through unchanged.
        return uri
60 __all__ = ['Http', 'Response', 'ProxyInfo', 'HttpLib2Error',
61 'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
62 'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
63 'debuglevel']
66 # The httplib debug level, set to a non-zero value to get debug output
67 debuglevel = 0
# Python 2.3 support
if sys.version_info < (2,4):
    def sorted(seq):
        # Minimal stand-in for the 2.4 builtin: sorts the list in place and
        # returns that same list (the real builtin returns a new list;
        # callers in this module do not rely on the difference).
        seq.sort()
        return seq
75 # Python 2.3 support
def HTTPResponse__getheaders(self):
    """Return the response headers as a list of (header, value) tuples.

    Backport of the getheaders() method that httplib.HTTPResponse grew in
    Python 2.4; raises ResponseNotReady when no response has arrived yet.
    """
    msg = self.msg
    if msg is None:
        raise httplib.ResponseNotReady()
    return msg.items()
# Install the backport only when the running httplib lacks getheaders()
# (i.e. Python < 2.4); never shadow a native implementation.
if not hasattr(httplib.HTTPResponse, 'getheaders'):
    httplib.HTTPResponse.getheaders = HTTPResponse__getheaders
# All exceptions raised here derive from HttpLib2Error
class HttpLib2Error(Exception): pass

# Some exceptions can be caught and optionally
# be turned back into responses.
class HttpLib2ErrorWithResponse(HttpLib2Error):
    def __init__(self, desc, response, content):
        # Keep the offending response and body around so a caller that
        # opts into error-to-response conversion can recover them.
        self.response = response
        self.content = content
        HttpLib2Error.__init__(self, desc)

# Failures that occur while a response is in hand (carry response/content).
class RedirectMissingLocation(HttpLib2ErrorWithResponse): pass
class RedirectLimit(HttpLib2ErrorWithResponse): pass
class FailedToDecompressContent(HttpLib2ErrorWithResponse): pass
class UnimplementedDigestAuthOptionError(HttpLib2ErrorWithResponse): pass
class UnimplementedHmacDigestAuthOptionError(HttpLib2ErrorWithResponse): pass

# Failures raised before any response exists.
class RelativeURIError(HttpLib2Error): pass
class ServerNotFoundError(HttpLib2Error): pass
105 # Open Items:
106 # -----------
107 # Proxy support
109 # Are we removing the cached content too soon on PUT (only delete on 200 Maybe?)
111 # Pluggable cache storage (supports storing the cache in
112 # flat files by default. We need a plug-in architecture
113 # that can support Berkeley DB and Squid)
115 # == Known Issues ==
116 # Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator.
117 # Does not handle Cache-Control: max-stale
118 # Does not use Age: headers when calculating cache freshness.
121 # The number of redirections to follow before giving up.
122 # Note that only GET redirects are automatically followed.
123 # Will also honor 301 requests by saving that info and never
124 # requesting that URI again.
125 DEFAULT_MAX_REDIRECTS = 5
# Which headers are hop-by-hop headers by default
HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade']

def _get_end2end_headers(response):
    """Return the names of the end-to-end headers in *response*.

    Excludes the default hop-by-hop headers plus any header that the
    Connection header itself declares to be hop-by-hop.
    """
    excluded = set(HOP_BY_HOP)
    for name in response.get('connection', '').split(','):
        excluded.add(name.strip())
    return [header for header in response.keys() if header not in excluded]
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

def parse_uri(uri):
    """Split *uri* into (scheme, authority, path, query, fragment).

    Uses the regular expression given in Appendix B of RFC 3986; any
    component that is absent from the URI comes back as None.
    """
    match = URI.match(uri)
    scheme = match.group(2)
    authority = match.group(4)
    path = match.group(5)
    query = match.group(7)
    fragment = match.group(9)
    return (scheme, authority, path, query, fragment)
def urlnorm(uri):
    """Normalize *uri* for use as a cache key.

    Returns (scheme, authority, request_uri, defrag_uri) where request_uri
    is the path plus any query string, and defrag_uri is the absolute URI
    with the fragment dropped.

    Raises RelativeURIError for relative URIs, which cannot be requested.
    """
    (scheme, authority, path, query, fragment) = parse_uri(uri)
    if not scheme or not authority:
        raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri)
    # Scheme and authority are case-insensitive; normalize to lower case.
    # (Fix: the original lower-cased the scheme twice.)
    authority = authority.lower()
    scheme = scheme.lower()
    if not path:
        path = "/"
    # Could do syntax based normalization of the URI before
    # computing the digest. See Section 6.2.2 of Std 66.
    request_uri = query and "?".join([path, query]) or path
    defrag_uri = scheme + "://" + authority + request_uri
    return scheme, authority, request_uri, defrag_uri
# Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/)
re_url_scheme = re.compile(r'^\w+://')
re_slash = re.compile(r'[?/:|]+')

def safename(filename):
    """Return a filename suitable for the cache.

    Strips dangerous and common characters to create a filename we
    can use to store the cache in.
    """
    try:
        # Attempt to IDNA-encode URL-shaped names (byte strings are decoded
        # as UTF-8 first); failures simply fall through and the name is
        # used as-is below.
        if re_url_scheme.match(filename):
            if isinstance(filename,str):
                filename = filename.decode('utf-8')
                filename = filename.encode('idna')
            else:
                filename = filename.encode('idna')
    except UnicodeError:
        pass
    if isinstance(filename,unicode):
        filename=filename.encode('utf-8')
    # Hash the full (pre-stripping) name so distinct URIs that strip or
    # truncate to the same prefix still get distinct cache files.
    filemd5 = md5.new(filename).hexdigest()
    filename = re_url_scheme.sub("", filename)
    filename = re_slash.sub(",", filename)

    # limit length of filename
    if len(filename)>200:
        filename=filename[:200]
    return ",".join((filename, filemd5))
NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
def _normalize_headers(headers):
    """Return a copy of *headers* with lower-cased names and with each
    value's linear white space (runs of space/tab, including header
    folding) collapsed to a single space.

    Bug fix: the pattern and replacement arguments to re.sub were
    swapped (`NORMALIZE_SPACE.sub(value, ' ')`), so values came back
    unnormalized and values containing backslash sequences could raise.
    """
    return dict([(key.lower(), NORMALIZE_SPACE.sub(' ', value).strip()) for (key, value) in headers.iteritems()])
def _parse_cache_control(headers):
    """Parse the Cache-Control header of *headers* into a dict.

    Directives with an argument map to the argument string (for example
    {'max-age': '3600'}); bare directives map to 1.  Returns an empty
    dict when the header is absent.
    """
    retval = {}
    if headers.has_key('cache-control'):
        for part in headers['cache-control'].split(','):
            if "=" in part:
                name, value = [x.strip() for x in part.split("=")]
            else:
                name, value = part.strip(), 1
            retval[name] = value
    return retval
# Whether to use a strict mode to parse WWW-Authenticate headers
# Might lead to bad results in case of ill-formed header value,
# so disabled by default, falling back to relaxed parsing.
# Set to true to turn on, useful for testing servers.
USE_WWW_AUTH_STRICT_PARSING = 0

# In regex below:
# [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+ matches a "token" as defined by HTTP
# "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?" matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space
# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both:
# \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x08\x0A-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?
WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$")
WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$")
UNQUOTE_PAIRS = re.compile(r'\\(.)')
def _parse_www_authenticate(headers, headername='www-authenticate'):
    """Returns a dictionary of dictionaries, one dict per auth_scheme.

    Each inner dict maps lower-cased auth-param names to their (unquoted)
    values.  Authentication-Info headers carry no scheme name and are
    filed under 'digest'.
    """
    retval = {}
    if headers.has_key(headername):
        authenticate = headers[headername].strip()
        www_auth = USE_WWW_AUTH_STRICT_PARSING and WWW_AUTH_STRICT or WWW_AUTH_RELAXED
        while authenticate:
            # Break off the scheme at the beginning of the line
            if headername == 'authentication-info':
                (auth_scheme, the_rest) = ('digest', authenticate)
            else:
                # NOTE(review): raises ValueError for a challenge that is a
                # bare scheme with no parameters (no space present) --
                # confirm callers never see such challenges.
                (auth_scheme, the_rest) = authenticate.split(" ", 1)
            # Now loop over all the key value pairs that come after the scheme,
            # being careful not to roll into the next scheme
            match = www_auth.search(the_rest)
            auth_params = {}
            while match:
                if match and len(match.groups()) == 3:
                    (key, value, the_rest) = match.groups()
                    # Unquote any \-escaped pairs inside quoted-string values.
                    auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value)
                match = www_auth.search(the_rest)
            retval[auth_scheme.lower()] = auth_params
            authenticate = the_rest.strip()
    return retval
def _entry_disposition(response_headers, request_headers):
    """Determine freshness from the Date, Expires and Cache-Control headers.

    Returns one of:
      "FRESH"       -- the cached entry may be used as-is
      "STALE"       -- the cached entry must be revalidated
      "TRANSPARENT" -- bypass the cache entirely

    We don't handle the following:

    1. Cache-Control: max-stale
    2. Age: headers are not used in the calculations.

    Not that this algorithm is simpler than you might think
    because we are operating as a private (non-shared) cache.
    This lets us ignore 's-maxage'. We can also ignore
    'proxy-invalidate' since we aren't a proxy.
    We will never return a stale document as
    fresh as a design decision, and thus the non-implementation
    of 'max-stale'. This also lets us safely ignore 'must-revalidate'
    since we operate as if every server has sent 'must-revalidate'.
    Since we are private we get to ignore both 'public' and
    'private' parameters. We also ignore 'no-transform' since
    we don't do any transformations.
    The 'no-store' parameter is handled at a higher level.
    So the only Cache-Control parameters we look at are:

        no-cache
        only-if-cached
        max-age
        min-fresh
    """
    retval = "STALE"
    cc = _parse_cache_control(request_headers)
    cc_response = _parse_cache_control(response_headers)

    if request_headers.has_key('pragma') and request_headers['pragma'].lower().find('no-cache') != -1:
        retval = "TRANSPARENT"
        if 'cache-control' not in request_headers:
            request_headers['cache-control'] = 'no-cache'
    elif cc.has_key('no-cache'):
        retval = "TRANSPARENT"
    elif cc_response.has_key('no-cache'):
        retval = "STALE"
    elif cc.has_key('only-if-cached'):
        retval = "FRESH"
    elif response_headers.has_key('date'):
        # Freshness lifetime: response max-age wins, then Expires - Date,
        # else zero (immediately stale).
        date = calendar.timegm(email.Utils.parsedate_tz(response_headers['date']))
        now = time.time()
        current_age = max(0, now - date)
        if cc_response.has_key('max-age'):
            try:
                freshness_lifetime = int(cc_response['max-age'])
            except ValueError:
                freshness_lifetime = 0
        elif response_headers.has_key('expires'):
            expires = email.Utils.parsedate_tz(response_headers['expires'])
            if None == expires:
                freshness_lifetime = 0
            else:
                freshness_lifetime = max(0, calendar.timegm(expires) - date)
        else:
            freshness_lifetime = 0
        # A max-age on the request overrides whatever the response allowed.
        if cc.has_key('max-age'):
            try:
                freshness_lifetime = int(cc['max-age'])
            except ValueError:
                freshness_lifetime = 0
        # min-fresh: the client wants the entry to still be fresh that many
        # seconds from now, so age it forward before comparing.
        if cc.has_key('min-fresh'):
            try:
                min_fresh = int(cc['min-fresh'])
            except ValueError:
                min_fresh = 0
            current_age += min_fresh
        if freshness_lifetime > current_age:
            retval = "FRESH"
    return retval
def _decompressContent(response, new_content):
    """Decompress *new_content* according to the response's
    Content-Encoding (gzip or deflate).

    On success the content-length header is corrected and the
    content-encoding header removed.  A decompression failure raises
    FailedToDecompressContent.  Returns the (possibly decompressed) body.
    """
    content = new_content
    try:
        encoding = response.get('content-encoding', None)
        if encoding in ['gzip', 'deflate']:
            if encoding == 'gzip':
                content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
            if encoding == 'deflate':
                content = zlib.decompress(content)
            response['content-length'] = str(len(content))
            del response['content-encoding']
    except (IOError, zlib.error):
        # Bug fix: zlib.decompress raises zlib.error, not IOError, so a
        # corrupt deflate body previously escaped as a raw zlib.error
        # instead of FailedToDecompressContent.
        content = ""
        raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
    return content
def _updateCache(request_headers, response_headers, content, cache, cachekey):
    """Store (or purge) *content* in *cache* under *cachekey*.

    Honors no-store from either side by deleting any existing entry.
    The cached blob is a status line plus the end-to-end headers plus the
    body, with header line endings normalized to CRLF.
    """
    if cachekey:
        cc = _parse_cache_control(request_headers)
        cc_response = _parse_cache_control(response_headers)
        if cc.has_key('no-store') or cc_response.has_key('no-store'):
            cache.delete(cachekey)
        else:
            info = email.Message.Message()
            for key, value in response_headers.iteritems():
                if key not in ['status','content-encoding','transfer-encoding']:
                    info[key] = value

            # A 304 means the cached entry is still good, so record it as
            # the 200 it stands in for.
            status = response_headers.status
            if status == 304:
                status = 200

            # Bug fix: the status line previously used the raw
            # response_headers.status, ignoring the 304->200 adjustment
            # computed just above.
            status_header = 'status: %d\r\n' % status

            header_str = info.as_string()

            # email.Message may emit bare \r or \n; the cache format wants CRLF.
            header_str = re.sub("\r(?!\n)|(?<!\r)\n", "\r\n", header_str)
            text = "".join([status_header, header_str, content])

            cache.set(cachekey, text)
def _cnonce():
    """Generate a 16-hex-digit client nonce for Digest/HMACDigest auth."""
    # NOTE(review): randrange(0, 9) yields 0-8, so the digit '9' never
    # appears in the random seed -- harmless for uniqueness, but presumably
    # unintended; upstream kept this behavior.
    dig = md5.new("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 9)] for i in range(20)])).hexdigest()
    return dig[:16]
def _wsse_username_token(cnonce, iso_now, password):
    """Return the base64-encoded SHA-1 PasswordDigest for WSSE auth."""
    digest = sha.new("%s%s%s" % (cnonce, iso_now, password)).digest()
    return base64.encodestring(digest).strip()
# For credentials we need two things, first
# a pool of credentials to try (not necessarily tied to Basic, Digest, etc.)
# Then we also need a list of URIs that have already demanded authentication
# That list is tricky since sub-URIs can take the same auth, or the
# auth scheme may change as you descend the tree.
# So we also need each Auth instance to be able to tell us
# how close to the 'top' it is.

class Authentication(object):
    """Base class for one authentication scheme bound to a (host, path) scope."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        self.path = path
        self.host = host
        self.credentials = credentials
        self.http = http

    def depth(self, request_uri):
        # Number of path segments below this auth's root path; used to pick
        # the most specific applicable Authentication object.
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        return request_uri[len(self.path):].count("/")

    def inscope(self, host, request_uri):
        # XXX Should we normalize the request_uri?
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        return (host == self.host) and path.startswith(self.path)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header. Override this in sub-classes."""
        pass

    def response(self, response, content):
        """Gives us a chance to update with new nonces
        or such returned from the last authorized response.
        Override this in sub-classes if necessary.

        Return TRUE if the request is to be retried, for
        example Digest may return stale=true.
        """
        return False
class BasicAuthentication(Authentication):
    """HTTP Basic authentication: sends base64(name:password) on every request."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        token = base64.encodestring("%s:%s" % self.credentials).strip()
        headers['authorization'] = 'Basic ' + token
class DigestAuthentication(Authentication):
    """Only do qop='auth' and MD5, since that
    is all Apache currently implements"""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['digest']
        qop = self.challenge.get('qop')
        self.challenge['qop'] = ('auth' in [x.strip() for x in qop.split()]) and 'auth' or None
        if self.challenge['qop'] is None:
            raise UnimplementedDigestAuthOptionError( _("Unsupported value for qop: %s." % qop))
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5')
        if self.challenge['algorithm'] != 'MD5':
            raise UnimplementedDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
        # A1 = username ":" realm ":" password, hashed lazily per request.
        self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]])
        # nc (nonce count) starts at 1 and increments per request issued
        # under the current server nonce.
        self.challenge['nc'] = 1

    def request(self, method, request_uri, headers, content, cnonce = None):
        """Modify the request headers"""
        # H and KD follow the naming in the Digest specification.
        H = lambda x: md5.new(x).hexdigest()
        KD = lambda s, d: H("%s:%s" % (s, d))
        A2 = "".join([method, ":", request_uri])
        self.challenge['cnonce'] = cnonce or _cnonce()
        request_digest = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (self.challenge['nonce'],
                    '%08x' % self.challenge['nc'],
                    self.challenge['cnonce'],
                    self.challenge['qop'], H(A2)
                    ))
        headers['Authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % (
                self.credentials[0],
                self.challenge['realm'],
                self.challenge['nonce'],
                request_uri,
                self.challenge['algorithm'],
                request_digest,
                self.challenge['qop'],
                self.challenge['nc'],
                self.challenge['cnonce'],
                )
        self.challenge['nc'] += 1

    def response(self, response, content):
        if not response.has_key('authentication-info'):
            # No Authentication-Info header: look for a stale-nonce
            # challenge, which asks us to retry with the new nonce.
            challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {})
            if 'true' == challenge.get('stale'):
                self.challenge['nonce'] = challenge['nonce']
                self.challenge['nc'] = 1
                return True
        else:
            # Adopt the server's nextnonce (if any) for subsequent requests.
            updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {})

            if updated_challenge.has_key('nextnonce'):
                self.challenge['nonce'] = updated_challenge['nextnonce']
                self.challenge['nc'] = 1
        return False
class HmacDigestAuthentication(Authentication):
    """Adapted from Robert Sayre's code and DigestAuthentication above."""
    __author__ = "Thomas Broyer (t.broyer@ltgt.net)"

    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['hmacdigest']
        # TODO: self.challenge['domain']
        self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
        if self.challenge['reason'] not in ['unauthorized', 'integrity']:
            self.challenge['reason'] = 'unauthorized'
        self.challenge['salt'] = self.challenge.get('salt', '')
        if not self.challenge.get('snonce'):
            raise UnimplementedHmacDigestAuthOptionError( _("The challenge doesn't contain a server nonce, or this one is empty."))
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1')
        if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']:
            raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
        self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1')
        if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']:
            raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for pw-algorithm: %s." % self.challenge['pw-algorithm']))
        # Pick hash modules for the request digest and the password hash.
        if self.challenge['algorithm'] == 'HMAC-MD5':
            self.hashmod = md5
        else:
            self.hashmod = sha
        if self.challenge['pw-algorithm'] == 'MD5':
            self.pwhashmod = md5
        else:
            self.pwhashmod = sha
        # key = pwhash(username ":" pwhash(password + salt) ":" realm),
        # all hex digests lower-cased.
        self.key = "".join([self.credentials[0], ":",
                    self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
                    ":", self.challenge['realm']
                    ])
        self.key = self.pwhashmod.new(self.key).hexdigest().lower()

    def request(self, method, request_uri, headers, content):
        """Modify the request headers"""
        # The digest covers the end-to-end headers, so the server can
        # verify they were not altered in transit.
        keys = _get_end2end_headers(headers)
        keylist = "".join(["%s " % k for k in keys])
        headers_val = "".join([headers[k] for k in keys])
        created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
        cnonce = _cnonce()
        request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
        request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
        headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
                self.credentials[0],
                self.challenge['realm'],
                self.challenge['snonce'],
                cnonce,
                request_uri,
                created,
                request_digest,
                keylist,
                )

    def response(self, response, content):
        # A reason of 'integrity' or 'stale' invites a retry.
        challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {})
        if challenge.get('reason') in ['integrity', 'stale']:
            return True
        return False
class WsseAuthentication(Authentication):
    """This is thinly tested and should not be relied upon.
    At this time there isn't any third party server to test against.
    Blogger and TypePad implemented this algorithm at one point
    but Blogger has since switched to Basic over HTTPS and
    TypePad has implemented it wrong, by never issuing a 401
    challenge but instead requiring your client to telepathically know that
    their endpoint is expecting WSSE profile="UsernameToken"."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Add the Authorization and X-WSSE headers for a WSSE
        UsernameToken built from a fresh cnonce and timestamp."""
        iso_now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        cnonce = _cnonce()
        password_digest = _wsse_username_token(cnonce, iso_now, self.credentials[1])
        headers['Authorization'] = 'WSSE profile="UsernameToken"'
        headers['X-WSSE'] = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % (
                self.credentials[0],
                password_digest,
                cnonce,
                iso_now)
class GoogleLoginAuthentication(Authentication):
    """Google ClientLogin: trades name/password for an Auth token at
    construction time (one POST to www.google.com), then sends that token
    in a GoogleLogin Authorization header."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        from urllib import urlencode
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        service = challenge['googlelogin'].get('service', 'xapi')
        # Blogger actually returns the service in the challenge
        # For the rest we guess based on the URI
        if service == 'xapi' and request_uri.find("calendar") > 0:
            service = "cl"
        # No point in guessing Base or Spreadsheet
        #elif request_uri.find("spreadsheets") > 0:
        #    service = "wise"

        auth = dict(Email=credentials[0], Passwd=credentials[1], service=service, source=headers['user-agent'])
        resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'})
        # Response body is key=value lines; the 'Auth' line is the token.
        lines = content.split('\n')
        d = dict([tuple(line.split("=", 1)) for line in lines if line])
        if resp.status == 403:
            # Login rejected: leave the token empty rather than failing here.
            self.Auth = ""
        else:
            self.Auth = d['Auth']

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        headers['authorization'] = 'GoogleLogin Auth=' + self.Auth
# Registry of supported WWW-Authenticate schemes (keys are the lower-cased
# scheme names produced by _parse_www_authenticate).
AUTH_SCHEME_CLASSES = {
    "basic": BasicAuthentication,
    "wsse": WsseAuthentication,
    "digest": DigestAuthentication,
    "hmacdigest": HmacDigestAuthentication,
    "googlelogin": GoogleLoginAuthentication
}

# Preference order in which offered schemes are tried.
AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]
def _md5(s):
    # NOTE(review): this helper returns None and is not referenced anywhere
    # in this chunk -- it looks vestigial (or truncated by extraction);
    # confirm against the full file before relying on it.
    return
class FileCache(object):
    """Uses a local directory as a store for cached files.
    Not really safe to use if multiple threads or processes are going to
    be running on the same cache.
    """
    def __init__(self, cache, safe=safename): # use safe=lambda x: md5.new(x).hexdigest() for the old behavior
        self.cache = cache  # directory path holding one file per entry
        self.safe = safe    # maps a cache key to a filesystem-safe name
        if not os.path.exists(cache):
            os.makedirs(self.cache)

    def get(self, key):
        """Return the cached value for *key*, or None if absent/unreadable."""
        retval = None
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        try:
            # Bug fix: open in binary mode -- cached entries are raw byte
            # blobs (status line + headers + body); text mode corrupts them
            # on platforms that translate newlines.
            f = file(cacheFullPath, "rb")
            retval = f.read()
            f.close()
        except IOError:
            pass
        return retval

    def set(self, key, value):
        """Store *value* under *key*, overwriting any existing entry."""
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        f = file(cacheFullPath, "wb")
        f.write(value)
        f.close()

    def delete(self, key):
        """Remove *key*'s entry if present; missing entries are ignored."""
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        if os.path.exists(cacheFullPath):
            os.remove(cacheFullPath)
class Credentials(object):
    """A pool of (domain, name, password) triples to try for authentication."""
    def __init__(self):
        self.credentials = []

    def add(self, name, password, domain=""):
        """Register *name*/*password*; an empty *domain* matches any host."""
        self.credentials.append((domain.lower(), name, password))

    def clear(self):
        """Forget every registered credential."""
        self.credentials = []

    def iter(self, domain):
        """Yield (name, password) pairs registered for *domain* or for all hosts."""
        for entry in self.credentials:
            stored_domain, name, password = entry
            if stored_domain in ("", domain):
                yield (name, password)
class KeyCerts(Credentials):
    """Identical to Credentials except that
    name/password are mapped to key/cert."""
    # Inherits add/clear/iter unchanged; only the meaning of the stored
    # pair differs.
    pass
class ProxyInfo(object):
    """Collect information required to use a proxy."""
    def __init__(self, proxy_type, proxy_host, proxy_port, proxy_rdns=None, proxy_user=None, proxy_pass=None):
        """The parameter proxy_type must be set to one of socks.PROXY_TYPE_XXX
        constants. For example:

        p = ProxyInfo(proxy_type=socks.PROXY_TYPE_HTTP, proxy_host='localhost', proxy_port=8000)
        """
        self.proxy_type = proxy_type
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port
        self.proxy_rdns = proxy_rdns
        self.proxy_user = proxy_user
        self.proxy_pass = proxy_pass

    def astuple(self):
        """Return the settings in the positional order socks.setproxy expects."""
        return (self.proxy_type, self.proxy_host, self.proxy_port, self.proxy_rdns,
                self.proxy_user, self.proxy_pass)

    def isgood(self):
        """Whether the socks module is available and host/port are both set."""
        return socks and (self.proxy_host != None) and (self.proxy_port != None)
class HTTPConnectionWithTimeout(httplib.HTTPConnection):
    """HTTPConnection subclass that supports timeouts"""

    def __init__(self, host, port=None, strict=None, timeout=None, proxy_info=None):
        httplib.HTTPConnection.__init__(self, host, port, strict)
        self.timeout = timeout          # seconds, or None for the OS default
        self.proxy_info = proxy_info    # ProxyInfo instance, or None

    def connect(self):
        """Connect to the host and port specified in __init__."""
        # Mostly verbatim from httplib.py.
        msg = "getaddrinfo returns an empty list"
        # Try each resolved address in turn until one connects.
        for res in socket.getaddrinfo(self.host, self.port, 0,
                socket.SOCK_STREAM):
            af, socktype, proto, canonname, sa = res
            try:
                if self.proxy_info and self.proxy_info.isgood():
                    # Route the connection through the configured SOCKS proxy.
                    self.sock = socks.socksocket(af, socktype, proto)
                    self.sock.setproxy(*self.proxy_info.astuple())
                else:
                    self.sock = socket.socket(af, socktype, proto)
                # Different from httplib: support timeouts.
                if self.timeout is not None:
                    self.sock.settimeout(self.timeout)
                # End of difference from httplib.
                if self.debuglevel > 0:
                    print "connect: (%s, %s)" % (self.host, self.port)
                self.sock.connect(sa)
            except socket.error, msg:
                if self.debuglevel > 0:
                    print 'connect fail:', (self.host, self.port)
                if self.sock:
                    self.sock.close()
                self.sock = None
                continue
            break
        if not self.sock:
            raise socket.error, msg
class HTTPSConnectionWithTimeout(httplib.HTTPSConnection):
    "This class allows communication via SSL."

    def __init__(self, host, port=None, key_file=None, cert_file=None,
                 strict=None, timeout=None, proxy_info=None):
        self.timeout = timeout          # seconds, or None for the OS default
        self.proxy_info = proxy_info    # ProxyInfo instance, or None
        httplib.HTTPSConnection.__init__(self, host, port=port, key_file=key_file,
                cert_file=cert_file, strict=strict)

    def connect(self):
        "Connect to a host on a given (SSL) port."

        if self.proxy_info and self.proxy_info.isgood():
            # Bug fix: the proxy branch called setproxy on self.sock/sock
            # before any socket existed; create the SOCKS socket first,
            # mirroring HTTPConnectionWithTimeout.connect().
            sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setproxy(*self.proxy_info.astuple())
        else:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        if self.timeout is not None:
            sock.settimeout(self.timeout)
        sock.connect((self.host, self.port))
        # Wrap the plain socket in SSL once the TCP connection is up.
        ssl = socket.ssl(sock, self.key_file, self.cert_file)
        self.sock = httplib.FakeSocket(sock, ssl)
743 class Http(object):
744 """An HTTP client that handles:
745 - all methods
746 - caching
747 - ETags
748 - compression,
749 - HTTPS
750 - Basic
751 - Digest
752 - WSSE
754 and more.
    def __init__(self, cache=None, timeout=None, proxy_info=None):
        """The value of proxy_info is a ProxyInfo instance.

        If 'cache' is a string then it is used as a directory name
        for a disk cache. Otherwise it must be an object that supports
        the same interface as FileCache."""
        self.proxy_info = proxy_info
        # Map domain name to an httplib connection
        self.connections = {}
        # The location of the cache, for now a directory
        # where cached responses are held.
        if cache and isinstance(cache, str):
            self.cache = FileCache(cache)
        else:
            self.cache = cache

        # Name/password
        self.credentials = Credentials()

        # Key/cert
        self.certificates = KeyCerts()

        # authorization objects
        self.authorizations = []

        # If set to False then no redirects are followed, even safe ones.
        self.follow_redirects = True

        # If 'follow_redirects' is True, and this is set to True then
        # all redirects are followed, including unsafe ones.
        self.follow_all_redirects = False

        # NOTE(review): presumably makes cache validation ignore ETags;
        # the consuming code is further down the file -- confirm there.
        self.ignore_etag = False

        # NOTE(review): presumably turns transport exceptions into synthetic
        # error responses; the consuming code is further down the file.
        self.force_exception_to_status_code = False

        # Socket timeout in seconds, handed to each new connection.
        self.timeout = timeout
794 def _auth_from_challenge(self, host, request_uri, headers, response, content):
795 """A generator that creates Authorization objects
796 that can be applied to requests.
798 challenges = _parse_www_authenticate(response, 'www-authenticate')
799 for cred in self.credentials.iter(host):
800 for scheme in AUTH_SCHEME_ORDER:
801 if challenges.has_key(scheme):
802 yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self)
    def add_credentials(self, name, password, domain=""):
        """Add a name and password that will be used
        any time a request requires authentication.

        An empty domain matches every host."""
        self.credentials.add(name, password, domain)

    def add_certificate(self, key, cert, domain):
        """Add a key and cert that will be used
        any time a request requires authentication."""
        self.certificates.add(key, cert, domain)

    def clear_credentials(self):
        """Remove all the names and passwords
        that are used for authentication"""
        self.credentials.clear()
        # Also drop cached Authorization objects: they embed old credentials.
        self.authorizations = []
    def _conn_request(self, conn, request_uri, method, body, headers):
        # Issue the request, retrying once after a reconnect if the first
        # attempt fails with an httplib error (e.g. a stale keep-alive socket).
        for i in range(2):
            try:
                conn.request(method, request_uri, body, headers)
                response = conn.getresponse()
            except socket.gaierror:
                # Name resolution failure: not retryable here.
                conn.close()
                raise ServerNotFoundError("Unable to find the server at %s" % conn.host)
            except httplib.HTTPException, e:
                if i == 0:
                    # First failure: reconnect and retry once.
                    conn.close()
                    conn.connect()
                    continue
                else:
                    raise
            else:
                content = response.read()
                response = Response(response)
                # HEAD responses carry no body, so skip decompression.
                if method != "HEAD":
                    content = _decompressContent(response, content)

                break;
        return (response, content)
845 def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey):
846 """Do the actual request using the connection object
847 and also follow one level of redirects if necessary"""
849 auths = [(auth.depth(request_uri), auth) for auth in self.authorizations if auth.inscope(host, request_uri)]
850 auth = auths and sorted(auths)[0][1] or None
851 if auth:
852 auth.request(method, request_uri, headers, body)
854 (response, content) = self._conn_request(conn, request_uri, method, body, headers)
856 if auth:
857 if auth.response(response, body):
858 auth.request(method, request_uri, headers, body)
859 (response, content) = self._conn_request(conn, request_uri, method, body, headers )
860 response._stale_digest = 1
862 if response.status == 401:
863 for authorization in self._auth_from_challenge(host, request_uri, headers, response, content):
864 authorization.request(method, request_uri, headers, body)
865 (response, content) = self._conn_request(conn, request_uri, method, body, headers, )
866 if response.status != 401:
867 self.authorizations.append(authorization)
868 authorization.response(response, body)
869 break
871 if (self.follow_all_redirects or (method in ["GET", "HEAD"]) or response.status == 303):
872 if self.follow_redirects and response.status in [300, 301, 302, 303, 307]:
873 # Pick out the location header and basically start from the beginning
874 # remembering first to strip the ETag header and decrement our 'depth'
875 if redirections:
876 if not response.has_key('location') and response.status != 300:
877 raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."), response, content)
878 # Fix-up relative redirects (which violate an RFC 2616 MUST)
879 if response.has_key('location'):
880 location = response['location']
881 (scheme, authority, path, query, fragment) = parse_uri(location)
882 if authority == None:
883 response['location'] = urlparse.urljoin(absolute_uri, location)
884 if response.status == 301 and method in ["GET", "HEAD"]:
885 response['-x-permanent-redirect-url'] = response['location']
886 if not response.has_key('content-location'):
887 response['content-location'] = absolute_uri
888 _updateCache(headers, response, content, self.cache, cachekey)
889 if headers.has_key('if-none-match'):
890 del headers['if-none-match']
891 if headers.has_key('if-modified-since'):
892 del headers['if-modified-since']
893 if response.has_key('location'):
894 location = response['location']
895 old_response = copy.deepcopy(response)
896 if not old_response.has_key('content-location'):
897 old_response['content-location'] = absolute_uri
898 redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
899 (response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
900 response.previous = old_response
901 else:
902 raise RedirectLimit( _("Redirected more times than rediection_limit allows."), response, content)
903 elif response.status in [200, 203] and method == "GET":
904 # Don't cache 206's since we aren't going to handle byte range requests
905 if not response.has_key('content-location'):
906 response['content-location'] = absolute_uri
907 _updateCache(headers, response, content, self.cache, cachekey)
909 return (response, content)
    # TODO: catch and rebrand some exceptions, then optionally turn all
    # exceptions -- including every socket.* and httplib.* exception --
    # into status codes.
917 def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None):
918 """ Performs a single HTTP request.
919 The 'uri' is the URI of the HTTP resource and can begin
920 with either 'http' or 'https'. The value of 'uri' must be an absolute URI.
922 The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
923 There is no restriction on the methods allowed.
925 The 'body' is the entity body to be sent with the request. It is a string
926 object.
928 Any extra headers that are to be sent with the request should be provided in the
929 'headers' dictionary.
931 The maximum number of redirect to follow before raising an
932 exception is 'redirections. The default is 5.
934 The return value is a tuple of (response, content), the first
935 being and instance of the 'Response' class, the second being
936 a string that contains the response entity body.
938 try:
939 if headers is None:
940 headers = {}
941 else:
942 headers = _normalize_headers(headers)
944 if not headers.has_key('user-agent'):
945 headers['user-agent'] = "Python-httplib2/%s" % __version__
947 uri = iri2uri(uri)
949 (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
951 conn_key = scheme+":"+authority
952 if conn_key in self.connections:
953 conn = self.connections[conn_key]
954 else:
955 if not connection_type:
956 connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout
957 certs = list(self.certificates.iter(authority))
958 if scheme == 'https' and certs:
959 conn = self.connections[conn_key] = connection_type(authority, key_file=certs[0][0],
960 cert_file=certs[0][1], timeout=self.timeout, proxy_info=self.proxy_info)
961 else:
962 conn = self.connections[conn_key] = connection_type(authority, timeout=self.timeout, proxy_info=self.proxy_info)
963 conn.set_debuglevel(debuglevel)
965 if method in ["GET", "HEAD"] and 'range' not in headers:
966 headers['accept-encoding'] = 'compress, gzip'
968 info = email.Message.Message()
969 cached_value = None
970 if self.cache:
971 cachekey = defrag_uri
972 cached_value = self.cache.get(cachekey)
973 if cached_value:
974 info = email.message_from_string(cached_value)
975 try:
976 content = cached_value.split('\r\n\r\n', 1)[1]
977 except IndexError:
978 self.cache.delete(cachekey)
979 cachekey = None
980 cached_value = None
981 else:
982 cachekey = None
984 if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
985 # http://www.w3.org/1999/04/Editing/
986 headers['if-match'] = info['etag']
988 if method not in ["GET", "HEAD"] and self.cache and cachekey:
989 # RFC 2616 Section 13.10
990 self.cache.delete(cachekey)
992 if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
993 if info.has_key('-x-permanent-redirect-url'):
994 # Should cached permanent redirects be counted in our redirection count? For now, yes.
995 (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
996 response.previous = Response(info)
997 response.previous.fromcache = True
998 else:
999 # Determine our course of action:
1000 # Is the cached entry fresh or stale?
1001 # Has the client requested a non-cached response?
1003 # There seems to be three possible answers:
1004 # 1. [FRESH] Return the cache entry w/o doing a GET
1005 # 2. [STALE] Do the GET (but add in cache validators if available)
1006 # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
1007 entry_disposition = _entry_disposition(info, headers)
1009 if entry_disposition == "FRESH":
1010 if not cached_value:
1011 info['status'] = '504'
1012 content = ""
1013 response = Response(info)
1014 if cached_value:
1015 response.fromcache = True
1016 return (response, content)
1018 if entry_disposition == "STALE":
1019 if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
1020 headers['if-none-match'] = info['etag']
1021 if info.has_key('last-modified') and not 'last-modified' in headers:
1022 headers['if-modified-since'] = info['last-modified']
1023 elif entry_disposition == "TRANSPARENT":
1024 pass
1026 (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
1028 if response.status == 304 and method == "GET":
1029 # Rewrite the cache entry with the new end-to-end headers
1030 # Take all headers that are in response
1031 # and overwrite their values in info.
1032 # unless they are hop-by-hop, or are listed in the connection header.
1034 for key in _get_end2end_headers(response):
1035 info[key] = response[key]
1036 merged_response = Response(info)
1037 if hasattr(response, "_stale_digest"):
1038 merged_response._stale_digest = response._stale_digest
1039 _updateCache(headers, merged_response, content, self.cache, cachekey)
1040 response = merged_response
1041 response.status = 200
1042 response.fromcache = True
1044 elif response.status == 200:
1045 content = new_content
1046 else:
1047 self.cache.delete(cachekey)
1048 content = new_content
1049 else:
1050 (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
1051 except Exception, e:
1052 if self.force_exception_to_status_code:
1053 if isinstance(e, HttpLib2ErrorWithResponse):
1054 response = e.response
1055 content = e.content
1056 response.status = 500
1057 response.reason = str(e)
1058 elif isinstance(e, socket.timeout):
1059 content = "Request Timeout"
1060 response = Response( {
1061 "content-type": "text/plain",
1062 "status": "408",
1063 "content-length": len(content)
1065 response.reason = "Request Timeout"
1066 else:
1067 content = str(e)
1068 response = Response( {
1069 "content-type": "text/plain",
1070 "status": "400",
1071 "content-length": len(content)
1073 response.reason = "Bad Request"
1074 else:
1075 raise
1078 return (response, content)
class Response(dict):
    """An object more like email.Message than httplib.HTTPResponse.

    Acts as a dict mapping lower-cased header names to values, with a
    few extra attributes describing the exchange itself.
    """

    # True when this response was served from the local cache.
    fromcache = False

    # HTTP protocol version used by server. 10 for HTTP/1.0, 11 for HTTP/1.1.
    version = 11

    # Status code returned by server.
    status = 200

    # Reason phrase returned by server.
    reason = "Ok"

    # The previous Response in a redirect chain, if any.
    previous = None

    def __init__(self, info):
        # 'info' is one of: an httplib.HTTPResponse, an email.Message,
        # or a plain mapping of header names to values.
        if isinstance(info, httplib.HTTPResponse):
            for key, value in info.getheaders():
                self[key] = value
            self.status = info.status
            self['status'] = str(self.status)
            self.reason = info.reason
            self.version = info.version
        elif isinstance(info, email.Message.Message):
            for key, value in info.items():
                self[key] = value
            self.status = int(self['status'])
        else:
            for key, value in info.iteritems():
                self[key] = value
            self.status = int(self.get('status', self.status))

    def __getattr__(self, name):
        # Expose 'response.dict' as an alias for the response itself,
        # mirroring the old mimetools message interface.
        if name != 'dict':
            raise AttributeError(name)
        return self