Lib/httplib.py

   1 """HTTP/1.1 client library
   2
   3 <intro stuff goes here>
   4 <other stuff, too>
   5
   6 HTTPConnection goes through a number of "states", which define when a client
   7 may legally make another request or fetch the response for a particular
   8 request. This diagram details these state transitions:
   9
  10     (null)
  11       |
  12       | HTTPConnection()
  13       v
  14     Idle
  15       |
  16       | putrequest()
  17       v
  18     Request-started
  19       |
  20       | ( putheader() )*  endheaders()
  21       v
  22     Request-sent
  23       |
  24       | response = getresponse()
  25       v
  26     Unread-response   [Response-headers-read]
  27       |\____________________
  28       |                     |
  29       | response.read()     | putrequest()
  30       v                     v
  31     Idle                  Req-started-unread-response
  32                      ______/|
  33                    /        |
  34    response.read() |        | ( putheader() )*  endheaders()
  35                    v        v
  36        Request-started    Req-sent-unread-response
  37                             |
  38                             | response.read()
  39                             v
  40                           Request-sent
  41
  42 This diagram presents the following rules:
  43   -- a second request may not be started until {response-headers-read}
  44   -- a response [object] cannot be retrieved until {request-sent}
  45   -- there is no differentiation between an unread response body and a
  46      partially read response body
  47
  48 Note: this enforcement is applied by the HTTPConnection class. The
  49       HTTPResponse class does not enforce this state machine, which
  50       implies sophisticated clients may accelerate the request/response
  51       pipeline. Caution should be taken, though: accelerating the states
  52       beyond the above pattern may imply knowledge of the server's
  53       connection-close behavior for certain requests. For example, it
  54       is impossible to tell whether the server will close the connection
  55       UNTIL the response headers have been read; this means that further
  56       requests cannot be placed into the pipeline until it is known that
  57       the server will NOT be closing the connection.
  58
  59 Logical State                  __state            __response
  60 -------------                  -------            ----------
  61 Idle                           _CS_IDLE           None
  62 Request-started                _CS_REQ_STARTED    None
  63 Request-sent                   _CS_REQ_SENT       None
  64 Unread-response                _CS_IDLE           <response_class>
  65 Req-started-unread-response    _CS_REQ_STARTED    <response_class>
  66 Req-sent-unread-response       _CS_REQ_SENT       <response_class>
  67 """
  68
  69 from array import array
  70 import socket
  71 from sys import py3kwarning
  72 from urlparse import urlsplit
  73 import warnings
  74 with warnings.catch_warnings():
  75     if py3kwarning:
  76         warnings.filterwarnings("ignore", ".*mimetools has been removed",
  77                                 DeprecationWarning)
  78     import mimetools
  79
  80 try:
  81     from cStringIO import StringIO
  82 except ImportError:
  83     from StringIO import StringIO
  84
# Names exported by "from httplib import *".
__all__ = ["HTTP", "HTTPResponse", "HTTPConnection",
           "HTTPException", "NotConnected", "UnknownProtocol",
           "UnknownTransferEncoding", "UnimplementedFileMode",
           "IncompleteRead", "InvalidURL", "ImproperConnectionState",
           "CannotSendRequest", "CannotSendHeader", "ResponseNotReady",
           "BadStatusLine", "error", "responses"]

# Port numbers; HTTP_PORT is the default_port of HTTPConnection.
HTTP_PORT = 80
HTTPS_PORT = 443

# Placeholder stored in HTTPResponse attributes (version, status, reason,
# chunked, chunk_left, length, will_close) when the object is created,
# i.e. before the response has been parsed.
_UNKNOWN = 'UNKNOWN'

# connection states
# These are the values HTTPConnection keeps in its private __state
# attribute.  The table at the end of the module docstring shows how each
# one combines with __response to form the logical states of the diagram.
_CS_IDLE = 'Idle'
_CS_REQ_STARTED = 'Request-started'
_CS_REQ_SENT = 'Request-sent'
 101
# status codes
# Symbolic names for numeric HTTP status codes, grouped by class (1xx-5xx).
# Note that several of the codes named here (102, 207, 226, 422, 423, 424,
# 426, 507 and 510) have no entry in the `responses` mapping defined below.
# informational
CONTINUE = 100
SWITCHING_PROTOCOLS = 101
PROCESSING = 102

# successful
OK = 200
CREATED = 201
ACCEPTED = 202
NON_AUTHORITATIVE_INFORMATION = 203
NO_CONTENT = 204
RESET_CONTENT = 205
PARTIAL_CONTENT = 206
MULTI_STATUS = 207
IM_USED = 226

# redirection
MULTIPLE_CHOICES = 300
MOVED_PERMANENTLY = 301
FOUND = 302
SEE_OTHER = 303
NOT_MODIFIED = 304
USE_PROXY = 305
TEMPORARY_REDIRECT = 307

# client error
BAD_REQUEST = 400
UNAUTHORIZED = 401
PAYMENT_REQUIRED = 402
FORBIDDEN = 403
NOT_FOUND = 404
METHOD_NOT_ALLOWED = 405
NOT_ACCEPTABLE = 406
PROXY_AUTHENTICATION_REQUIRED = 407
REQUEST_TIMEOUT = 408
CONFLICT = 409
GONE = 410
LENGTH_REQUIRED = 411
PRECONDITION_FAILED = 412
REQUEST_ENTITY_TOO_LARGE = 413
REQUEST_URI_TOO_LONG = 414
UNSUPPORTED_MEDIA_TYPE = 415
REQUESTED_RANGE_NOT_SATISFIABLE = 416
EXPECTATION_FAILED = 417
UNPROCESSABLE_ENTITY = 422
LOCKED = 423
FAILED_DEPENDENCY = 424
UPGRADE_REQUIRED = 426

# server error
INTERNAL_SERVER_ERROR = 500
NOT_IMPLEMENTED = 501
BAD_GATEWAY = 502
SERVICE_UNAVAILABLE = 503
GATEWAY_TIMEOUT = 504
HTTP_VERSION_NOT_SUPPORTED = 505
INSUFFICIENT_STORAGE = 507
NOT_EXTENDED = 510
 161
# Mapping status codes to official W3C names
# Keys are integer status codes, values are the reason phrases.  The table
# is not in one-to-one correspondence with the named constants above: 306
# appears here without a constant, while 102, 207, 226, 422, 423, 424, 426,
# 507 and 510 have constants but no entry, so look-ups for arbitrary codes
# should use responses.get(code) rather than responses[code].
responses = {
    100: 'Continue',
    101: 'Switching Protocols',

    200: 'OK',
    201: 'Created',
    202: 'Accepted',
    203: 'Non-Authoritative Information',
    204: 'No Content',
    205: 'Reset Content',
    206: 'Partial Content',

    300: 'Multiple Choices',
    301: 'Moved Permanently',
    302: 'Found',
    303: 'See Other',
    304: 'Not Modified',
    305: 'Use Proxy',
    306: '(Unused)',
    307: 'Temporary Redirect',

    400: 'Bad Request',
    401: 'Unauthorized',
    402: 'Payment Required',
    403: 'Forbidden',
    404: 'Not Found',
    405: 'Method Not Allowed',
    406: 'Not Acceptable',
    407: 'Proxy Authentication Required',
    408: 'Request Timeout',
    409: 'Conflict',
    410: 'Gone',
    411: 'Length Required',
    412: 'Precondition Failed',
    413: 'Request Entity Too Large',
    414: 'Request-URI Too Long',
    415: 'Unsupported Media Type',
    416: 'Requested Range Not Satisfiable',
    417: 'Expectation Failed',

    500: 'Internal Server Error',
    501: 'Not Implemented',
    502: 'Bad Gateway',
    503: 'Service Unavailable',
    504: 'Gateway Timeout',
    505: 'HTTP Version Not Supported',
}
 210
# maximal amount of data to read at one time in _safe_read
# (HTTPResponse._safe_read never asks the file object for more than this
# many bytes per read() call; 1048576 == 1024 * 1024)
MAXAMOUNT = 1048576
 213
 214 class HTTPMessage(mimetools.Message):
 215
 216     def addheader(self, key, value):
 217         """Add header for field key handling repeats."""
 218         prev = self.dict.get(key)
 219         if prev is None:
 220             self.dict[key] = value
 221         else:
 222             combined = ", ".join((prev, value))
 223             self.dict[key] = combined
 224
 225     def addcontinue(self, key, more):
 226         """Add more field data from a continuation line."""
 227         prev = self.dict[key]
 228         self.dict[key] = prev + "\n " + more
 229
 230     def readheaders(self):
 231         """Read header lines.
 232
 233         Read header lines up to the entirely blank line that terminates them.
 234         The (normally blank) line that ends the headers is skipped, but not
 235         included in the returned list.  If a non-header line ends the headers,
 236         (which is an error), an attempt is made to backspace over it; it is
 237         never included in the returned list.
 238
 239         The variable self.status is set to the empty string if all went well,
 240         otherwise it is an error message.  The variable self.headers is a
 241         completely uninterpreted list of lines contained in the header (so
 242         printing them will reproduce the header exactly as it appears in the
 243         file).
 244
 245         If multiple header fields with the same name occur, they are combined
 246         according to the rules in RFC 2616 sec 4.2:
 247
 248         Appending each subsequent field-value to the first, each separated
 249         by a comma. The order in which header fields with the same field-name
 250         are received is significant to the interpretation of the combined
 251         field value.
 252         """
 253         # XXX The implementation overrides the readheaders() method of
 254         # rfc822.Message.  The base class design isn't amenable to
 255         # customized behavior here so the method here is a copy of the
 256         # base class code with a few small changes.
 257
 258         self.dict = {}
 259         self.unixfrom = ''
 260         self.headers = hlist = []
 261         self.status = ''
 262         headerseen = ""
 263         firstline = 1
 264         startofline = unread = tell = None
 265         if hasattr(self.fp, 'unread'):
 266             unread = self.fp.unread
 267         elif self.seekable:
 268             tell = self.fp.tell
 269         while True:
 270             if tell:
 271                 try:
 272                     startofline = tell()
 273                 except IOError:
 274                     startofline = tell = None
 275                     self.seekable = 0
 276             line = self.fp.readline()
 277             if not line:
 278                 self.status = 'EOF in headers'
 279                 break
 280             # Skip unix From name time lines
 281             if firstline and line.startswith('From '):
 282                 self.unixfrom = self.unixfrom + line
 283                 continue
 284             firstline = 0
 285             if headerseen and line[0] in ' \t':
 286                 # XXX Not sure if continuation lines are handled properly
 287                 # for http and/or for repeating headers
 288                 # It's a continuation line.
 289                 hlist.append(line)
 290                 self.addcontinue(headerseen, line.strip())
 291                 continue
 292             elif self.iscomment(line):
 293                 # It's a comment.  Ignore it.
 294                 continue
 295             elif self.islast(line):
 296                 # Note! No pushback here!  The delimiter line gets eaten.
 297                 break
 298             headerseen = self.isheader(line)
 299             if headerseen:
 300                 # It's a legal header line, save it.
 301                 hlist.append(line)
 302                 self.addheader(headerseen, line[len(headerseen)+1:].strip())
 303                 continue
 304             else:
 305                 # It's not a header line; throw it back and stop here.
 306                 if not self.dict:
 307                     self.status = 'No headers'
 308                 else:
 309                     self.status = 'Non-header line where header expected'
 310                 # Try to undo the read.
 311                 if unread:
 312                     unread(line)
 313                 elif tell:
 314                     self.fp.seek(startofline)
 315                 else:
 316                     self.status = self.status + '; bad seek'
 317                 break
 318
 319 class HTTPResponse:
 320
 321     # strict: If true, raise BadStatusLine if the status line can't be
 322     # parsed as a valid HTTP/1.0 or 1.1 status line.  By default it is
 323     # false because it prevents clients from talking to HTTP/0.9
 324     # servers.  Note that a response with a sufficiently corrupted
 325     # status line will look like an HTTP/0.9 response.
 326
 327     # See RFC 2616 sec 19.6 and RFC 1945 sec 6 for details.
 328
 329     def __init__(self, sock, debuglevel=0, strict=0, method=None, buffering=False):
 330         if buffering:
 331             # The caller won't be using any sock.recv() calls, so buffering
 332             # is fine and recommended for performance.
 333             self.fp = sock.makefile('rb')
 334         else:
 335             # The buffer size is specified as zero, because the headers of
 336             # the response are read with readline().  If the reads were
 337             # buffered the readline() calls could consume some of the
 338             # response, which make be read via a recv() on the underlying
 339             # socket.
 340             self.fp = sock.makefile('rb', 0)
 341         self.debuglevel = debuglevel
 342         self.strict = strict
 343         self._method = method
 344
 345         self.msg = None
 346
 347         # from the Status-Line of the response
 348         self.version = _UNKNOWN # HTTP-Version
 349         self.status = _UNKNOWN  # Status-Code
 350         self.reason = _UNKNOWN  # Reason-Phrase
 351
 352         self.chunked = _UNKNOWN         # is "chunked" being used?
 353         self.chunk_left = _UNKNOWN      # bytes left to read in current chunk
 354         self.length = _UNKNOWN          # number of bytes left in response
 355         self.will_close = _UNKNOWN      # conn will close at end of response
 356
 357     def _read_status(self):
 358         # Initialize with Simple-Response defaults
 359         line = self.fp.readline()
 360         if self.debuglevel > 0:
 361             print "reply:", repr(line)
 362         if not line:
 363             # Presumably, the server closed the connection before
 364             # sending a valid response.
 365             raise BadStatusLine(line)
 366         try:
 367             [version, status, reason] = line.split(None, 2)
 368         except ValueError:
 369             try:
 370                 [version, status] = line.split(None, 1)
 371                 reason = ""
 372             except ValueError:
 373                 # empty version will cause next test to fail and status
 374                 # will be treated as 0.9 response.
 375                 version = ""
 376         if not version.startswith('HTTP/'):
 377             if self.strict:
 378                 self.close()
 379                 raise BadStatusLine(line)
 380             else:
 381                 # assume it's a Simple-Response from an 0.9 server
 382                 self.fp = LineAndFileWrapper(line, self.fp)
 383                 return "HTTP/0.9", 200, ""
 384
 385         # The status code is a three-digit number
 386         try:
 387             status = int(status)
 388             if status < 100 or status > 999:
 389                 raise BadStatusLine(line)
 390         except ValueError:
 391             raise BadStatusLine(line)
 392         return version, status, reason
 393
 394     def begin(self):
 395         if self.msg is not None:
 396             # we've already started reading the response
 397             return
 398
 399         # read until we get a non-100 response
 400         while True:
 401             version, status, reason = self._read_status()
 402             if status != CONTINUE:
 403                 break
 404             # skip the header from the 100 response
 405             while True:
 406                 skip = self.fp.readline().strip()
 407                 if not skip:
 408                     break
 409                 if self.debuglevel > 0:
 410                     print "header:", skip
 411
 412         self.status = status
 413         self.reason = reason.strip()
 414         if version == 'HTTP/1.0':
 415             self.version = 10
 416         elif version.startswith('HTTP/1.'):
 417             self.version = 11   # use HTTP/1.1 code for HTTP/1.x where x>=1
 418         elif version == 'HTTP/0.9':
 419             self.version = 9
 420         else:
 421             raise UnknownProtocol(version)
 422
 423         if self.version == 9:
 424             self.length = None
 425             self.chunked = 0
 426             self.will_close = 1
 427             self.msg = HTTPMessage(StringIO())
 428             return
 429
 430         self.msg = HTTPMessage(self.fp, 0)
 431         if self.debuglevel > 0:
 432             for hdr in self.msg.headers:
 433                 print "header:", hdr,
 434
 435         # don't let the msg keep an fp
 436         self.msg.fp = None
 437
 438         # are we using the chunked-style of transfer encoding?
 439         tr_enc = self.msg.getheader('transfer-encoding')
 440         if tr_enc and tr_enc.lower() == "chunked":
 441             self.chunked = 1
 442             self.chunk_left = None
 443         else:
 444             self.chunked = 0
 445
 446         # will the connection close at the end of the response?
 447         self.will_close = self._check_close()
 448
 449         # do we have a Content-Length?
 450         # NOTE: RFC 2616, S4.4, #3 says we ignore this if tr_enc is "chunked"
 451         length = self.msg.getheader('content-length')
 452         if length and not self.chunked:
 453             try:
 454                 self.length = int(length)
 455             except ValueError:
 456                 self.length = None
 457             else:
 458                 if self.length < 0:  # ignore nonsensical negative lengths
 459                     self.length = None
 460         else:
 461             self.length = None
 462
 463         # does the body have a fixed length? (of zero)
 464         if (status == NO_CONTENT or status == NOT_MODIFIED or
 465             100 <= status < 200 or      # 1xx codes
 466             self._method == 'HEAD'):
 467             self.length = 0
 468
 469         # if the connection remains open, and we aren't using chunked, and
 470         # a content-length was not provided, then assume that the connection
 471         # WILL close.
 472         if not self.will_close and \
 473            not self.chunked and \
 474            self.length is None:
 475             self.will_close = 1
 476
 477     def _check_close(self):
 478         conn = self.msg.getheader('connection')
 479         if self.version == 11:
 480             # An HTTP/1.1 proxy is assumed to stay open unless
 481             # explicitly closed.
 482             conn = self.msg.getheader('connection')
 483             if conn and "close" in conn.lower():
 484                 return True
 485             return False
 486
 487         # Some HTTP/1.0 implementations have support for persistent
 488         # connections, using rules different than HTTP/1.1.
 489
 490         # For older HTTP, Keep-Alive indicates persistent connection.
 491         if self.msg.getheader('keep-alive'):
 492             return False
 493
 494         # At least Akamai returns a "Connection: Keep-Alive" header,
 495         # which was supposed to be sent by the client.
 496         if conn and "keep-alive" in conn.lower():
 497             return False
 498
 499         # Proxy-Connection is a netscape hack.
 500         pconn = self.msg.getheader('proxy-connection')
 501         if pconn and "keep-alive" in pconn.lower():
 502             return False
 503
 504         # otherwise, assume it will close
 505         return True
 506
 507     def close(self):
 508         if self.fp:
 509             self.fp.close()
 510             self.fp = None
 511
 512     def isclosed(self):
 513         # NOTE: it is possible that we will not ever call self.close(). This
 514         #       case occurs when will_close is TRUE, length is None, and we
 515         #       read up to the last byte, but NOT past it.
 516         #
 517         # IMPLIES: if will_close is FALSE, then self.close() will ALWAYS be
 518         #          called, meaning self.isclosed() is meaningful.
 519         return self.fp is None
 520
 521     # XXX It would be nice to have readline and __iter__ for this, too.
 522
 523     def read(self, amt=None):
 524         if self.fp is None:
 525             return ''
 526
 527         if self.chunked:
 528             return self._read_chunked(amt)
 529
 530         if amt is None:
 531             # unbounded read
 532             if self.length is None:
 533                 s = self.fp.read()
 534             else:
 535                 s = self._safe_read(self.length)
 536                 self.length = 0
 537             self.close()        # we read everything
 538             return s
 539
 540         if self.length is not None:
 541             if amt > self.length:
 542                 # clip the read to the "end of response"
 543                 amt = self.length
 544
 545         # we do not use _safe_read() here because this may be a .will_close
 546         # connection, and the user is reading more bytes than will be provided
 547         # (for example, reading in 1k chunks)
 548         s = self.fp.read(amt)
 549         if self.length is not None:
 550             self.length -= len(s)
 551             if not self.length:
 552                 self.close()
 553         return s
 554
    def _read_chunked(self, amt):
        """Read body data that uses the "chunked" transfer encoding.

        With amt None, every remaining chunk is read, the trailer is
        discarded, the response is closed and the joined data is returned.
        With a number, reading stops as soon as amt bytes have been
        collected; the unread size of the current chunk is remembered in
        self.chunk_left for the next call.  Fewer than amt bytes are
        returned when the terminating zero-size chunk is reached first, in
        which case the response is closed as well.

        If a chunk-size line cannot be parsed as a hexadecimal number the
        response is closed and IncompleteRead is raised, carrying the data
        gathered so far in this call.
        """
        assert self.chunked != _UNKNOWN
        # None means "positioned at a chunk-size line"; otherwise it is the
        # number of bytes of the current chunk that are still unread.
        chunk_left = self.chunk_left
        value = []
        while True:
            if chunk_left is None:
                line = self.fp.readline()
                i = line.find(';')
                if i >= 0:
                    line = line[:i] # strip chunk-extensions
                try:
                    chunk_left = int(line, 16)
                except ValueError:
                    # close the connection as protocol synchronisation is
                    # probably lost
                    self.close()
                    raise IncompleteRead(''.join(value))
                if chunk_left == 0:
                    # zero-size chunk: end of the body
                    break
            if amt is None:
                value.append(self._safe_read(chunk_left))
            elif amt < chunk_left:
                # the request ends inside this chunk; remember the rest
                value.append(self._safe_read(amt))
                self.chunk_left = chunk_left - amt
                return ''.join(value)
            elif amt == chunk_left:
                # the request ends exactly at the end of this chunk
                value.append(self._safe_read(amt))
                self._safe_read(2)  # toss the CRLF at the end of the chunk
                self.chunk_left = None
                return ''.join(value)
            else:
                # the request extends past this chunk; take all of it and
                # carry on with the next one
                value.append(self._safe_read(chunk_left))
                amt -= chunk_left

            # we read the whole chunk, get another
            self._safe_read(2)      # toss the CRLF at the end of the chunk
            chunk_left = None

        # read and discard trailer up to the CRLF terminator
        ### note: we shouldn't have any trailers!
        while True:
            line = self.fp.readline()
            if not line:
                # a vanishingly small number of sites EOF without
                # sending the trailer
                break
            if line == '\r\n':
                break

        # we read everything; close the "file"
        self.close()

        return ''.join(value)
 608
 609     def _safe_read(self, amt):
 610         """Read the number of bytes requested, compensating for partial reads.
 611
 612         Normally, we have a blocking socket, but a read() can be interrupted
 613         by a signal (resulting in a partial read).
 614
 615         Note that we cannot distinguish between EOF and an interrupt when zero
 616         bytes have been read. IncompleteRead() will be raised in this
 617         situation.
 618
 619         This function should be used when <amt> bytes "should" be present for
 620         reading. If the bytes are truly not available (due to EOF), then the
 621         IncompleteRead exception can be used to detect the problem.
 622         """
 623         # NOTE(gps): As of svn r74426 socket._fileobject.read(x) will never
 624         # return less than x bytes unless EOF is encountered.  It now handles
 625         # signal interruptions (socket.error EINTR) internally.  This code
 626         # never caught that exception anyways.  It seems largely pointless.
 627         # self.fp.read(amt) will work fine.
 628         s = []
 629         while amt > 0:
 630             chunk = self.fp.read(min(amt, MAXAMOUNT))
 631             if not chunk:
 632                 raise IncompleteRead(''.join(s), amt)
 633             s.append(chunk)
 634             amt -= len(chunk)
 635         return ''.join(s)
 636
 637     def getheader(self, name, default=None):
 638         if self.msg is None:
 639             raise ResponseNotReady()
 640         return self.msg.getheader(name, default)
 641
 642     def getheaders(self):
 643         """Return list of (header, value) tuples."""
 644         if self.msg is None:
 645             raise ResponseNotReady()
 646         return self.msg.items()
 647
 648
 649 class HTTPConnection:
 650
 651     _http_vsn = 11
 652     _http_vsn_str = 'HTTP/1.1'
 653
 654     response_class = HTTPResponse
 655     default_port = HTTP_PORT
 656     auto_open = 1
 657     debuglevel = 0
 658     strict = 0
 659
 660     def __init__(self, host, port=None, strict=None,
 661                  timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
 662         self.timeout = timeout
 663         self.sock = None
 664         self._buffer = []
 665         self.__response = None
 666         self.__state = _CS_IDLE
 667         self._method = None
 668         self._tunnel_host = None
 669         self._tunnel_port = None
 670
 671         self._set_hostport(host, port)
 672         if strict is not None:
 673             self.strict = strict
 674
 675     def set_tunnel(self, host, port=None):
 676         """ Sets up the host and the port for the HTTP CONNECT Tunnelling."""
 677         self._tunnel_host = host
 678         self._tunnel_port = port
 679
 680     def _set_hostport(self, host, port):
 681         if port is None:
 682             i = host.rfind(':')
 683             j = host.rfind(']')         # ipv6 addresses have [...]
 684             if i > j:
 685                 try:
 686                     port = int(host[i+1:])
 687                 except ValueError:
 688                     raise InvalidURL("nonnumeric port: '%s'" % host[i+1:])
 689                 host = host[:i]
 690             else:
 691                 port = self.default_port
 692             if host and host[0] == '[' and host[-1] == ']':
 693                 host = host[1:-1]
 694         self.host = host
 695         self.port = port
 696
 697     def set_debuglevel(self, level):
 698         self.debuglevel = level
 699
 700     def _tunnel(self):
 701         self._set_hostport(self._tunnel_host, self._tunnel_port)
 702         self.send("CONNECT %s:%d HTTP/1.0\r\n\r\n" % (self.host, self.port))
 703         response = self.response_class(self.sock, strict = self.strict,
 704                                        method = self._method)
 705         (version, code, message) = response._read_status()
 706
 707         if code != 200:
 708             self.close()
 709             raise socket.error("Tunnel connection failed: %d %s" % (code,
 710                                                                     message.strip()))
 711         while True:
 712             line = response.fp.readline()
 713             if line == '\r\n': break
 714
 715
 716     def connect(self):
 717         """Connect to the host and port specified in __init__."""
 718         self.sock = socket.create_connection((self.host,self.port),
 719                                              self.timeout)
 720
 721         if self._tunnel_host:
 722             self._tunnel()
 723
 724     def close(self):
 725         """Close the connection to the HTTP server."""
 726         if self.sock:
 727             self.sock.close()   # close it manually... there may be other refs
 728             self.sock = None
 729         if self.__response:
 730             self.__response.close()
 731             self.__response = None
 732         self.__state = _CS_IDLE
 733
 734     def send(self, str):
 735         """Send `str' to the server."""
 736         if self.sock is None:
 737             if self.auto_open:
 738                 self.connect()
 739             else:
 740                 raise NotConnected()
 741
 742         # send the data to the server. if we get a broken pipe, then close
 743         # the socket. we want to reconnect when somebody tries to send again.
 744         #
 745         # NOTE: we DO propagate the error, though, because we cannot simply
 746         #       ignore the error... the caller will know if they can retry.
 747         if self.debuglevel > 0:
 748             print "send:", repr(str)
 749         try:
 750             blocksize=8192
 751             if hasattr(str,'read') and not isinstance(str, array):
 752                 if self.debuglevel > 0: print "sendIng a read()able"
 753                 data=str.read(blocksize)
 754                 while data:
 755                     self.sock.sendall(data)
 756                     data=str.read(blocksize)
 757             else:
 758                 self.sock.sendall(str)
 759         except socket.error, v:
 760             if v.args[0] == 32:      # Broken pipe
 761                 self.close()
 762             raise
 763
 764     def _output(self, s):
 765         """Add a line of output to the current request buffer.
 766
 767         Assumes that the line does *not* end with \\r\\n.
 768         """
 769         self._buffer.append(s)
 770
 771     def _send_output(self, message_body=None):
 772         """Send the currently buffered request and clear the buffer.
 773
 774         Appends an extra \\r\\n to the buffer.
 775         A message_body may be specified, to be appended to the request.
 776         """
 777         self._buffer.extend(("", ""))
 778         msg = "\r\n".join(self._buffer)
 779         del self._buffer[:]
 780         # If msg and message_body are sent in a single send() call,
 781         # it will avoid performance problems caused by the interaction
 782         # between delayed ack and the Nagle algorithim.
 783         if isinstance(message_body, str):
 784             msg += message_body
 785             message_body = None
 786         self.send(msg)
 787         if message_body is not None:
 788             #message_body was not a string (i.e. it is a file) and
 789             #we must run the risk of Nagle
 790             self.send(message_body)
 791
 792     def putrequest(self, method, url, skip_host=0, skip_accept_encoding=0):
 793         """Send a request to the server.
 794
 795         `method' specifies an HTTP request method, e.g. 'GET'.
 796         `url' specifies the object being requested, e.g. '/index.html'.
 797         `skip_host' if True does not add automatically a 'Host:' header
 798         `skip_accept_encoding' if True does not add automatically an
 799            'Accept-Encoding:' header
 800         """
 801
 802         # if a prior response has been completed, then forget about it.
 803         if self.__response and self.__response.isclosed():
 804             self.__response = None
 805
 806
 807         # in certain cases, we cannot issue another request on this connection.
 808         # this occurs when:
 809         #   1) we are in the process of sending a request.   (_CS_REQ_STARTED)
 810         #   2) a response to a previous request has signalled that it is going
 811         #      to close the connection upon completion.
 812         #   3) the headers for the previous response have not been read, thus
 813         #      we cannot determine whether point (2) is true.   (_CS_REQ_SENT)
 814         #
 815         # if there is no prior response, then we can request at will.
 816         #
 817         # if point (2) is true, then we will have passed the socket to the
 818         # response (effectively meaning, "there is no prior response"), and
 819         # will open a new one when a new request is made.
 820         #
 821         # Note: if a prior response exists, then we *can* start a new request.
 822         #       We are not allowed to begin fetching the response to this new
 823         #       request, however, until that prior response is complete.
 824         #
 825         if self.__state == _CS_IDLE:
 826             self.__state = _CS_REQ_STARTED
 827         else:
 828             raise CannotSendRequest()
 829
 830         # Save the method we use, we need it later in the response phase
 831         self._method = method
 832         if not url:
 833             url = '/'
 834         str = '%s %s %s' % (method, url, self._http_vsn_str)
 835
 836         self._output(str)
 837
 838         if self._http_vsn == 11:
 839             # Issue some standard headers for better HTTP/1.1 compliance
 840
 841             if not skip_host:
 842                 # this header is issued *only* for HTTP/1.1
 843                 # connections. more specifically, this means it is
 844                 # only issued when the client uses the new
 845                 # HTTPConnection() class. backwards-compat clients
 846                 # will be using HTTP/1.0 and those clients may be
 847                 # issuing this header themselves. we should NOT issue
 848                 # it twice; some web servers (such as Apache) barf
 849                 # when they see two Host: headers
 850
 851                 # If we need a non-standard port,include it in the
 852                 # header.  If the request is going through a proxy,
 853                 # but the host of the actual URL, not the host of the
 854                 # proxy.
 855
 856                 netloc = ''
 857                 if url.startswith('http'):
 858                     nil, netloc, nil, nil, nil = urlsplit(url)
 859
 860                 if netloc:
 861                     try:
 862                         netloc_enc = netloc.encode("ascii")
 863                     except UnicodeEncodeError:
 864                         netloc_enc = netloc.encode("idna")
 865                     self.putheader('Host', netloc_enc)
 866                 else:
 867                     try:
 868                         host_enc = self.host.encode("ascii")
 869                     except UnicodeEncodeError:
 870                         host_enc = self.host.encode("idna")
 871                     if self.port == self.default_port:
 872                         self.putheader('Host', host_enc)
 873                     else:
 874                         self.putheader('Host', "%s:%s" % (host_enc, self.port))
 875
 876             # note: we are assuming that clients will not attempt to set these
 877             #       headers since *this* library must deal with the
 878             #       consequences. this also means that when the supporting
 879             #       libraries are updated to recognize other forms, then this
 880             #       code should be changed (removed or updated).
 881
 882             # we only want a Content-Encoding of "identity" since we don't
 883             # support encodings such as x-gzip or x-deflate.
 884             if not skip_accept_encoding:
 885                 self.putheader('Accept-Encoding', 'identity')
 886
 887             # we can accept "chunked" Transfer-Encodings, but no others
 888             # NOTE: no TE header implies *only* "chunked"
 889             #self.putheader('TE', 'chunked')
 890
 891             # if TE is supplied in the header, then it must appear in a
 892             # Connection header.
 893             #self.putheader('Connection', 'TE')
 894
 895         else:
 896             # For HTTP/1.0, the server will assume "not chunked"
 897             pass
 898
 899     def putheader(self, header, *values):
 900         """Send a request header line to the server.
 901
 902         For example: h.putheader('Accept', 'text/html')
 903         """
 904         if self.__state != _CS_REQ_STARTED:
 905             raise CannotSendHeader()
 906
 907         str = '%s: %s' % (header, '\r\n\t'.join(values))
 908         self._output(str)
 909
 910     def endheaders(self, message_body=None):
 911         """Indicate that the last header line has been sent to the server.
 912
 913         This method sends the request to the server.  The optional
 914         message_body argument can be used to pass message body
 915         associated with the request.  The message body will be sent in
 916         the same packet as the message headers if possible.  The
 917         message_body should be a string.
 918         """
 919         if self.__state == _CS_REQ_STARTED:
 920             self.__state = _CS_REQ_SENT
 921         else:
 922             raise CannotSendHeader()
 923         self._send_output(message_body)
 924
 925     def request(self, method, url, body=None, headers={}):
 926         """Send a complete request to the server."""
 927
 928         try:
 929             self._send_request(method, url, body, headers)
 930         except socket.error, v:
 931             # trap 'Broken pipe' if we're allowed to automatically reconnect
 932             if v.args[0] != 32 or not self.auto_open:
 933                 raise
 934             # try one more time
 935             self._send_request(method, url, body, headers)
 936
 937     def _set_content_length(self, body):
 938         # Set the content-length based on the body.
 939         thelen = None
 940         try:
 941             thelen = str(len(body))
 942         except TypeError, te:
 943             # If this is a file-like object, try to
 944             # fstat its file descriptor
 945             import os
 946             try:
 947                 thelen = str(os.fstat(body.fileno()).st_size)
 948             except (AttributeError, OSError):
 949                 # Don't send a length if this failed
 950                 if self.debuglevel > 0: print "Cannot stat!!"
 951
 952         if thelen is not None:
 953             self.putheader('Content-Length', thelen)
 954
 955     def _send_request(self, method, url, body, headers):
 956         # honour explicitly requested Host: and Accept-Encoding headers
 957         header_names = dict.fromkeys([k.lower() for k in headers])
 958         skips = {}
 959         if 'host' in header_names:
 960             skips['skip_host'] = 1
 961         if 'accept-encoding' in header_names:
 962             skips['skip_accept_encoding'] = 1
 963
 964         self.putrequest(method, url, **skips)
 965
 966         if body and ('content-length' not in header_names):
 967             self._set_content_length(body)
 968         for hdr, value in headers.iteritems():
 969             self.putheader(hdr, value)
 970         self.endheaders(body)
 971
 972     def getresponse(self, buffering=False):
 973         "Get the response from the server."
 974
 975         # if a prior response has been completed, then forget about it.
 976         if self.__response and self.__response.isclosed():
 977             self.__response = None
 978
 979         #
 980         # if a prior response exists, then it must be completed (otherwise, we
 981         # cannot read this response's header to determine the connection-close
 982         # behavior)
 983         #
 984         # note: if a prior response existed, but was connection-close, then the
 985         # socket and response were made independent of this HTTPConnection
 986         # object since a new request requires that we open a whole new
 987         # connection
 988         #
 989         # this means the prior response had one of two states:
 990         #   1) will_close: this connection was reset and the prior socket and
 991         #                  response operate independently
 992         #   2) persistent: the response was retained and we await its
 993         #                  isclosed() status to become true.
 994         #
 995         if self.__state != _CS_REQ_SENT or self.__response:
 996             raise ResponseNotReady()
 997
 998         args = (self.sock,)
 999         kwds = {"strict":self.strict, "method":self._method}
1000         if self.debuglevel > 0:
1001             args += (self.debuglevel,)
1002         if buffering:
1003             #only add this keyword if non-default, for compatibility with
1004             #other response_classes.
1005             kwds["buffering"] = True;
1006         response = self.response_class(*args, **kwds)
1007
1008         response.begin()
1009         assert response.will_close != _UNKNOWN
1010         self.__state = _CS_IDLE
1011
1012         if response.will_close:
1013             # this effectively passes the connection to the response
1014             self.close()
1015         else:
1016             # remember this, so we can tell when it is complete
1017             self.__response = response
1018
1019         return response
1020
1021
class HTTP:
    "Compatibility class with httplib.py from 1.5."

    # Instances speak HTTP/1.0; _setup() copies these onto the wrapped
    # connection object.
    _http_vsn = 10
    _http_vsn_str = 'HTTP/1.0'

    debuglevel = 0

    # Connection class instantiated by __init__ (overridden by HTTPS).
    _connection_class = HTTPConnection

    def __init__(self, host='', port=None, strict=None):
        "Provide a default host, since the superclass requires one."

        # some joker passed 0 explicitly, meaning default port
        if port == 0:
            port = None

        # Note that we may pass an empty string as the host; this will throw
        # an error when we attempt to connect. Presumably, the client code
        # will call connect before then, with a proper host.
        self._setup(self._connection_class(host, port, strict))

    def _setup(self, conn):
        "Wrap `conn' and expose its request-building methods on self."
        self._conn = conn

        # set up delegation to flesh out interface
        self.send = conn.send
        self.putrequest = conn.putrequest
        self.putheader = conn.putheader
        self.endheaders = conn.endheaders
        self.set_debuglevel = conn.set_debuglevel

        # make the wrapped connection use this class's protocol version
        conn._http_vsn = self._http_vsn
        conn._http_vsn_str = self._http_vsn_str

        self.file = None

    def connect(self, host=None, port=None):
        "Accept arguments to set the host/port, since the superclass doesn't."

        if host is not None:
            self._conn._set_hostport(host, port)
        self._conn.connect()

    def getfile(self):
        "Provide a getfile, since the superclass' does not use this concept."
        return self.file

    def getreply(self, buffering=False):
        """Compat definition since superclass does not define it.

        Returns a tuple consisting of:
        - server status code (e.g. '200' if all goes well)
        - server "reason" corresponding to status code
        - any RFC822 headers in the response from the server

        If the server's status line cannot be parsed (BadStatusLine), the
        connection is closed and (-1, <offending line>, None) is returned;
        the data received from the server stays readable through getfile().
        """
        try:
            if not buffering:
                response = self._conn.getresponse()
            else:
                #only add this keyword if non-default for compatibility
                #with other connection classes
                response = self._conn.getresponse(buffering)
        except BadStatusLine as e:
            ### hmm. if getresponse() ever closes the socket on a bad request,
            ### then we are going to have problems with self.sock

            ### should we keep this behavior? do people use it?
            # keep the socket open (as a file), and return it
            raw_file = self._conn.sock.makefile('rb', 0)

            # close our socket -- we want to restart after any protocol error
            self.close()

            # close() resets self.file to None, so the file must be stored
            # only afterwards; otherwise getfile() could never return it.
            self.file = raw_file
            self.headers = None
            return -1, e.line, None

        self.headers = response.msg
        self.file = response.fp
        return response.status, response.reason, response.msg

    def close(self):
        "Close the wrapped connection and drop the reference to the file."
        self._conn.close()

        # note that self.file == response.fp, which gets closed by the
        # superclass. just clear the object ref here.
        ### hmm. messy. if status==-1, then self.file is owned by us.
        ### well... we aren't explicitly closing, but losing this ref will
        ### do it
        self.file = None
1112
1113 try:
1114     import ssl
1115 except ImportError:
1116     pass
1117 else:
1118     class HTTPSConnection(HTTPConnection):
1119         "This class allows communication via SSL."
1120
1121         default_port = HTTPS_PORT
1122
1123         def __init__(self, host, port=None, key_file=None, cert_file=None,
1124                      strict=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
1125             HTTPConnection.__init__(self, host, port, strict, timeout)
1126             self.key_file = key_file
1127             self.cert_file = cert_file
1128
1129         def connect(self):
1130             "Connect to a host on a given (SSL) port."
1131
1132             sock = socket.create_connection((self.host, self.port), self.timeout)
1133             if self._tunnel_host:
1134                 self.sock = sock
1135                 self._tunnel()
1136             self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
1137
1138     __all__.append("HTTPSConnection")
1139
1140     class HTTPS(HTTP):
1141         """Compatibility with 1.5 httplib interface
1142
1143         Python 1.5.2 did not have an HTTPS class, but it defined an
1144         interface for sending http requests that is also useful for
1145         https.
1146         """
1147
1148         _connection_class = HTTPSConnection
1149
1150         def __init__(self, host='', port=None, key_file=None, cert_file=None,
1151                      strict=None):
1152             # provide a default host, pass the X509 cert info
1153
1154             # urf. compensate for bad input.
1155             if port == 0:
1156                 port = None
1157             self._setup(self._connection_class(host, port, key_file,
1158                                                cert_file, strict))
1159
1160             # we never actually use these for anything, but we keep them
1161             # here for compatibility with post-1.5.2 CVS.
1162             self.key_file = key_file
1163             self.cert_file = cert_file
1164
1165
1166     def FakeSocket (sock, sslobj):
1167         warnings.warn("FakeSocket is deprecated, and won't be in 3.x.  " +
1168                       "Use the result of ssl.wrap_socket() directly instead.",
1169                       DeprecationWarning, stacklevel=2)
1170         return sslobj
1171
1172
class HTTPException(Exception):
    """Common base class of the exception classes defined below."""
    # Subclasses that define an __init__ must call Exception.__init__
    # or define self.args.  Otherwise, str() will fail.
1177
class NotConnected(HTTPException):
    """HTTPException subclass; presumably signals use of a connection
    that is not open (its raise sites are outside this section)."""
1180
class InvalidURL(HTTPException):
    """HTTPException subclass; presumably signals a malformed URL or
    host specification (its raise sites are outside this section)."""
1183
class UnknownProtocol(HTTPException):
    """HTTPException carrying an unrecognised protocol version.

    The value is available as the `version' attribute and as the sole
    element of `args'.
    """

    def __init__(self, version):
        # Exception.__init__ is not called, so args is set by hand.
        self.version = version
        self.args = (version,)
1188
class UnknownTransferEncoding(HTTPException):
    """HTTPException subclass; presumably signals an unsupported
    Transfer-Encoding (its raise sites are outside this section)."""
1191
class UnimplementedFileMode(HTTPException):
    """HTTPException subclass; presumably signals an unsupported file
    mode (its raise sites are outside this section)."""
1194
class IncompleteRead(HTTPException):
    """HTTPException describing a read that ended early.

    `partial' holds the data that was read (it is also the sole element
    of `args'); `expected', when not None, is the number of additional
    bytes that were expected.
    """

    def __init__(self, partial, expected=None):
        # Exception.__init__ is not called, so args is set by hand.
        self.args = (partial,)
        self.partial = partial
        self.expected = expected

    def __repr__(self):
        if self.expected is None:
            suffix = ''
        else:
            suffix = ', %i more expected' % self.expected
        return 'IncompleteRead(%i bytes read%s)' % (len(self.partial), suffix)

    def __str__(self):
        # str() deliberately shows the same summary as repr().
        return repr(self)
1208
class ImproperConnectionState(HTTPException):
    """Base class of the errors raised when a connection method is
    called in a state that does not allow it (see the subclasses)."""
1211
class CannotSendRequest(ImproperConnectionState):
    """Raised by putrequest() when the connection is not idle."""
1214
class CannotSendHeader(ImproperConnectionState):
    """Raised by putheader() and endheaders() when no request is
    currently being built."""
1217
class ResponseNotReady(ImproperConnectionState):
    """Raised by getresponse() when no request has been sent or an
    earlier response is still pending."""
1220
class BadStatusLine(HTTPException):
    """HTTPException carrying a status line that could not be handled.

    The line is available as the `line' attribute and as the sole
    element of `args'.
    """

    def __init__(self, line):
        # Exception.__init__ is not called, so args is set by hand.
        self.line = line
        self.args = (line,)
1225
# for backwards compatibility: keep the module-level name `error' as an
# alias of HTTPException
error = HTTPException
1228
class LineAndFileWrapper:
    """A limited file-like object for HTTP/0.9 responses.

    The wrapper first replays `line' (text that was already read from
    `file') and then continues with the contents of `file' itself.  Any
    attribute the wrapper does not define is looked up on `file'.
    """

    # The status-line parsing code calls readline(), which normally
    # get the HTTP status line.  For a 0.9 response, however, this is
    # actually the first line of the body!  Clients need to get a
    # readable file object that contains that line.

    def __init__(self, line, file):
        self._line = line
        self._file = file
        # An empty line has nothing to replay, so every read goes to the
        # underlying file right from the start.
        self._line_consumed = int(not line)
        self._line_offset = 0
        self._line_left = len(line)

    def __getattr__(self, attr):
        return getattr(self._file, attr)

    def _done(self):
        # called when the last byte is read from the line.  After the
        # call, all read methods are delegated to the underlying file
        # object.
        self._line_consumed = 1
        self.read = self._file.read
        self.readline = self._file.readline
        self.readlines = self._file.readlines

    def read(self, amt=None):
        """Read up to amt bytes; None or a negative amt reads everything."""
        if self._line_consumed:
            # Only reached when the line was empty or through a bound
            # method obtained before _done() rebound the attribute.  Not
            # every file object accepts read(None), so omit the argument.
            if amt is None:
                return self._file.read()
            return self._file.read(amt)

        # Like file.read(), treat a negative size as "read to the end"; it
        # must never reach the offset bookkeeping below.
        read_all = amt is None or amt < 0
        if read_all or amt > self._line_left:
            s = self._line[self._line_offset:]
            self._done()
            if read_all:
                return s + self._file.read()
            return s + self._file.read(amt - len(s))

        # The request can be served from the saved line alone.
        start = self._line_offset
        end = start + amt
        s = self._line[start:end]
        self._line_offset = end
        self._line_left -= amt
        if self._line_left == 0:
            self._done()
        return s

    def readline(self):
        """Return the rest of the saved line, then lines from the file."""
        if self._line_consumed:
            return self._file.readline()
        s = self._line[self._line_offset:]
        self._done()
        return s

    def readlines(self, size=None):
        """Return the rest of the saved line plus the file's lines."""
        if self._line_consumed:
            if size is None:
                return self._file.readlines()
            return self._file.readlines(size)
        L = [self._line[self._line_offset:]]
        self._done()
        if size is None:
            return L + self._file.readlines()
        else:
            return L + self._file.readlines(size)
1296
1297 def test():
1298     """Test this module.
1299
1300     A hodge podge of tests collected here, because they have too many
1301     external dependencies for the regular test suite.
1302     """
1303
1304     import sys
1305     import getopt
1306     opts, args = getopt.getopt(sys.argv[1:], 'd')
1307     dl = 0
1308     for o, a in opts:
1309         if o == '-d': dl = dl + 1
1310     host = 'www.python.org'
1311     selector = '/'
1312     if args[0:]: host = args[0]
1313     if args[1:]: selector = args[1]
1314     h = HTTP()
1315     h.set_debuglevel(dl)
1316     h.connect(host)
1317     h.putrequest('GET', selector)
1318     h.endheaders()
1319     status, reason, headers = h.getreply()
1320     print 'status =', status
1321     print 'reason =', reason
1322     print "read", len(h.getfile().read())
1323     print
1324     if headers:
1325         for header in headers.headers: print header.strip()
1326     print
1327
1328     # minimal test that code to extract host from url works
1329     class HTTP11(HTTP):
1330         _http_vsn = 11
1331         _http_vsn_str = 'HTTP/1.1'
1332
1333     h = HTTP11('www.python.org')
1334     h.putrequest('GET', 'http://www.python.org/~jeremy/')
1335     h.endheaders()
1336     h.getreply()
1337     h.close()
1338
1339     try:
1340         import ssl
1341     except ImportError:
1342         pass
1343     else:
1344
1345         for host, selector in (('sourceforge.net', '/projects/python'),
1346                                ):
1347             print "https://%s%s" % (host, selector)
1348             hs = HTTPS()
1349             hs.set_debuglevel(dl)
1350             hs.connect(host)
1351             hs.putrequest('GET', selector)
1352             hs.endheaders()
1353             status, reason, headers = hs.getreply()
1354             print 'status =', status
1355             print 'reason =', reason
1356             print "read", len(hs.getfile().read())
1357             print
1358             if headers:
1359                 for header in headers.headers: print header.strip()
1360             print
1361
# Run the network-dependent self-test when the module is executed as a
# script.
if __name__ == '__main__':
    test()