3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
23 """Stub version of the urlfetch API, based on httplib."""
# Feature-detection globals for the optional fancy_urllib library, which
# supplies SSL certificate validation.  The flags default to "unavailable"
# and are overwritten when the import succeeds.
# NOTE(review): this extract appears to be the body of a
# try/except ImportError around "import fancy_urllib"; the try/except and
# import lines themselves are not visible here -- confirm against the
# original file.
_successfully_imported_fancy_urllib = False
_fancy_urllib_InvalidCertException = None
_fancy_urllib_SSLError = None
# Set when fancy_urllib imported successfully: record the exception types
# it exposes so later except clauses can reference them even when the
# import failed.
_successfully_imported_fancy_urllib = True
_fancy_urllib_InvalidCertException = fancy_urllib.InvalidCertificateException
_fancy_urllib_SSLError = fancy_urllib.SSLError
49 from google
.appengine
.api
import apiproxy_stub
50 from google
.appengine
.api
import urlfetch
51 from google
.appengine
.api
import urlfetch_errors
52 from google
.appengine
.api
import urlfetch_service_pb
53 from google
.appengine
.runtime
import apiproxy_errors
# Maximum accepted request size: 10 MB.
MAX_REQUEST_SIZE = 10 << 20

# Maximum response size returned to the caller: 32 MB (2 ** 25); longer
# bodies are truncated and contentwastruncated is set (see _RetrieveURL).
MAX_RESPONSE_SIZE = 2 ** 25

# Redirect-following cap, shared with the public urlfetch API.
MAX_REDIRECTS = urlfetch.MAX_REDIRECTS

# HTTP status codes treated as redirects when follow_redirects is enabled.
# NOTE(review): this extract is truncated -- members between
# MOVED_PERMANENTLY and TEMPORARY_REDIRECT and the closing "])" are not
# visible here.
REDIRECT_STATUSES = frozenset([
    httplib.MOVED_PERMANENTLY,
    httplib.TEMPORARY_REDIRECT,

# Default deadline, in seconds, when the request does not set one.
_API_CALL_DEADLINE = 5.0

# Default for whether HTTPS certificates must validate.
_API_CALL_VALIDATE_CERTIFICATE_DEFAULT = False

# httplib connection classes accept a timeout kwarg only on Python >= 2.6;
# older runtimes fall back to socket.setdefaulttimeout (see _RetrieveURL).
_CONNECTION_SUPPORTS_TIMEOUT = sys.version_info >= (2, 6)

# Lowercase header names the application is not allowed to set itself;
# stripped in _SanitizeHttpHeaders.
# NOTE(review): the member list and closing "])" are not visible in this
# extract.
_UNTRUSTED_REQUEST_HEADERS = frozenset([

# URLs whose length is >= this limit are rejected with INVALID_URL.
_MAX_URL_LENGTH = 2048
def _CanValidateCerts():
  """Returns True if SSL certificate validation is available.

  Validation requires that the optional fancy_urllib library was imported
  successfully and that it reports certificate-validation support on this
  runtime.
  """
  if not _successfully_imported_fancy_urllib:
    return False
  return fancy_urllib.can_validate_certs()
# NOTE(review): this span is the interior of a CA-certificate setup helper
# (its "def" line, presumably "def _SetupSSL(path):", and the bodies of the
# branches below are not visible in this extract).
  # Check whether the CA-certificate bundle exists at the given path.
  if os.path.exists(path):
  # Warn that certificate validation will be unavailable without the bundle.
  logging.warning('%s missing; without this urlfetch will not be able to '
                  'validate SSL certificates.', path)
  # Also warn when no ssl support is available at all.
  if not _CanValidateCerts():
    logging.warning('No ssl package found. urlfetch will not be able to '
                    'validate SSL certificates.')
# Point SSL setup at the CA bundle shipped with the SDK, resolved relative
# to this file's directory.
_SetupSSL(os.path.normpath(os.path.join(os.path.dirname(__file__), '..', '..',
                                        '..', 'lib', 'cacerts',
                                        'urlfetch_cacerts.txt')))
def _IsAllowedPort(port):
  # Returns whether the given port may be fetched in production.
  # NOTE(review): this extract is missing several original lines -- the
  # try block that this except belongs to (presumably int(port)), the
  # early returns, and the tail of the condition below.  Visible statements
  # are kept as-is.
  except ValueError, e:
  # Production allows the standard web port ranges (80-90, 440-450); the
  # remainder of the condition is not visible here.
  if ((port >= 80 and port <= 90) or
      (port >= 440 and port <= 450) or
class URLFetchServiceStub(apiproxy_stub.APIProxyStub):
  """Stub version of the urlfetch API to be used with apiproxy_stub_map."""
  # NOTE(review): the "def __init__(self," line is not visible in this
  # extract; the lines below are the remaining parameter defaults and body
  # of the constructor.
               service_name='urlfetch',
               urlmatchers_to_fetch_functions=None):
    """Initializer.

    Args:
      service_name: Service name expected for all calls.
      urlmatchers_to_fetch_functions: A list of two-element tuples.
        The first element is a urlmatcher predicate function that takes
        a url and determines a match. The second is a function that
        can retrieve result for that url. If no match is found, a url is
        handled by the default _RetrieveURL function.
        When more than one match is possible, the first match is used.
    """
    # Enforce the 10 MB request cap at the APIProxyStub layer.
    super(URLFetchServiceStub, self).__init__(service_name,
                                              max_request_size=MAX_REQUEST_SIZE)
    self._urlmatchers_to_fetch_functions = urlmatchers_to_fetch_functions or []
  def _Dynamic_Fetch(self, request, response):
    """Trivial implementation of URLFetchService::Fetch().

    Args:
      request: the fetch to perform, a URLFetchRequest
      response: the fetch response, a URLFetchResponse

    Raises:
      apiproxy_errors.ApplicationError: with INVALID_URL when the URL is
        too long, the method is unrecognized, or the protocol is not
        http/https.
    """
    # NOTE(review): this extract is missing several original lines (the
    # "method = ..." / "payload = None" assignments in the branches below,
    # the final "else:", the guard before the 'Missing host.' error, and a
    # "deadline=deadline" argument in the final call); visible statements
    # are kept as-is.
    if len(request.url()) >= _MAX_URL_LENGTH:
      logging.error('URL is too long: %s...' % request.url()[:50])
      raise apiproxy_errors.ApplicationError(
          urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

    (protocol, host, path, query, fragment) = urlparse.urlsplit(request.url())

    # Dispatch on the request's protobuf method enum; POST/PUT/PATCH carry
    # a request payload.
    if request.method() == urlfetch_service_pb.URLFetchRequest.GET:
    elif request.method() == urlfetch_service_pb.URLFetchRequest.POST:
      payload = request.payload()
    elif request.method() == urlfetch_service_pb.URLFetchRequest.HEAD:
    elif request.method() == urlfetch_service_pb.URLFetchRequest.PUT:
      payload = request.payload()
    elif request.method() == urlfetch_service_pb.URLFetchRequest.DELETE:
    elif request.method() == urlfetch_service_pb.URLFetchRequest.PATCH:
      payload = request.payload()
      logging.error('Invalid method: %s', request.method())
      raise apiproxy_errors.ApplicationError(
          urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

    # Only plain http/https URLs are supported by this stub.
    if not (protocol == 'http' or protocol == 'https'):
      logging.error('Invalid protocol: %s', protocol)
      raise apiproxy_errors.ApplicationError(
          urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

      # NOTE(review): the guard for this branch (presumably "if not host:")
      # is not visible in this extract.
      logging.error('Missing host.')
      raise apiproxy_errors.ApplicationError(
          urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

    # Remove headers the application is not allowed to set (in place).
    self._SanitizeHttpHeaders(_UNTRUSTED_REQUEST_HEADERS,
                              request.header_list())
    # Per-request deadline and certificate-validation overrides, falling
    # back to the module defaults.
    deadline = _API_CALL_DEADLINE
    if request.has_deadline():
      deadline = request.deadline()
    validate_certificate = _API_CALL_VALIDATE_CERTIFICATE_DEFAULT
    if request.has_mustvalidateservercertificate():
      validate_certificate = request.mustvalidateservercertificate()

    # Route the URL to a registered matcher's fetch function, or the
    # default _RetrieveURL.
    fetch_function = self._GetFetchFunction(request.url())
    fetch_function(request.url(), payload, method,
                   request.header_list(), request, response,
                   follow_redirects=request.followredirects(),
                   validate_certificate=validate_certificate)
  def _GetFetchFunction(self, url):
    """Get the fetch function for a url.

    Args:
      url: A url to fetch from. str.

    Returns:
      A fetch function for this url.
    """
    # First registered matcher wins; fall through to the default retriever.
    # NOTE(review): the predicate line guarding the return (presumably
    # "if urlmatcher(url):") is not visible in this extract.
    for urlmatcher, fetch_function in self._urlmatchers_to_fetch_functions:
        return fetch_function
    return self._RetrieveURL
  def _RetrieveURL(url, payload, method, headers, request, response,
                   follow_redirects=True, deadline=_API_CALL_DEADLINE,
                   validate_certificate=_API_CALL_VALIDATE_CERTIFICATE_DEFAULT):
    """Retrieves a URL over network.

    Args:
      url: String containing the URL to access.
      payload: Request payload to send, if any; None if no payload.
        If the payload is unicode, we assume it is utf-8.
      method: HTTP method to use (e.g., 'GET')
      headers: List of additional header objects to use for the request.
      request: A urlfetch_service_pb.URLFetchRequest proto object from
      response: A urlfetch_service_pb.URLFetchResponse proto object to
        populate with the response data.
      follow_redirects: optional setting (defaulting to True) for whether or not
        we should transparently follow redirects (up to MAX_REDIRECTS)
      deadline: Number of seconds to wait for the urlfetch to finish.
      validate_certificate: If true, do not send request to server unless the
        certificate is valid, signed by a trusted CA and the hostname matches

      Raises an apiproxy_errors.ApplicationError exception with
      INVALID_URL_ERROR in cases where:
        - The protocol of the redirected URL is bad or missing.
        - The port is not in the allowable range of ports.
      Raises an apiproxy_errors.ApplicationError exception with
      TOO_MANY_REDIRECTS in cases when MAX_REDIRECTS is exceeded
    """
    # NOTE(review): this extract is missing many original lines throughout
    # (headers/signature suggest this method takes no self, presumably a
    # @staticmethod -- confirm against the original).  The major visible
    # gaps are flagged inline; visible statements are kept as-is.
    # Normalize unicode payloads to utf-8 bytes before transmission.
    if isinstance(payload, unicode):
      payload = payload.encode('utf-8')

    # Follow redirects manually, at most MAX_REDIRECTS hops.
    for redirect_number in xrange(MAX_REDIRECTS + 1):
      parsed = urlparse.urlsplit(url)
      protocol, host, path, query, fragment = parsed

      # Extract the port from "user@host:port" if present.
      port = urllib.splitport(urllib.splituser(host)[1])[1]

      # Ports outside the production-allowed ranges are rejected.
      # NOTE(review): the logging call wrapping the message below is not
      # fully visible in this extract.
      if not _IsAllowedPort(port):
          'urlfetch received %s ; port %s is not allowed in production!' %
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

      # A protocol without a host means a malformed redirect target.
      if protocol and not host:
        logging.error('Missing host on redirect; target url is %s' % url)
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

      # Relative redirect: reuse the previous hop's protocol.
      # NOTE(review): the corresponding host fallback line(s) are not
      # visible in this extract.
      if not host and not protocol:
        protocol = last_protocol

      # Default request headers sent by the stub.
      # NOTE(review): the adjusted_headers dict literal surrounding these
      # two entries (including the User-Agent key) is not fully visible.
          'AppEngine-Google; (+http://code.google.com/appengine)',
          'Accept-Encoding': 'gzip',

      if payload is not None:
        adjusted_headers['Content-Length'] = str(len(payload))
      # Default form encoding for POST bodies, matching production.
      if method == 'POST' and payload:
        adjusted_headers['Content-Type'] = 'application/x-www-form-urlencoded'

      # Merge caller-supplied headers over the defaults; remember whether
      # the app set its own Accept-Encoding so gzip is passed through raw.
      passthrough_content_encoding = False
      for header in headers:
        if header.key().title().lower() == 'user-agent':
          adjusted_headers['User-Agent'] = (
              (header.value(), adjusted_headers['User-Agent']))
          if header.key().lower() == 'accept-encoding':
            passthrough_content_encoding = True
          adjusted_headers[header.key().title()] = header.value()

      # Debug-log the outgoing request with an escaped payload preview.
      # NOTE(review): the else branch assigning escaped_payload when the
      # payload is None is not visible here.
      if payload is not None:
        escaped_payload = payload.encode('string_escape')
      logging.debug('Making HTTP request: host = %r, '
                    'url = %r, payload = %.1000r, headers = %r',
                    host, url, escaped_payload, adjusted_headers)

      # Pick the connection class for this hop's protocol.
      # NOTE(review): parts of the https branch (the validation condition
      # tail, fancy connection arguments, and else lines) are missing.
      if protocol == 'http':
        connection_class = httplib.HTTPConnection
      elif protocol == 'https':
        if (validate_certificate and _CanValidateCerts() and
          connection_class = fancy_urllib.create_fancy_connection(
          connection_class = httplib.HTTPSConnection
        error_msg = 'Redirect specified invalid protocol: "%s"' % protocol
        logging.error(error_msg)
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.INVALID_URL, error_msg)

      # On Python >= 2.6 the deadline is a per-connection timeout;
      # otherwise we temporarily change the global socket default below.
      if _CONNECTION_SUPPORTS_TIMEOUT:
        connection = connection_class(host, timeout=deadline)
        connection = connection_class(host)

      # Remember the protocol for resolving relative redirects.
      last_protocol = protocol

        full_path = path + '?' + query

      if not _CONNECTION_SUPPORTS_TIMEOUT:
        orig_timeout = socket.getdefaulttimeout()
      # NOTE(review): the try: wrapping the request/response exchange below
      # is not visible in this extract.
        if not _CONNECTION_SUPPORTS_TIMEOUT:
          # Temporarily apply the deadline process-wide (pre-2.6 fallback).
          socket.setdefaulttimeout(deadline)
        connection.request(method, full_path, payload, adjusted_headers)
        http_response = connection.getresponse()
          http_response_data = ''
          http_response_data = http_response.read()
        if not _CONNECTION_SUPPORTS_TIMEOUT:
          # Restore the original process-wide timeout.
          socket.setdefaulttimeout(orig_timeout)
      # Map transport failures onto urlfetch service error codes.
      except _fancy_urllib_InvalidCertException, e:
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.SSL_CERTIFICATE_ERROR,
      except _fancy_urllib_SSLError, e:
            urlfetch_service_pb.URLFetchServiceError.DEADLINE_EXCEEDED
            if 'timed out' in e.message else
            urlfetch_service_pb.URLFetchServiceError.SSL_CERTIFICATE_ERROR)
        raise apiproxy_errors.ApplicationError(app_error, str(e))
      except socket.timeout, e:
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.DEADLINE_EXCEEDED, str(e))
      except (httplib.error, socket.error, IOError), e:
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.FETCH_ERROR, str(e))

      # Redirect: loop again with the Location target; otherwise populate
      # the response proto and return.
      if http_response.status in REDIRECT_STATUSES and follow_redirects:
        url = http_response.getheader('Location', None)
          error_msg = 'Redirecting response was missing "Location" header'
          logging.error(error_msg)
          raise apiproxy_errors.ApplicationError(
              urlfetch_service_pb.URLFetchServiceError.MALFORMED_REPLY,
        response.set_statuscode(http_response.status)
        # Transparently decompress gzip bodies unless the app asked for
        # raw content by setting its own Accept-Encoding.
        if (http_response.getheader('content-encoding') == 'gzip' and
            not passthrough_content_encoding):
          gzip_stream = StringIO.StringIO(http_response_data)
          gzip_file = gzip.GzipFile(fileobj=gzip_stream)
          http_response_data = gzip_file.read()
        # Truncate to the maximum allowed response size.
        response.set_content(http_response_data[:MAX_RESPONSE_SIZE])

        # Copy response headers into the proto, fixing Content-Length to
        # match the (possibly truncated/decompressed) content.
        for header_key in http_response.msg.keys():
          for header_value in http_response.msg.getheaders(header_key):
            if (header_key.lower() == 'content-encoding' and
                header_value == 'gzip' and
                not passthrough_content_encoding):
            if header_key.lower() == 'content-length' and method != 'HEAD':
              header_value = str(len(response.content()))
            header_proto = response.add_header()
            header_proto.set_key(header_key)
            header_proto.set_value(header_value)

        if len(http_response_data) > MAX_RESPONSE_SIZE:
          response.set_contentwastruncated(True)

        # Record the final URL if redirects moved us off the original.
        if request.url() != url:
          response.set_finalurl(url)

      # Fell out of the redirect loop: too many hops.
      error_msg = 'Too many repeated redirects'
      logging.error(error_msg)
      raise apiproxy_errors.ApplicationError(
          urlfetch_service_pb.URLFetchServiceError.TOO_MANY_REDIRECTS,
  def _SanitizeHttpHeaders(self, untrusted_headers, headers):
    """Cleans "unsafe" headers from the HTTP request, in place.

    Args:
      untrusted_headers: Set of untrusted headers names (all lowercase).
      headers: List of Header objects. The list is modified in place.
    """
    # Collect the names of headers that must be stripped, for logging.
    prohibited_headers = [h.key() for h in headers
                          if h.key().lower() in untrusted_headers]
    # NOTE(review): the second argument of the logging.warn call below is
    # not visible in this extract.
    if prohibited_headers:
      logging.warn('Stripped prohibited headers from URLFetch request: %s',
    # Iterate indices in reverse so in-place deletion is safe.
    # NOTE(review): the body of the final if (presumably the deletion) lies
    # past the end of this extract.
    for index in reversed(xrange(len(headers))):
      if headers[index].key().lower() in untrusted_headers: