3 Module for retrieving data from a URL.
6 __copyright__
= "Copyright (c) 2002-2005 Free Software Foundation, Inc"
Straw is free software; you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

Straw is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA. """
29 from cStringIO
import StringIO
31 from StringIO
import StringIO
34 import NetworkConstants
40 parts
= host
.split(".")
46 except (ValueError, TypeError):
class RequestSchemeException(Exception):
    """Raised when a URI uses a scheme this module will not fetch.

    The rejected scheme is kept on the instance as ``scheme`` so a handler
    can report which one was refused.
    """

    def __init__(self, scheme):
        # The base class is initialised without arguments, as in the
        # original, so ``str(exc)`` stays empty.
        Exception.__init__(self)
        self.scheme = scheme
class Request:
    """Plain record describing one HTTP fetch.

    Every field defaults to None and is stored unchanged; no validation
    is performed here.
    """

    def __init__(self, host=None, port=None, path=None, ip=None,
                 headers=None, user=None, password=None, priority=None,
                 consumer=None, uri=None):
        # Where to connect and what to ask for.
        self.host = host
        self.port = port
        self.path = path
        self.ip = ip
        # Header mapping sent with the request.
        self.headers = headers
        # Credentials, if any.
        self.user = user
        self.password = password
        # Queue ordering key and the object that receives the outcome.
        self.priority = priority
        self.consumer = consumer
        # The URI the request was built from.
        self.uri = uri
74 def __init__(self
, f
):
78 if not hasattr(self
, "_f"): return
def _is_dotted_quad(host):
    """Return True if *host* is four dot-separated integers in 0..255."""
    parts = host.split(".")
    if len(parts) != 4:
        return False
    for part in parts:
        try:
            if not 0 <= int(part) < 256:
                return False
        except (ValueError, TypeError):
            return False
    return True


class ConnectionManager:
    """Queues HTTP requests by priority and drives the network loop.

    ``request()`` builds a ConsumerAdapter for a URI and queues it, after a
    host name lookup when one is needed.  ``poll()`` starts queued adapters
    while the connection limit allows and services the lookup manager and
    asyncore.
    """

    def __init__(self):
        # Adapters waiting to be started, ordered by ascending priority value.
        self._queue = []

    def request(self, uri, consumer, headers=None, user=None, password=None,
                priority=NetworkConstants.PRIORITY_DEFAULT):
        """Schedule a GET of *uri* and report the outcome to *consumer*.

        *headers* is copied, so neither the caller's mapping nor a shared
        default is modified when authentication headers are added.

        Returns a Caller wrapping the adapter's ``stop`` method.
        Raises RequestSchemeException if the URI scheme is not "http".
        """
        uri = uri.strip()
        scheme, host, path, params, query, fragment = urlparse.urlparse(uri)
        # NOTE(review): the condition line was not visible in the reviewed
        # text; "http only" matches the check in http_redirect.
        if scheme != "http":
            raise RequestSchemeException(scheme)
        try:
            host, port = host.split(":", 1)
            port = int(port)
        except (TypeError, ValueError):
            port = 80  # default port
        if not path:
            path = "/"
        if params:
            path = path + ";" + params
        if query:
            path = path + "?" + query
        # Private copy: credentials are written into this mapping below and
        # in ConsumerAdapter, and must not leak into other requests.
        headers = dict(headers) if headers else {}
        req = Request(host=host, port=port, path=path, headers=headers,
                      user=user, password=password, priority=priority,
                      consumer=consumer, uri=uri)
        pc = Config.get_instance().proxy_config
        # we don't set ConsumerAdapter's proxy here because we might not
        # yet know its ip. However, we can set the Proxy-Authorization
        # header.
        if pc.use_authentication:
            # b64encode never inserts line breaks; encodestring wraps its
            # output every 76 characters, which corrupts the header for
            # long credentials.
            credentials = '%s:%s' % (pc.user, pc.password)
            req.headers['Proxy-Authorization'] = (
                'Basic %s' % base64.b64encode(credentials))
        adapter = ConsumerAdapter(req)

        # no lookup necessary if host is an ip address.
        # if we are using a proxy, let it handle the lookup.
        if _is_dotted_quad(host):
            adapter.request.ip = host
            self._queue_request(adapter)
        elif pc.use:
            self._queue_request(adapter)
        else:
            try:
                LookupManager.get_instance().lookup(
                    host, self._request_resolved, adapter)
            except LookupManager.NameFormatException as e:
                adapter.http_failed(e)
        # don't return the categoryadapter object, because nothing in
        # it except stop() should be called from other parts of Straw
        return Caller(adapter.stop)

    def _queue_request(self, adapter):
        """Insert *adapter* before the first queued entry whose priority
        value is greater; append it when there is none."""
        priority = adapter.request.priority
        for index, queued in enumerate(self._queue):
            if queued.request.priority > priority:
                self._queue.insert(index, adapter)
                return
        self._queue.append(adapter)

    def _request_resolved(self, host, ip, adapter):
        """Lookup callback: queue *adapter* with its ip, or fail it."""
        if ip is not None and ip != "":
            adapter.request.ip = ip
            self._queue_request(adapter)
        else:
            adapter.http_failed(_("Host name lookup failed"))

    def poll(self, timeout=0.1):
        """Start queued requests and service the network for up to *timeout*.

        Returns a non-zero value while requests are queued or sockets are
        open, i.e. while the caller should keep polling.
        """
        proxy_config = Config.get_instance().proxy_config
        lookup_manager = LookupManager.get_instance()
        # activate up to MAX_CONNECTIONS channels
        while (self._queue
               and len(asyncore.socket_map) < NetworkConstants.MAX_CONNECTIONS
               and not proxy_config.is_waiting):
            adapter = self._queue.pop(0)
            # has the user switched off the proxy after this request was queued
            if (not proxy_config.use) and (not adapter.request.ip):
                lookup_manager.lookup(
                    adapter.request.host, self._request_resolved, adapter)
            else:
                if proxy_config.use:
                    adapter.set_proxy((proxy_config.ip, proxy_config.port))
                # NOTE(review): this call was not visible in the reviewed
                # text; confirm the adapter's start method name.
                adapter.start()
        # keep the network running
        now = time.time()
        lookup_manager.poll(timeout)
        timeout -= (time.time() - now)
        # Only hand asyncore whatever time the lookups left over.
        if timeout > 0.0:
            asyncore.poll(timeout)
        # time out stuck consumers
        self.time_out_consumers()
        # return non-zero if we should keep polling
        return len(self._queue) or len(asyncore.socket_map)

    def time_out_consumers(self):
        """Abort every open connection that has exceeded MAX_DOWNLOAD_TIME."""
        now = time.time()
        for obj in asyncore.socket_map.values():
            # NOTE(review): the attribute holding the adapter was not
            # visible in the reviewed text; "consumer" is assumed.
            pc = getattr(obj, "consumer", None)
            if pc is None or pc.start_time is None:
                continue
            if now - pc.start_time > NetworkConstants.MAX_DOWNLOAD_TIME:
                pc.time_exceeded()


connection_manager = ConnectionManager()
class ConsumerAdapter(object):
    """Connects one asynchronous HTTP connection to the consumer awaiting it.

    The adapter accumulates the response body, enforces the download size
    limit, and reports exactly one outcome (results or failure) to the
    consumer.  ``state`` moves from CREATED to STARTED to CLOSED.
    """

    # NOTE(review): the state definitions were not visible in the reviewed
    # text; the values are only ever compared for equality.
    CREATED = 0
    STARTED = 1
    CLOSED = 2

    # TODO: we do nothing with the proxy?
    def __init__(self, req):
        """Create the connection for *req* and fill in its standard headers."""
        self.connection = httplib_async.HTTPConnection_async(
            req.host, req.ip, req.port, self)
        self.consumer = req.consumer
        self._data = ""
        self.header = None
        self.status = None
        self._request = req
        self.finished = False
        self.start_time = None
        self.state = self.CREATED
        # Request() defaults headers to None; the assignments below need a
        # mapping.
        if req.headers is None:
            req.headers = {}
        if req.user and req.password:
            # b64encode never inserts line breaks; encodestring wraps its
            # output every 76 characters, which corrupts the header for
            # long credentials.
            credentials = '%s:%s' % (req.user, req.password)
            req.headers['Authorization'] = (
                'Basic %s' % base64.b64encode(credentials))
        req.headers['Accept-encoding'] = 'gzip'
        req.headers['User-agent'] = 'Straw/%s' % constants.VERSION

    # interface used by ConnectionManager
    def start(self):
        """Issue the GET request; does nothing unless the state is CREATED."""
        if self.state != self.CREATED:
            return
        self.state = self.STARTED
        self.start_time = time.time()
        try:
            self.connection.request(
                "GET", self._request.path, self._request.headers)
            self.connection.execute()
        except Exception as ex:
            self._send_failed(ex)
            self._close_connection()
            self.state = self.CLOSED

    def set_proxy(self, proxy):
        """Route the connection through *proxy*."""
        # _close_connection() deletes the attribute, and a stopped adapter
        # can still be sitting in the manager's queue.
        if hasattr(self, "connection"):
            self.connection.set_proxy(proxy)

    def time_exceeded(self):
        """Abort a started download and report the timeout as a failure."""
        if self.state != self.STARTED:
            return
        self.state = self.CLOSED
        self._close_connection()
        self._send_failed(_("Maximum download time exceeded"))

    def get_request(self):
        """Return the Request this adapter was created for."""
        return self._request

    request = property(get_request)

    def stop(self):
        """Cancel the operation and notify the consumer; idempotent."""
        if self.state != self.CLOSED:
            self.state = self.CLOSED
            self._close_connection()
            self.consumer.operation_stopped()

    # the following methods are the interface HTTPConnection_async uses
    def http_failed(self, exception):
        """Called by HTTPConnection_async when connection failed with
        exception. Also used by ConnectionManager if host name lookup failed.
        """
        self._send_failed(exception)
        self._close_connection()
        self.state = self.CLOSED

    def http_header(self, status, header):
        """Called by HTTPConnection_async with status and header"""
        self.status = status
        self.header = header
        # A declared empty body means nothing more will arrive.
        if header.getheader('content-length') == '0':
            self._send_results()
            self._close_connection()
            self.state = self.CLOSED

    def http_redirect(self, location, permanent=0):
        """Called by HTTPConnection_async with the new location in case of 301 or 302"""
        assert isinstance(location, str), "Invalid redirect"
        # Anything that is not an absolute http URL is resolved against the
        # URI of the current request.
        if urlparse.urlparse(location)[0] != 'http':
            location = urlparse.urljoin(self.request.uri, location)
        if permanent:
            self.consumer.http_permanent_redirect(location)
        connection_manager.request(
            location, self.consumer, self.request.headers,
            self.request.user, self.request.password)
        self._close_connection()
        self.state = self.CLOSED

    def feed(self, data):
        """Called by HTTPConnection_async with (part of the) data, after http_header"""
        self._data += data
        datalength = len(self._data)
        cl = self.header.getheader('content-length')
        if cl is not None and datalength >= int(cl):
            # The whole declared body has arrived.
            self._close_connection()
            self._send_results()
            self.state = self.CLOSED
        elif datalength >= NetworkConstants.MAX_DOWNLOAD_SIZE:
            self._close_connection()
            self._send_failed(_("Maximum download file size exceeded"))
            self.state = self.CLOSED

    def http_close(self):
        """Called by HTTPConsumer_async when the connection is closed"""
        if self.header and (len(self._data) <
                            int(self.header.getheader('content-length', 0))):
            # Fewer bytes arrived than the header declared.
            msg = _("Feed is empty.")
            self._send_failed(msg)
        else:
            self._send_results()
        self._close_connection()
        self.state = self.CLOSED

    def _send_failed(self, data):
        """Report a failure to the consumer unless an outcome was already sent."""
        if not self.finished:
            self.finished = True
            self.consumer.http_failed(data)

    def _send_results(self):
        """Deliver the body, gunzipped when so encoded, unless already done."""
        if not self.finished:
            self.finished = True
            if self.header and self.header.getheader(
                    'content-encoding') == 'gzip':
                self._data = gzip.GzipFile(
                    fileobj=StringIO(self._data)).read()
            self.consumer.http_results(self.status, self.header, self._data)

    def _close_connection(self):
        """Close and drop the connection; safe to call more than once."""
        if hasattr(self, "connection"):
            try:
                self.connection.close()
            except Exception:
                # silently ignore errors: if it didn't succeed, it wasn't
                # open in the first place. we don't care.
                pass
            del self.connection
327 if getattr(self
, "request", None) is not None:
328 host
= self
.request
.host
329 path
= self
.request
.path
330 return '<%s for %s %s>' % (repr(self
), host
, path
)