Updated Arabic Translation by Djihed Afifi.
[straw.git] / src / lib / URLFetch.py
blobd8dbe575bbf34a352201aebe92a8202f80e19442
1 """ URLFetch.py
3 Module for retrieving data from a URL.
5 """
6 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc"
7 __license__ = """
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
11 version.
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """
23 import httplib_async
24 import asyncore
25 import urlparse
26 import base64
27 import gzip
28 try:
29 from cStringIO import StringIO
30 except:
31 from StringIO import StringIO
32 import error
33 import time
34 import NetworkConstants
35 import Config
36 import LookupManager
37 import constants
def is_ip(host):
    """Tell whether ``host`` is a dotted-quad IPv4 address.

    Returns 1 when ``host`` consists of exactly four dot-separated groups of
    decimal digits, each with a value in the range 0-255, and 0 otherwise.
    """
    parts = host.split(".")
    if len(parts) != 4:
        return 0
    for p in parts:
        # int() on its own also accepts signs and surrounding whitespace
        # ("+4", "-0", " 7"), none of which belong in an address, so insist
        # on plain digits first.
        if not p.isdigit():
            return 0
        try:
            np = int(p)
        except (ValueError, TypeError):
            return 0
        if not 0 <= np < 256:
            return 0
    return 1
class RequestSchemeException(Exception):
    """Raised for a URI whose scheme cannot be fetched.

    The offending scheme is available as the ``scheme`` attribute and is
    included in the exception message.
    """

    def __init__(self, scheme):
        # Hand a message to the base class so that str(exc) and tracebacks
        # name the scheme instead of being empty.
        Exception.__init__(self, "Unsupported URI scheme: %r" % (scheme,))
        self.scheme = scheme
class Request:
    """Plain record holding everything known about one HTTP request.

    Every field is optional and defaults to None; the constructor simply
    stores what it is given under an attribute of the same name.
    """

    def __init__(self, host = None, port = None, path = None, ip = None,
                 headers = None, user = None, password = None, priority = None,
                 consumer = None, uri = None):
        # the original URI and the pieces it was split into
        self.uri = uri
        self.host, self.port, self.path = host, port, path
        # resolved address of the host, when known
        self.ip = ip
        # extra request headers and credentials
        self.headers = headers
        self.user, self.password = user, password
        # queueing priority and the object that receives the outcome
        self.priority = priority
        self.consumer = consumer
class Caller:
    """One-shot wrapper around a zero-argument callable.

    The first call invokes the wrapped callable and then forgets it, so any
    later call does nothing.
    """

    def __init__(self, f):
        self._f = f

    def __call__(self):
        if hasattr(self, "_f"):
            try:
                self._f()
            finally:
                # forget the callable even when it raised
                del self._f
class ConnectionManager:
    """Priority queue of pending HTTP requests, driven by ``poll()``.

    ``request()`` wraps each URI in a ConsumerAdapter and queues it, after a
    host name lookup when one is needed.  ``poll()`` starts queued adapters
    while fewer than NetworkConstants.MAX_CONNECTIONS entries are in
    asyncore.socket_map, then pumps the lookup manager and asyncore.
    """

    def __init__(self):
        # Pending ConsumerAdapters, kept sorted by ascending request.priority.
        self._queue = []

    def request(self, uri, consumer, headers=None, user=None, password=None, priority=NetworkConstants.PRIORITY_DEFAULT):
        """Queue a GET request for ``uri`` whose outcome goes to ``consumer``.

        ``headers`` is an optional mapping of extra request headers.  It is
        copied, so neither the caller's mapping nor a shared default ever
        picks up the authorization headers added for this request.
        ``user`` and ``password`` are stored on the Request for the adapter.
        ``priority`` decides the position in the queue; requests with a
        smaller value are started first.

        Returns a callable that stops the request when called.

        Raises RequestSchemeException when the URI scheme is not "http".
        """
        if headers is None:
            headers = {}
        else:
            headers = dict(headers)
        uri = uri.strip()
        scheme, host, path, params, query, fragment = urlparse.urlparse(uri)
        if scheme != "http":
            raise RequestSchemeException(scheme)
        try:
            # Unpacking fails with ValueError when there is no ":port" part.
            host, port = host.split(":", 1)
            port = int(port)
        except (TypeError, ValueError):
            port = 80 # default port

        # Rebuild the request path from the parsed pieces.
        if not path:
            path = "/"
        if params:
            path = path + ";" + params
        if query:
            path = path + "?" + query
        req = Request(host = host, port = port, path = path, headers = headers,
                      user = user, password = password, priority = priority,
                      consumer = consumer, uri = uri)
        pc = Config.get_instance().proxy_config
        if pc.use:
            # we don't set ConsumerAdapter's proxy here because we might not
            # yet know its ip. However, we can set the Proxy-Authorization
            # line
            if pc.use_authentication:
                # b64encode never inserts line breaks; encodestring wraps its
                # output every 76 characters, which would corrupt the header
                # for long credentials.
                req.headers['Proxy-Authorization'] = (
                    'Basic %s' % base64.b64encode(
                        '%s:%s' % (pc.user, pc.password)))
        adapter = ConsumerAdapter(req)

        # no lookup necessary if host is an ip address.
        # if we are using a proxy, let it handle the lookup.
        if is_ip(host):
            adapter.request.ip = host
            self._queue_request(adapter)
        elif pc.use:
            self._queue_request(adapter)
        else:
            try:
                LookupManager.get_instance().lookup(
                    host, self._request_resolved, adapter)
            except LookupManager.NameFormatException as e:
                adapter.http_failed(e)
        # don't return the adapter object, because nothing in
        # it except stop() should be called from other parts of Straw
        return Caller(adapter.stop)

    def _queue_request(self, adapter):
        """Insert ``adapter`` into the queue, keeping it ordered by priority.

        The adapter goes in front of the first queued entry whose priority
        value is strictly greater, so entries of equal priority stay in
        arrival order.
        """
        for i, queued in enumerate(self._queue):
            if queued.request.priority > adapter.request.priority:
                self._queue.insert(i, adapter)
                return
        # nothing queued has a greater priority value
        self._queue.append(adapter)

    def _request_resolved(self, host, ip, adapter):
        """Lookup callback: queue ``adapter`` with its ``ip``, or fail it."""
        if ip is not None and ip != "":
            adapter.request.ip = ip
            self._queue_request(adapter)
        else:
            adapter.http_failed(_("Host name lookup failed"))

    def poll(self, timeout=0.1):
        """Start queued requests and run the network for up to ``timeout`` s.

        Returns a non-zero value while there is still something queued or an
        entry in asyncore.socket_map, i.e. while polling should continue.
        """
        proxy_config = Config.get_instance().proxy_config
        lookup_manager = LookupManager.get_instance()
        # activate up to MAX_CONNECTIONS channels
        while (self._queue
               and len(asyncore.socket_map) < NetworkConstants.MAX_CONNECTIONS
               and not proxy_config.is_waiting):
            adapter = self._queue.pop(0)
            if adapter.state == adapter.CLOSED:
                # stopped while it was still waiting in the queue; its
                # connection is already gone, so there is nothing to start
                continue
            if (not proxy_config.use) and (not adapter.request.ip):
                # the proxy was switched off after this request was queued,
                # so the host name still has to be resolved
                try:
                    lookup_manager.lookup(
                        adapter.request.host, self._request_resolved, adapter)
                except LookupManager.NameFormatException as e:
                    adapter.http_failed(e)
            else:
                if proxy_config.use:
                    adapter.set_proxy((proxy_config.ip, proxy_config.port))
                adapter.start()
        # keep the network running; asyncore only gets what is left of the
        # timeout after the lookup manager has had its turn
        now = time.time()
        lookup_manager.poll(timeout)
        timeout -= (time.time() - now)
        if timeout > 0.0:
            asyncore.poll(timeout)
        # time out stuck consumers
        self.time_out_consumers()
        # return non-zero if we should keep polling
        return len(self._queue) or len(asyncore.socket_map)

    def time_out_consumers(self):
        """Fail every active download older than MAX_DOWNLOAD_TIME seconds."""
        now = time.time()
        # Iterate over a snapshot: time_exceeded() closes the connection,
        # which may remove entries from socket_map while we are looping.
        for obj in list(asyncore.socket_map.values()):
            pc = obj.consumer
            if now - pc.start_time > NetworkConstants.MAX_DOWNLOAD_TIME:
                pc.time_exceeded()


# Module-wide instance; ConsumerAdapter.http_redirect submits redirected
# requests through it.
connection_manager = ConnectionManager()
class ConsumerAdapter(object):
    """Glue between one Request, its HTTP connection and its consumer.

    ConnectionManager drives the adapter through start(), set_proxy() and
    time_exceeded(); the connection reports back through http_failed(),
    http_header(), http_redirect(), feed() and http_close().  The consumer
    receives at most one of http_results() / http_failed(), guarded by the
    ``finished`` flag.
    """

    # life cycle states stored in self.state
    CREATED = 0
    STARTED = 1
    CLOSED = 2

    # TODO: we do nothing with the proxy?
    def __init__(self, req):
        """Create the connection for ``req`` and fill in standard headers."""
        # NOTE(review): the connection is built with req.ip as it is right
        # now; ConnectionManager assigns request.ip only after a lookup --
        # confirm HTTPConnection_async picks the later value up.
        self.connection = httplib_async.HTTPConnection_async(req.host, req.ip, req.port, self)
        self.consumer = req.consumer
        self._data = ""
        self.header = None
        self.status = None
        self._request = req
        self.finished = False
        self.start_time = None
        self.state = self.CREATED
        # Request() leaves headers as None unless told otherwise.
        if req.headers is None:
            req.headers = {}
        if req.user and req.password:
            # b64encode never inserts line breaks; encodestring wraps its
            # output every 76 characters, which would corrupt the header for
            # long credentials.
            req.headers['Authorization'] = 'Basic %s' % base64.b64encode(
                '%s:%s' % (req.user, req.password))
        req.headers['Accept-encoding'] = 'gzip'
        req.headers['User-agent'] = 'Straw/%s' % constants.VERSION

    # interface used by ConnectionManager
    def start(self):
        """Send the GET request; does nothing unless the state is CREATED."""
        if self.state != self.CREATED:
            return
        self.state = self.STARTED
        self.start_time = time.time()
        try:
            self.connection.request(
                "GET", self._request.path, self._request.headers)
            self.connection.execute()
        except Exception as ex:
            self._send_failed(ex)
            self._close_connection()
            self.state = self.CLOSED

    def set_proxy(self, proxy):
        """Route the connection through ``proxy``, a (ip, port) tuple."""
        # _close_connection() deletes self.connection, so an adapter that
        # was stopped while still queued has nothing left to configure.
        if hasattr(self, "connection"):
            self.connection.set_proxy(proxy)

    def time_exceeded(self):
        """Abort a started download that has been running for too long."""
        if self.state != self.STARTED:
            return
        self.state = self.CLOSED
        self._close_connection()
        self._send_failed(_("Maximum download time exceeded"))

    def get_request(self):
        return self._request
    request = property(get_request)

    def stop(self):
        """Cancel the request and tell the consumer it was stopped."""
        if self.state != self.CLOSED:
            self.state = self.CLOSED
            self._close_connection()
            self.consumer.operation_stopped()

    # the following methods are the interface HTTPConnection_async uses
    def http_failed(self, exception):
        """Called by HTTPConnection_async when connection failed with
        exception. Also used by ConnectionManager if host name lookup failed.
        """
        self._send_failed(exception)
        self._close_connection()
        self.state = self.CLOSED

    def http_header(self, status, header):
        """Called by HTTPConnection_async with status and header"""
        self.status = status
        self.header = header
        # an explicitly empty body means no feed() call is needed
        if header.getheader('content-length') == '0':
            self._send_results()
            self._close_connection()
            self.state = self.CLOSED

    def http_redirect(self, location, permanent = 0):
        """Called by HTTPConnection_async with the new location in case of 301 or 302"""
        assert type(location) == type(''), "Invalid redirect"
        # anything that is not an absolute http URL is resolved against the
        # URI this request was made for
        if urlparse.urlparse(location)[0] != 'http':
            location = urlparse.urljoin(self.request.uri, location)
        if permanent:
            self.consumer.http_permanent_redirect(location)

        try:
            connection_manager.request(
                location, self.consumer, self.request.headers,
                self.request.user, self.request.password)
        except RequestSchemeException as ex:
            # The target cannot be fetched (e.g. an https location); report
            # it instead of letting the exception escape this callback with
            # the connection still open and the consumer never notified.
            self._send_failed(ex)
        self._close_connection()
        self.state = self.CLOSED

    def feed(self, data):
        """Called by HTTPConnection_async with (part of the) data, after http_header"""
        self._data += data
        datalength = len(self._data)
        cl = self.header.getheader('content-length')
        if cl is not None and datalength >= int(cl):
            # the announced amount of data has arrived
            self._close_connection()
            self._send_results()
            self.state = self.CLOSED
        elif datalength >= NetworkConstants.MAX_DOWNLOAD_SIZE:
            self._close_connection()
            self._send_failed(_("Maximum download file size exceeded"))
            self.state = self.CLOSED

    def http_close(self):
        """Called when the connection is closed"""
        # fewer bytes than content-length announced counts as a failure
        if self.header and (len(self._data) <
                            int(self.header.getheader('content-length', 0))):
            msg = _("Feed is empty.")
            self._send_failed(msg)
        else:
            self._send_results()
        self._close_connection()
        self.state = self.CLOSED

    # internal
    def _send_failed(self, data):
        """Report failure ``data`` to the consumer, unless already finished."""
        if not self.finished:
            self.finished = True
            self.consumer.http_failed(data)

    def _send_results(self):
        """Hand status, header and body to the consumer, unless finished.

        A body sent with ``content-encoding: gzip`` is decompressed first; if
        it cannot be decompressed the consumer gets http_failed() instead.
        """
        import zlib
        if self.finished:
            return
        self.finished = True
        if self.header and self.header.getheader(
            'content-encoding') == 'gzip':
            try:
                self._data = gzip.GzipFile(
                    fileobj = StringIO(self._data)).read()
            except (IOError, EOFError, zlib.error) as ex:
                # finished is already set, so call the consumer directly;
                # otherwise it would never hear about this request again
                self.consumer.http_failed(ex)
                return
        self.consumer.http_results(self.status, self.header, self._data)

    def _close_connection(self):
        """Close and forget the connection; safe to call more than once."""
        if hasattr(self, "connection"):
            try:
                self.connection.close()
            except:
                # silently ignore errors: if it didn't succeed, it wasn't
                # open in the first place. we don't care.
                pass
            del self.connection

    def __str__(self):
        host = "Unknown"
        path = "Unknown"
        if getattr(self, "request", None) is not None:
            host = self.request.host
            path = self.request.path
        return '<%s for %s %s>' % (repr(self), host, path)