Truncate partial download file if resume is unsupported (bug 409)
[gpodder.git] / src / gpodder / download.py
blob1ebe3b502c0766942a4fc3540690e8140d487ac3
1 # -*- coding: utf-8 -*-
3 # gPodder - A media aggregator and podcast client
4 # Copyright (c) 2005-2009 Thomas Perl and the gPodder Team
6 # gPodder is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3 of the License, or
9 # (at your option) any later version.
11 # gPodder is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
22 # download.py -- Download client using DownloadStatusManager
23 # Thomas Perl <thp@perli.net> 2007-09-15
25 # Based on libwget.py (2005-10-29)
28 from __future__ import with_statement
30 from gpodder.liblogger import log
31 from gpodder.libgpodder import gl
32 from gpodder.dbsqlite import db
33 from gpodder import util
34 from gpodder import resolver
35 import gpodder
37 import threading
38 import urllib
39 import shutil
40 import os.path
41 import os
42 import time
43 import collections
45 from xml.sax import saxutils
class ContentRange(object):
    # Based on:
    # http://svn.pythonpaste.org/Paste/WebOb/trunk/webob/byterange.py
    #
    # Copyright (c) 2007 Ian Bicking and Contributors
    #
    # Permission is hereby granted, free of charge, to any person obtaining
    # a copy of this software and associated documentation files (the
    # "Software"), to deal in the Software without restriction, including
    # without limitation the rights to use, copy, modify, merge, publish,
    # distribute, sublicense, and/or sell copies of the Software, and to
    # permit persons to whom the Software is furnished to do so, subject to
    # the following conditions:
    #
    # The above copyright notice and this permission notice shall be
    # included in all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
    # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
    # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
    # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    """
    Represents the Content-Range header

    This header is ``start-stop/length``, where stop and length can be
    ``*`` (represented as None in the attributes).

    Note that ``parse()`` stores ``stop`` as the header's end value minus
    one and ``__str__()`` adds one again, so parsing and formatting
    round-trip.
    """

    def __init__(self, start, stop, length):
        """
        Store the three components of the header.

        start must be >= 0; stop must be None or >= start; length is
        stored unchecked (None means "unknown").  Violations raise
        AssertionError.
        """
        assert start >= 0, "Bad start: %r" % start
        assert stop is None or (stop >= 0 and stop >= start), (
            "Bad stop: %r" % stop)
        self.start = start
        self.stop = stop
        self.length = length

    def __repr__(self):
        return '<%s %s>' % (
            self.__class__.__name__,
            self)

    def __str__(self):
        """Format as ``bytes start-stop/length``, using ``*`` for None."""
        if self.stop is None:
            stop = '*'
        else:
            # Inverse of the "end - 1" adjustment applied in parse()
            stop = self.stop + 1
        if self.length is None:
            length = '*'
        else:
            length = self.length
        return 'bytes %s-%s/%s' % (self.start, stop, length)

    def __iter__(self):
        """
        Mostly so you can unpack this, like:

            start, stop, length = res.content_range
        """
        return iter([self.start, self.stop, self.length])

    @classmethod
    def parse(cls, value):
        """
        Parse the header.  May return None if it cannot parse.

        The header value comes from a remote server, so every malformed
        input (including a range that the constructor would reject) yields
        None instead of raising.
        """
        if value is None:
            return None
        value = value.strip()
        if not value.startswith('bytes '):
            # Unparseable
            return None
        value = value[len('bytes '):].strip()
        if '/' not in value:
            # Invalid, no length given
            return None
        byte_range, length = value.split('/', 1)
        if '-' not in byte_range:
            # Invalid, no range
            return None
        start, end = byte_range.split('-', 1)
        try:
            start = int(start)
            if end == '*':
                end = None
            else:
                end = int(end)
            if length == '*':
                length = None
            else:
                length = int(length)
        except ValueError:
            # Parse problem
            return None
        if end is None:
            return cls(start, None, length)
        stop = end - 1
        if stop < start:
            # The constructor asserts stop >= start; a header that would
            # violate this is treated as unparseable rather than letting
            # an AssertionError escape to the caller.
            return None
        return cls(start, stop, length)
class DownloadCancelledException(Exception):
    """Signals that a running download was cancelled or paused."""
class gPodderDownloadHTTPError(Exception):
    """HTTP error response received while downloading.

    Attributes:
        url: the URL that was requested
        error_code: the HTTP status code
        error_message: the HTTP reason phrase
    """

    def __init__(self, url, error_code, error_message):
        # Initialise the base class so that args/str()/repr() carry the
        # details when the exception shows up in a log or traceback.
        Exception.__init__(self, url, error_code, error_message)
        self.url = url
        self.error_code = error_code
        self.error_message = error_message
class DownloadURLOpener(urllib.FancyURLopener):
    """FancyURLopener that can resume partial downloads.

    Unknown HTTP error codes raise gPodderDownloadHTTPError instead of
    returning the error page, and credentials are taken from the channel
    passed to the constructor.
    """
    version = gpodder.user_agent

    def __init__(self, channel):
        """Set up proxies from the gPodder configuration.

        channel must provide "username" and "password" attributes (used
        by prompt_user_passwd).
        """
        if gl.config.proxy_use_environment:
            # None makes urllib fall back to its own proxy detection
            proxies = None
        else:
            proxies = {}
            if gl.config.http_proxy:
                proxies['http'] = gl.config.http_proxy
            if gl.config.ftp_proxy:
                proxies['ftp'] = gl.config.ftp_proxy

        self.channel = channel
        urllib.FancyURLopener.__init__(self, proxies)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """
        FancyURLopener by default does not raise an exception when
        there is some unknown HTTP error code. We want to override
        this and provide a function to log the error and raise an
        exception, so we don't download the HTTP error page here.
        """
        # The following two lines are copied from urllib.URLopener's
        # implementation of http_error_default
        fp.read()
        fp.close()
        raise gPodderDownloadHTTPError(url, errcode, errmsg)

    # The following is based on Python's urllib.py "URLopener.retrieve"
    # Also based on http://mail.python.org/pipermail/python-list/2001-October/110069.html

    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
        """Treat "206 Partial Content" as a successful response."""
        # The next line is taken from urllib's URLopener.open_http
        # method, at the end after the line "if errcode == 200:"
        return urllib.addinfourl(fp, headers, 'http:' + url)

    def retrieve_resume(self, url, filename, reporthook=None, data=None):
        """Download url into filename, resuming an existing partial file.

        The filename argument is REQUIRED (no tempfile creation code here!)

        If filename already exists, a Range request for the remaining
        bytes is sent. When the response carries no matching
        Content-Range header, the file is truncated and the download
        starts over.

        reporthook, if given, is called as reporthook(blocknum,
        blocksize, totalsize) before the first block and after every
        block; totalsize is -1 when no Content-Length was sent. Any
        exception it raises aborts the download and propagates.

        Returns (filename, headers).

        Raises urllib.ContentTooShortError if fewer bytes than announced
        were received (only checked when a reporthook is given).
        """

        current_size = 0
        tfp = None
        if os.path.exists(filename):
            try:
                current_size = os.path.getsize(filename)
                tfp = open(filename, 'ab')
                #If the file exists, then only download the remainder
                self.addheader('Range', 'bytes=%s-' % (current_size))
            except (IOError, OSError):
                log('Cannot open file for resuming: %s', filename, sender=self, traceback=True)
                tfp = None
                current_size = 0

        if tfp is None:
            tfp = open(filename, 'wb')

        fp = None
        try:
            url = urllib.unwrap(urllib.toBytes(url))
            fp = self.open(url, data)
            headers = fp.info()

            if current_size > 0:
                # We told the server to resume - see if she agrees
                # See RFC2616 (206 Partial Content + Section 14.16)
                # XXX check status code here, too...
                content_range = ContentRange.parse(headers.get('content-range', ''))
                if content_range is None or content_range.start != current_size:
                    # Ok, that did not work. Reset the download
                    # TODO: seek and truncate if content-range differs from request
                    tfp.close()
                    tfp = open(filename, 'wb')
                    current_size = 0
                    log('Cannot resume. Missing or wrong Content-Range header (RFC2616)', sender=self)

            # gPodder TODO: we can get the real url via fp.geturl() here
            #               (if anybody wants to fix filenames in the future)

            result = filename, headers
            bs = 1024*8
            size = -1
            read = current_size
            blocknum = int(current_size/bs)
            if reporthook:
                if "content-length" in headers:
                    size = int(headers["Content-Length"]) + current_size
                reporthook(blocknum, bs, size)
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            # Always release both handles, including when opening the URL
            # fails or the report hook raises to cancel/pause the download.
            if fp is not None:
                fp.close()
            tfp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise urllib.ContentTooShortError("retrieval incomplete: got only %i out "
                                       "of %i bytes" % (read, size), result)

        return result

    # end code based on urllib.py

    def prompt_user_passwd(self, host, realm):
        """Return the channel's (username, password), or (None, None)."""
        if self.channel.username or self.channel.password:
            log('Authenticating as "%s" to "%s" for realm "%s".', self.channel.username, host, realm, sender=self)
            return (self.channel.username, self.channel.password)

        return (None, None)
class DownloadQueueWorker(threading.Thread):
    """Thread that pops tasks off a shared queue and runs them.

    queue must provide pop() raising IndexError when empty;
    exit_callback(worker) is called once when the thread is about to end.
    """

    def __init__(self, queue, exit_callback):
        threading.Thread.__init__(self)
        self.queue = queue
        self.exit_callback = exit_callback
        self.cancelled = False

    def stop_accepting_tasks(self):
        """
        When this is called, the worker will not accept new tasks,
        but quit when the current task has been finished.
        """
        if not self.cancelled:
            self.cancelled = True
            log('%s stopped accepting tasks.', self.getName(), sender=self)

    def run(self):
        """Process tasks until the queue is empty or the worker is stopped."""
        try:
            log('Running new thread: %s', self.getName(), sender=self)
            while not self.cancelled:
                # Only the pop() is guarded: an IndexError raised from
                # inside a task must not be mistaken for an empty queue.
                try:
                    task = self.queue.pop()
                except IndexError:
                    log('No more tasks for %s to carry out.', self.getName(), sender=self)
                    break
                log('%s is processing: %s', self.getName(), task, sender=self)
                task.run()
        finally:
            # Always report the exit, even if a task raised, so the owner
            # does not keep a dead worker registered.
            self.exit_callback(self)
class DownloadQueueManager(object):
    """Holds the queue of download tasks and the worker threads serving it."""

    def __init__(self, download_status_manager):
        self.download_status_manager = download_status_manager
        self.tasks = collections.deque()

        # Guards every access to the list of worker threads
        self.worker_threads_access = threading.RLock()
        self.worker_threads = []

    def __exit_callback(self, worker_thread):
        # Handed to each worker; drops the worker from the list when it ends
        with self.worker_threads_access:
            self.worker_threads.remove(worker_thread)

    def spawn_and_retire_threads(self, request_new_thread=False):
        """Retire surplus workers and, if requested and allowed, start one."""
        with self.worker_threads_access:
            workers = self.worker_threads

            over_limit = (len(workers) > gl.config.max_downloads and
                    gl.config.max_downloads_enabled)
            if over_limit:
                # Tell the excessive amount of oldest worker threads to quit, but keep at least one
                surplus = min(len(workers) - 1,
                        len(workers) - gl.config.max_downloads)
                for old_worker in workers[:surplus]:
                    old_worker.stop_accepting_tasks()

            if not request_new_thread:
                return

            may_spawn = (len(workers) == 0 or
                    len(workers) < gl.config.max_downloads or
                    not gl.config.max_downloads_enabled)
            if may_spawn:
                # We have to create a new thread here, there's work to do
                log('I am going to spawn a new worker thread.', sender=self)
                new_worker = DownloadQueueWorker(self.tasks, self.__exit_callback)
                workers.append(new_worker)
                new_worker.start()

    def add_resumed_task(self, task):
        """Simply add the task without starting the download"""
        self.download_status_manager.register_task(task)

    def add_task(self, task):
        """Queue the task for download and make sure a worker picks it up."""
        if task.status != DownloadTask.INIT:
            # This task is old so update episode from db
            task.episode.reload_from_db()
        else:
            # This task is fresh, so add it to our status manager
            self.download_status_manager.register_task(task)

        task.status = DownloadTask.QUEUED
        self.tasks.appendleft(task)
        self.spawn_and_retire_threads(request_new_thread=True)
class DownloadTask(object):
    """An object representing the download task of an episode

    You can create a new download task like this:

        task = DownloadTask(episode)
        task.status = DownloadTask.QUEUED
        task.run()

    While the download is in progress, you can access its properties:

        task.total_size       # in bytes
        task.progress         # from 0.0 to 1.0
        task.speed            # in bytes per second
        str(task)             # name of the episode
        task.status           # current status
        task.status_changed   # True if the status has been changed

    You can cancel a running download task by setting its status:

        task.status = DownloadTask.CANCELLED

    The task will then abort as soon as possible (due to the nature
    of downloading data, this can take a while when the Internet is
    busy).

    The "status_changed" attribute gets set to True everytime the
    "status" attribute changes its value. After you get the value of
    the "status_changed" attribute, it is always reset to False:

        if task.status_changed:
            new_status = task.status
            # .. update the UI accordingly ..

    Obviously, this also means that you must have at most *one*
    place in your UI code where you check for status changes and
    broadcast the status updates from there.

    While the download is taking place and after the .run() method
    has finished, you can get the final status to check if the download
    was successful:

        if task.status == DownloadTask.DONE:
            # .. everything ok ..
        elif task.status == DownloadTask.FAILED:
            # .. an error happened, and the
            #    error_message attribute is set ..
            print task.error_message
        elif task.status == DownloadTask.PAUSED:
            # .. user paused the download ..
        elif task.status == DownloadTask.CANCELLED:
            # .. user cancelled the download ..

    The difference between cancelling and pausing a DownloadTask is
    that the temporary file gets deleted when cancelling, but does
    not get deleted when pausing.

    Be sure to call .removed_from_list() on this task when removing
    it from the UI, so that it can carry out any pending clean-up
    actions (e.g. removing the temporary file when the task has not
    finished successfully; i.e. task.status != DownloadTask.DONE).
    """
    # Possible states this download task can be in
    # (STATUS_MESSAGE is ordered like the status constants below it)
    STATUS_MESSAGE = (_('Added'), _('Queued'), _('Downloading'),
            _('Finished'), _('Failed'), _('Cancelled'), _('Paused'))
    (INIT, QUEUED, DOWNLOADING, DONE, FAILED, CANCELLED, PAUSED) = range(7)

    def __str__(self):
        """Return the title of the episode."""
        return self.__episode.title

    def __get_status(self):
        return self.__status

    def __set_status(self, status):
        # Only flag a change when the value really differs
        if status != self.__status:
            self.__status_changed = True
            self.__status = status

    status = property(fget=__get_status, fset=__set_status)

    def __get_status_changed(self):
        # Reading the flag resets it (see class docstring)
        if self.__status_changed:
            self.__status_changed = False
            return True
        else:
            return False

    status_changed = property(fget=__get_status_changed)

    def __get_url(self):
        return self.__episode.url

    url = property(fget=__get_url)

    def __get_episode(self):
        return self.__episode

    episode = property(fget=__get_episode)

    def removed_from_list(self):
        """Delete the temporary file unless the download has finished."""
        if self.status != self.DONE:
            util.delete_file(self.tempname)

    def __init__(self, episode):
        """Prepare a task for episode; the initial status is INIT.

        Besides setting up the attributes this asks the episode for its
        local filename (create=True), commits the database and creates
        an empty ".partial" file if none exists yet.
        """
        self.__status = DownloadTask.INIT
        self.__status_changed = True
        self.__episode = episode

        # Create the target filename and save it in the database
        self.filename = self.__episode.local_filename(create=True)
        self.tempname = self.filename + '.partial'
        db.commit()

        self.total_size = self.__episode.length
        self.speed = 0.0
        self.progress = 0.0
        self.error_message = None

        # Variables for speed limit and speed calculation
        self.__start_time = 0
        self.__start_blocks = 0
        self.__limit_rate_value = gl.config.limit_rate_value
        self.__limit_rate = gl.config.limit_rate

        # If the tempname already exists, set progress accordingly
        if os.path.exists(self.tempname):
            try:
                already_downloaded = os.path.getsize(self.tempname)
                if self.total_size > 0:
                    self.progress = max(0.0, min(1.0, float(already_downloaded)/self.total_size))
            except OSError, os_error:
                log('Error while getting size for existing file: %s', os_error, sender=self)
        else:
            # "touch self.tempname", so we also get partial
            # files for resuming when the file is queued
            open(self.tempname, 'w').close()

    def status_updated(self, count, blockSize, totalSize):
        """Progress hook passed to retrieve_resume() as "reporthook".

        Updates total_size, progress and speed. Raises
        DownloadCancelledException when the status has been set to
        CANCELLED or PAUSED, which is how a running transfer is aborted.
        """
        # We see a different "total size" while downloading,
        # so correct the total size variable in the thread
        if totalSize != self.total_size and totalSize > 0:
            self.total_size = float(totalSize)

        if self.total_size > 0:
            self.progress = max(0.0, min(1.0, float(count*blockSize)/self.total_size))

        self.calculate_speed(count, blockSize)

        if self.status == DownloadTask.CANCELLED:
            raise DownloadCancelledException()

        if self.status == DownloadTask.PAUSED:
            raise DownloadCancelledException()

    def calculate_speed(self, count, blockSize):
        """Update self.speed and sleep if the configured rate limit is exceeded.

        Only every fifth block (count % 5 == 0) triggers a measurement.
        The speed is computed from the blocks received since the stored
        start time / start block count.
        """
        if count % 5 == 0:
            now = time.time()
            if self.__start_time > 0:
                # Has rate limiting been enabled or disabled?
                if self.__limit_rate != gl.config.limit_rate:
                    # If it has been enabled then reset base time and block count
                    if gl.config.limit_rate:
                        self.__start_time = now
                        self.__start_blocks = count
                    self.__limit_rate = gl.config.limit_rate

                # Has the rate been changed and are we currently limiting?
                if self.__limit_rate_value != gl.config.limit_rate_value and self.__limit_rate:
                    self.__start_time = now
                    self.__start_blocks = count
                    self.__limit_rate_value = gl.config.limit_rate_value

                passed = now - self.__start_time
                if passed > 0:
                    speed = ((count-self.__start_blocks)*blockSize)/passed
                else:
                    speed = 0
            else:
                # First measurement after run() reset the start time
                self.__start_time = now
                self.__start_blocks = count
                passed = now - self.__start_time
                speed = count*blockSize

            self.speed = float(speed)

            # NOTE(review): "speed" is in bytes per second here, while the
            # delay computation below multiplies limit_rate_value by 1024;
            # the comparison looks unit-mismatched. The inner
            # "should_have_passed > passed" check still decides whether
            # to sleep - confirm the intended unit.
            if gl.config.limit_rate and speed > gl.config.limit_rate_value:
                # calculate the time that should have passed to reach
                # the desired download rate and wait if necessary
                should_have_passed = float((count-self.__start_blocks)*blockSize)/(gl.config.limit_rate_value*1024.0)
                if should_have_passed > passed:
                    # sleep a maximum of 10 seconds to not cause time-outs
                    delay = min(10.0, float(should_have_passed-passed))
                    time.sleep(delay)

    def run(self):
        """Carry out the download; return True on success, False otherwise.

        Nothing is downloaded unless the status is QUEUED. On success the
        temporary file is moved to the final filename and the status is
        set to DONE. On errors the status is set to FAILED and
        error_message is filled in. A cancelled download deletes the
        temporary file, a paused one keeps it.
        """
        # Speed calculation (re-)starts here
        self.__start_time = 0
        self.__start_blocks = 0

        # If the download has already been cancelled, skip it
        if self.status == DownloadTask.CANCELLED:
            util.delete_file(self.tempname)
            return False

        # We only start this download if its status is "queued"
        if self.status != DownloadTask.QUEUED:
            return False

        # We are downloading this file right now
        self.status = DownloadTask.DOWNLOADING

        try:
            # Resolve URL and start downloading the episode
            url = resolver.get_real_download_url(self.__episode.url)
            downloader = DownloadURLOpener(self.__episode.channel)
            (unused, headers) = downloader.retrieve_resume(url,
                    self.tempname, reporthook=self.status_updated)

            new_mimetype = headers.get('content-type', self.__episode.mimetype)
            old_mimetype = self.__episode.mimetype
            if new_mimetype != old_mimetype:
                log('Correcting mime type: %s => %s', old_mimetype, new_mimetype, sender=self)
                old_extension = self.__episode.extension()
                self.__episode.mimetype = new_mimetype
                new_extension = self.__episode.extension()

                # If the desired filename extension changed due to the new mimetype,
                # we force an update of the local filename to fix the extension
                if old_extension != new_extension:
                    self.filename = self.__episode.local_filename(create=True, force_update=True)

            shutil.move(self.tempname, self.filename)

            # Get the _real_ filesize once we actually have the file
            self.__episode.length = os.path.getsize(self.filename)
            self.__episode.channel.addDownloadedItem(self.__episode)

            # If a user command has been defined, execute the command setting some environment variables
            if len(gl.config.cmd_download_complete) > 0:
                os.environ["GPODDER_EPISODE_URL"]=self.__episode.url or ''
                os.environ["GPODDER_EPISODE_TITLE"]=self.__episode.title or ''
                os.environ["GPODDER_EPISODE_FILENAME"]=self.filename or ''
                os.environ["GPODDER_EPISODE_PUBDATE"]=str(int(self.__episode.pubDate))
                os.environ["GPODDER_EPISODE_LINK"]=self.__episode.link or ''
                os.environ["GPODDER_EPISODE_DESC"]=self.__episode.description or ''
                util.run_external_command(gl.config.cmd_download_complete)
        except DownloadCancelledException:
            # Raised by status_updated() for both CANCELLED and PAUSED
            log('Download has been cancelled/paused: %s', self, sender=self)
            if self.status == DownloadTask.CANCELLED:
                util.delete_file(self.tempname)
                self.progress = 0.0
            self.speed = 0.0
        except IOError, ioe:
            # NOTE(review): strerror/filename may be None for IOErrors
            # created with a single message argument - confirm whether
            # the resulting message is acceptable.
            log( 'Error "%s" while downloading "%s": %s', ioe.strerror, self.__episode.title, ioe.filename, sender=self)
            self.status = DownloadTask.FAILED
            self.error_message = _('I/O Error: %s: %s') % (ioe.strerror, ioe.filename)
        except gPodderDownloadHTTPError, gdhe:
            log( 'HTTP error %s while downloading "%s": %s', gdhe.error_code, self.__episode.title, gdhe.error_message, sender=self)
            self.status = DownloadTask.FAILED
            self.error_message = _('HTTP Error %s: %s') % (gdhe.error_code, gdhe.error_message)
        except Exception, e:
            # NOTE(review): this branch does not log, and "e.message" can
            # be empty for exceptions created with several arguments.
            self.status = DownloadTask.FAILED
            self.error_message = _('Error: %s') % (e.message,)

        # Still DOWNLOADING means no handler above changed the status
        if self.status == DownloadTask.DOWNLOADING:
            # Everything went well - we're done
            self.status = DownloadTask.DONE
            self.progress = 1.0
            return True

        self.speed = 0.0

        # We finished, but not successfully (at least not really)
        return False