[Feeds] update / fix feed-downloader
[mygpo.git] / mygpo/data/feeddownloader.py
blob dfda3e8002225cdc6f9b2ca59e552b00805844f9
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.

import os.path
import urllib2
import httplib
import hashlib
import socket
from datetime import datetime, timedelta
from itertools import chain, islice

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from feedservice.parse.vimeo import VimeoError
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)

            except Exception:
                logger.exception('Error while updating podcast "%s"',
                                 podcast_url)
                raise
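
    # A minimal usage sketch (an illustration, not part of the original
    # module); the queue normally comes from the feed-update tasks, and the
    # URL below is hypothetical:
    #
    #   updater = PodcastUpdater()
    #   for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #       logger.info('updated %s', podcast)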

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error,
                urllib2.HTTPError) as ex:
            # ValueError is raised by the feedservice for invalid IPv6 URLs

            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=podcast_url)
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            except Podcast.DoesNotExist:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = Podcast.objects.get_or_create_for_url(podcast_url)
        episodes = self._update_episodes(p, parsed.episodes)
        max_episode_order = self._order_episodes(p)
        self._update_podcast(p, parsed, episodes, max_episode_order)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        # temporarily lower the socket timeout so that a stalled feed server
        # cannot block the updater; restore the previous value afterwards
        timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(10)
        try:
            return parse_feed(podcast_url, text_processor=ConvertMarkdown())
        finally:
            socket.setdefaulttimeout(timeout)

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed """

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    def _update_podcast(self, podcast, parsed, episodes, max_episode_order):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        podcast.title = parsed.title or podcast.title
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = to_maxlength(Podcast, 'author',
                                      parsed.author or podcast.author)
        podcast.language = to_maxlength(Podcast, 'language',
                                        parsed.language or podcast.language)
        podcast.content_types = ','.join(parsed.content_types) or \
            podcast.content_types
        #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = to_maxlength(
            Podcast, 'common_episode_title',
            parsed.common_title or podcast.common_episode_title)
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                          parsed.flattr or podcast.flattr_url)
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license
        podcast.max_episode_order = max_episode_order

        podcast.add_missing_urls(parsed.urls)

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(
                    urls__url=podcast.new_location)
                if new_podcast != podcast:
                    self._mark_outdated(podcast,
                                        'redirected to different podcast')
                    return
            except Podcast.DoesNotExist:
                podcast.set_url(podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(podcast=podcast,
                                          released__isnull=False)\
                                  .order_by('released')

        podcast.update_interval = get_update_interval(episodes)

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        podcast.episode_count = Episode.objects.filter(podcast=podcast)\
                                               .count()

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warning('subscribing to hub failed: %s', str(se))

        if not podcast.slug:
            slug = PodcastSlug(podcast).get_slug()
            if slug:
                podcast.add_slug(slug)

        assign_missing_episode_slugs(podcast)
        update_related_podcasts.delay(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # episodes that were successfully updated from the parsed feed
        updated_episodes = []
        # materialize the parsed episodes; they may arrive as an iterator,
        # which would be consumed by islice() and has no len()
        parsed_episodes = list(parsed_episodes)
        episodes_to_update = list(islice(parsed_episodes, 0,
                                         MAX_EPISODES_UPDATE))
        logger.info('Parsed %d episodes, updating %d',
                    len(parsed_episodes), len(episodes_to_update))

        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n,
                        len(episodes_to_update))

            episode = Episode.objects.get_or_create_for_url(podcast, url)

            update_episode(parsed, episode, podcast)
            updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=podcast,
                                                  outdated=False)[:500]
        outdated_episodes = set(current_episodes) - set(updated_episodes)

        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        for episode in outdated_episodes:
            mark_outdated(episode)

        return updated_episodes

    @transaction.atomic
    def _order_episodes(self, podcast):
        """ Reorder the podcast's episodes according to release timestamp

        Returns the highest order value (corresponding to the most recent
        episode) """

        num_episodes = podcast.episode_set.count()
        if not num_episodes:
            return 0

        episodes = podcast.episode_set.all().extra(select={
                'has_released': 'released IS NOT NULL',
            })\
            .order_by('-has_released', '-released', 'pk')\
            .only('pk', 'order')

        for n, episode in enumerate(episodes.iterator(), 1):
            # assign ``order`` from higher (most recent) to 0 (oldest)
            # None means "unknown"
            new_order = num_episodes - n

            # optimize for new episodes that are newer than all existing
            if episode.order == new_order:
                continue

            logger.info('Updating order from %s to %s',
                        episode.order, new_order)
            episode.order = new_order
            episode.save()

        return num_episodes - 1
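
    # A worked example (sketch): for three episodes released on 2014-01-03,
    # 2014-01-02 and 2014-01-01, the loop above assigns the order values 2, 1
    # and 0 respectively (episodes without a release date sort last), and
    # _order_episodes() returns 2.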

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art (binary mode; the logo is image data)
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix,
                                                              filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error, IOError) as e:
            logger.warning('Exception while updating podcast logo: %s',
                           str(e))

    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the first URL of the parsed episode's files """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
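
# A worked example (sketch; the file objects come from the feedservice
# parser): if an episode's files are [<file without URLs>, <file with
# urls=['http://example.com/ep.mp3']>], get_episode_url() skips the first
# file and returns 'http://example.com/ep.mp3'; if no file carries a URL,
# it returns None.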


def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid',
                                parsed_episode.guid or episode.guid)
    episode.description = parsed_episode.description or episode.description
    episode.subtitle = parsed_episode.subtitle or episode.subtitle
    episode.content = parsed_episode.content or \
        parsed_episode.description or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.link or episode.link)
    episode.released = datetime.utcfromtimestamp(parsed_episode.released) \
        if parsed_episode.released else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.author or episode.author)
    episode.duration = parsed_episode.duration or episode.duration
    # files[0] is guaranteed to exist here: get_episode_url() only returned
    # a URL because at least one file carries one
    episode.filesize = parsed_episode.files[0].filesize
    episode.language = parsed_episode.language or episode.language or \
        podcast.language
    mimetypes = [f.mimetype for f in parsed_episode.files]
    episode.mimetypes = ','.join(list(set(filter(None, mimetypes))))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.flattr or
                                      episode.flattr_url)
    episode.license = parsed_episode.license or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.title or episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.urls for f in parsed_episode.files))
    episode.add_missing_urls(parsed_urls)
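
# The assignments in update_episode() follow a "parsed or existing" pattern:
# a value found in the feed wins, while a missing (falsy) value keeps the
# stored one; e.g. (None or 'old title') evaluates to 'old title', so an
# episode whose feed entry lost its title keeps the previously stored title.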


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # clamp the interval to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL]
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
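
# A worked example (sketch): 10 episodes whose earliest release lies 30 days
# (720 hours) in the past yield an interval of 720 / 10 = 72 hours, which is
# then clamped to the [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL] range.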


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name
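

if __name__ == '__main__':
    # Convenience sketch (not part of the original module): run the doctests
    # above when this file is executed directly.
    import doctest
    doctest.testmod()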