[Feeds] set slug only if it's non-empty
[mygpo.git] / mygpo / data / feeddownloader.py
blob11bb2da5dd9c1868ae6e2a3a0d6c0671de1579a6
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain, islice
26 import socket
28 from django.conf import settings
30 from mygpo.podcasts.models import Podcast, URL, Slug, Episode
31 from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
32 from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
33 MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
34 from feedservice.parse import parse_feed, FetchFeedException
35 from feedservice.parse.text import ConvertMarkdown
36 from feedservice.parse.models import ParserException
37 from feedservice.parse.vimeo import VimeoError
38 from mygpo.utils import file_hash
39 from mygpo.web.logo import CoverArt
40 from mygpo.data.podcast import subscribe_at_hub
41 from mygpo.pubsub.models import SubscriptionError
42 from mygpo.directory.tags import update_category
44 import logging
45 logger = logging.getLogger(__name__)
# upper bound on the number of episodes processed per feed update
MAX_EPISODES_UPDATE = 200
class NoPodcastCreated(Exception):
    """ Raised when no podcast object could be created for a new URL. """
class NoEpisodesException(Exception):
    """ Raised when parsed feed data does not contain any episodes. """
class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable

        Yields the updated Podcast objects.  URLs for which no podcast
        could be created are logged and skipped. """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL

        Returns the Podcast object.  Raises NoPodcastCreated when the
        feed could not be fetched/parsed and no podcast exists yet for
        the URL. """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error, urllib2.HTTPError) as ex:
            #TODO: catch valueError (for invalid Ipv6 in feedservice)

            # Vimeo failures get a full traceback for debugging
            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=podcast_url)
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            except Podcast.DoesNotExist:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = Podcast.objects.get_or_create_for_url(podcast_url)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        """ Check that the URL points to a parseable feed with episodes

        Raises on failure; returns True otherwise. """
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        """ Fetch and parse the feed with a temporary 10s socket timeout """
        t = socket.getdefaulttimeout()
        socket.setdefaulttimeout(10)
        try:
            return parse_feed(podcast_url, text_processor=ConvertMarkdown())
        finally:
            # BUGFIX: restore the previous default timeout; this statement
            # used to sit after the return and was unreachable
            socket.setdefaulttimeout(t)

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results

        NOTE(review): the `episodes` argument is shadowed by a fresh
        queryset below; it is kept for interface compatibility. """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # keep the previous value whenever the feed omits a field
        podcast.title = parsed.title or podcast.title
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        podcast.add_missing_urls(parsed.urls)

        # a feed-level redirect: if the target already belongs to another
        # podcast, this one is outdated; otherwise adopt the new URL
        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
                if new_podcast != podcast:
                    self._mark_outdated(podcast, 'redirected to different podcast')
                    return
            except Podcast.DoesNotExist:
                podcast.urls.insert(0, podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(podcast=podcast, released__isnull=False).order_by('released')

        podcast.update_interval = get_update_interval(episodes)

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        podcast.episode_count = Episode.objects.filter(podcast=podcast).count()

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes) because
        # we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warn('subscribing to hub failed: %s', str(se))

        # set slug only if it is non-empty
        if not podcast.slug:
            slug = PodcastSlug(podcast).get_slug()
            if slug:
                podcast.add_slug(slug)

        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        from datetime import timedelta

        # an episode more than a day in the future is considered bogus
        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):
        """ Create/update episodes from the parsed feed

        At most MAX_EPISODES_UPDATE episodes are processed; known
        episodes that did not appear in the feed are marked outdated.
        Returns the list of updated Episode objects. """

        updated_episodes = []
        episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
        logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                    len(episodes_to_update))

        logger.info('Updating %d episodes', len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))
            episode = Episode.objects.get_or_create_for_url(podcast, url)

            update_episode(parsed, episode, podcast)
            updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=podcast,
                                                  outdated=False)[:500]
        outdated_episodes = set(current_episodes) - set(updated_episodes)

        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        for episode in outdated_episodes:
            mark_outdated(episode)

        # BUGFIX: update() assigns this return value and passes it on to
        # _update_podcast(); previously the method implicitly returned None
        return updated_episodes

    def _save_podcast_logo(self, cover_art):
        """ Download and store the podcast logo

        Returns the cover_art URL on success, None on any failure, so the
        caller can reset podcast.logo_url when the logo is unavailable. """
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error, IOError) as e:
            # best-effort: a missing logo must not abort the feed update
            logger.warn('Exception while updating podcast logo: %s', str(e))

    def _mark_outdated(self, podcast, msg=''):
        """ Mark the podcast as outdated and outdate all of its episodes """
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        # an empty episode list outdates every current episode
        self._update_episodes(podcast, [])
def get_episode_url(parsed_episode):
    """ Return the first URL of a parsed episode, or None if it has none """
    candidates = (f.urls[0] for f in parsed_episode.files if f.urls)
    return next(candidates, None)
def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    # for each field, keep the stored value when the feed omits it
    episode.guid = parsed_episode.guid or episode.guid
    episode.description = parsed_episode.description or episode.description
    episode.subtitle = parsed_episode.subtitle or episode.subtitle
    episode.content = parsed_episode.content or parsed_episode.description or episode.content
    episode.link = parsed_episode.link or episode.link
    # released is a Unix timestamp (epoch seconds) from the parser
    episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released
    episode.author = parsed_episode.author or episode.author
    episode.duration = parsed_episode.duration or episode.duration
    # NOTE(review): assumes parsed_episode.files is non-empty; callers reach
    # here only after get_episode_url() found a file with a URL — verify
    episode.filesize = parsed_episode.files[0].filesize
    # fall back to the podcast's language when neither value is set
    episode.language = parsed_episode.language or episode.language or \
        podcast.language
    # de-duplicated, None-free list of the files' mimetypes
    episode.mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
    episode.flattr_url = parsed_episode.flattr or episode.flattr_url
    episode.license = parsed_episode.license or episode.license

    # last resort: derive a title from the media file's basename
    episode.title = parsed_episode.title or episode.title or \
        file_basename_no_extension(episode.url)

    episode.last_update = datetime.utcnow()
    episode.save()

    # attach any URLs from the feed that the episode does not know yet;
    # done after save() so the episode exists for the URL relation
    parsed_urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
    episode.add_missing_urls(parsed_urls)
def mark_outdated(obj):
    """ Flag obj as outdated and save it, unless it already is outdated """
    if not obj.outdated:
        obj.outdated = True
        obj.last_update = datetime.utcnow()
        obj.save()
def get_update_interval(episodes):
    """ Calculate the average interval (in hours) between new episodes

    Falls back to DEFAULT_UPDATE_INTERVAL when there are no episodes; the
    result is clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL]. """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    # span from the oldest episode (list is ordered by release date) to now
    oldest = episodes[0]
    timespan_h = (datetime.utcnow() - oldest.released).total_seconds() / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    return min(max(interval, MIN_UPDATE_INTERVAL), MAX_UPDATE_INTERVAL)
def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    # BUGFIX: the docstring was not closed, turning the code below into
    # docstring text and making the module unparseable
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name