mygpo/data/feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime
from itertools import chain, islice
import socket

from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from feedservice.parse.vimeo import VimeoError
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)
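
    # Usage sketch (the feed URL below is hypothetical): update_queue is a
    # generator, so podcasts are only fetched and updated while the caller
    # consumes it, e.g.
    #
    #   updater = PodcastUpdater()
    #   for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #       logger.info('updated %r', podcast)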

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error, urllib2.HTTPError) as ex:
            # TODO: catch ValueError (for invalid IPv6 in feedservice)

            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=podcast_url)
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            except Podcast.DoesNotExist:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = Podcast.objects.get_or_create_for_url(podcast_url)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        # temporarily lower the global socket timeout for the fetch; restore
        # the previous value even if parsing raises
        t = socket.getdefaulttimeout()
        socket.setdefaulttimeout(10)
        try:
            return parse_feed(podcast_url, text_processor=ConvertMarkdown())
        finally:
            socket.setdefaulttimeout(t)

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp
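
        # each "parsed.X or podcast.X" fallback below keeps the previously
        # stored value whenever the feed omits a field, so a sparse parse
        # result does not erase existing metadata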
        podcast.title = parsed.title or podcast.title
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                          parsed.flattr or podcast.flattr_url)
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        podcast.add_missing_urls(parsed.urls)

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
                if new_podcast != podcast:
                    self._mark_outdated(podcast, 'redirected to different podcast')
                    return

            except Podcast.DoesNotExist:
                podcast.urls.insert(0, podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(podcast=podcast, released__isnull=False).order_by('released')

        podcast.update_interval = get_update_interval(episodes)

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        podcast.episode_count = Episode.objects.filter(podcast=podcast).count()

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warn('subscribing to hub failed: %s', str(se))

        if not podcast.slug:
            slug = PodcastSlug(podcast).get_slug()
            if slug:
                podcast.add_slug(slug)

        assign_missing_episode_slugs(podcast)
        update_related_podcasts.delay(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        from datetime import timedelta

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # the episodes that have been updated from the parsed feed
        updated_episodes = []
        episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
        logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                    len(episodes_to_update))

        logger.info('Updating %d episodes', len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))

            episode = Episode.objects.get_or_create_for_url(podcast, url)

            update_episode(parsed, episode, podcast)
            updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=podcast,
                                                  outdated=False)[:500]
        outdated_episodes = set(current_episodes) - set(updated_episodes)

        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        for episode in outdated_episodes:
            mark_outdated(episode)

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art (binary mode, as logos are image data)
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error, IOError) as e:
            logger.warn('Exception while updating podcast logo: %s', str(e))

    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
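
# Illustrative note (hypothetical data): for an episode whose files are
# [F(urls=[]), F(urls=['http://example.com/e1.mp3'])], get_episode_url
# returns 'http://example.com/e1.mp3', the first URL of the first file
# that carries any URLs, and None when no file has a URL at all.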


def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = parsed_episode.guid or episode.guid
    episode.description = parsed_episode.description or episode.description
    episode.subtitle = parsed_episode.subtitle or episode.subtitle
    episode.content = parsed_episode.content or parsed_episode.description or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.link or episode.link)
    episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released
    episode.author = parsed_episode.author or episode.author
    episode.duration = parsed_episode.duration or episode.duration
    episode.filesize = parsed_episode.files[0].filesize
    episode.language = parsed_episode.language or episode.language or \
        podcast.language
    episode.mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.flattr or
                                      episode.flattr_url)
    episode.license = parsed_episode.license or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.title or episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
    episode.add_missing_urls(parsed_urls)


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
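
# Worked example (hypothetical numbers): 12 episodes whose earliest release
# was 30 days ago span 720 hours, so the raw interval is 720 / 12 = 60h,
# which is then clamped into [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].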


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name