[Feeds] update / fix feed-downloader
[mygpo.git] / mygpo/data/feeddownloader.py
blob dfda3e8002225cdc6f9b2ca59e552b00805844f9
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.

import os.path
import urllib2
import httplib
import hashlib
import socket
from datetime import datetime, timedelta
from itertools import chain, islice

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from feedservice.parse.vimeo import VimeoError
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)

            except Exception:
                logger.exception('Error while updating podcast "%s"',
                                 podcast_url)
                raise
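
    # A minimal usage sketch (an illustration, not part of the original
    # module); the queue normally comes from the feed-update tasks, and the
    # URL below is hypothetical:
    #
    #   updater = PodcastUpdater()
    #   for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #       logger.info('updated %s', podcast)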

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error,
                urllib2.HTTPError) as ex:
            # ValueError is raised by the feedservice for invalid IPv6 URLs

            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=podcast_url)
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            except Podcast.DoesNotExist:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = Podcast.objects.get_or_create_for_url(podcast_url)
        episodes = self._update_episodes(p, parsed.episodes)
        max_episode_order = self._order_episodes(p)
        self._update_podcast(p, parsed, episodes, max_episode_order)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        # temporarily lower the socket timeout so that a stalled feed server
        # cannot block the updater; restore the previous value afterwards
        timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(10)
        try:
            return parse_feed(podcast_url, text_processor=ConvertMarkdown())
        finally:
            socket.setdefaulttimeout(timeout)

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed """

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    def _update_podcast(self, podcast, parsed, episodes, max_episode_order):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        podcast.title = parsed.title or podcast.title
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = to_maxlength(Podcast, 'author',
                                      parsed.author or podcast.author)
        podcast.language = to_maxlength(Podcast, 'language',
                                        parsed.language or podcast.language)
        podcast.content_types = ','.join(parsed.content_types) or \
            podcast.content_types
        #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = to_maxlength(
            Podcast, 'common_episode_title',
            parsed.common_title or podcast.common_episode_title)
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                          parsed.flattr or podcast.flattr_url)
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license
        podcast.max_episode_order = max_episode_order

        podcast.add_missing_urls(parsed.urls)

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(
                    urls__url=podcast.new_location)
                if new_podcast != podcast:
                    self._mark_outdated(podcast,
                                        'redirected to different podcast')
                    return
            except Podcast.DoesNotExist:
                podcast.set_url(podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(podcast=podcast,
                                          released__isnull=False)\
                                  .order_by('released')

        podcast.update_interval = get_update_interval(episodes)

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        podcast.episode_count = Episode.objects.filter(podcast=podcast)\
                                               .count()

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warning('subscribing to hub failed: %s', str(se))

        if not podcast.slug:
            slug = PodcastSlug(podcast).get_slug()
            if slug:
                podcast.add_slug(slug)

        assign_missing_episode_slugs(podcast)
        update_related_podcasts.delay(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # episodes that were successfully updated from the parsed feed
        updated_episodes = []
        # materialize the parsed episodes; they may arrive as an iterator,
        # which would be consumed by islice() and has no len()
        parsed_episodes = list(parsed_episodes)
        episodes_to_update = list(islice(parsed_episodes, 0,
                                         MAX_EPISODES_UPDATE))
        logger.info('Parsed %d episodes, updating %d',
                    len(parsed_episodes), len(episodes_to_update))

        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n,
                        len(episodes_to_update))

            episode = Episode.objects.get_or_create_for_url(podcast, url)

            update_episode(parsed, episode, podcast)
            updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=podcast,
                                                  outdated=False)[:500]
        outdated_episodes = set(current_episodes) - set(updated_episodes)

        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        for episode in outdated_episodes:
            mark_outdated(episode)

        return updated_episodes

    @transaction.atomic
    def _order_episodes(self, podcast):
        """ Reorder the podcast's episodes according to release timestamp

        Returns the highest order value (corresponding to the most recent
        episode) """

        num_episodes = podcast.episode_set.count()
        if not num_episodes:
            return 0

        episodes = podcast.episode_set.all().extra(select={
                'has_released': 'released IS NOT NULL',
            })\
            .order_by('-has_released', '-released', 'pk')\
            .only('pk', 'order')

        for n, episode in enumerate(episodes.iterator(), 1):
            # assign ``order`` from higher (most recent) to 0 (oldest)
            # None means "unknown"
            new_order = num_episodes - n

            # optimize for new episodes that are newer than all existing
            if episode.order == new_order:
                continue

            logger.info('Updating order from %s to %s',
                        episode.order, new_order)
            episode.order = new_order
            episode.save()

        return num_episodes - 1
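
    # A worked example (sketch): for three episodes released on 2014-01-03,
    # 2014-01-02 and 2014-01-01, the loop above assigns the order values 2, 1
    # and 0 respectively (episodes without a release date sort last), and
    # _order_episodes() returns 2.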

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art (binary mode; the logo is image data)
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix,
                                                              filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error, IOError) as e:
            logger.warning('Exception while updating podcast logo: %s',
                           str(e))

    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the first URL of the parsed episode's files """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
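
# A worked example (sketch; the file objects come from the feedservice
# parser): if an episode's files are [<file without URLs>, <file with
# urls=['http://example.com/ep.mp3']>], get_episode_url() skips the first
# file and returns 'http://example.com/ep.mp3'; if no file carries a URL,
# it returns None.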


def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid',
                                parsed_episode.guid or episode.guid)
    episode.description = parsed_episode.description or episode.description
    episode.subtitle = parsed_episode.subtitle or episode.subtitle
    episode.content = parsed_episode.content or \
        parsed_episode.description or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.link or episode.link)
    episode.released = datetime.utcfromtimestamp(parsed_episode.released) \
        if parsed_episode.released else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.author or episode.author)
    episode.duration = parsed_episode.duration or episode.duration
    # files[0] is guaranteed to exist here: get_episode_url() only returned
    # a URL because at least one file carries one
    episode.filesize = parsed_episode.files[0].filesize
    episode.language = parsed_episode.language or episode.language or \
        podcast.language
    mimetypes = [f.mimetype for f in parsed_episode.files]
    episode.mimetypes = ','.join(list(set(filter(None, mimetypes))))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.flattr or
                                      episode.flattr_url)
    episode.license = parsed_episode.license or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.title or episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.urls for f in parsed_episode.files))
    episode.add_missing_urls(parsed_urls)
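
# The assignments in update_episode() follow a "parsed or existing" pattern:
# a value found in the feed wins, while a missing (falsy) value keeps the
# stored one; e.g. (None or 'old title') evaluates to 'old title', so an
# episode whose feed entry lost its title keeps the previously stored title.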


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # clamp the interval to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL]
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
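
# A worked example (sketch): 10 episodes whose earliest release lies 30 days
# (720 hours) in the past yield an interval of 720 / 10 = 72 hours, which is
# then clamped to the [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL] range.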


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name
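

if __name__ == '__main__':
    # Convenience sketch (not part of the original module): run the doctests
    # above when this file is executed directly.
    import doctest
    doctest.testmod()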