[Feeds] fix updating of content_types field
[mygpo.git] / mygpo / data / feeddownloader.py
blob 4d23952f6c6891019f22144a9e6c1aa9d8c6b08e
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import os.path
import urllib2
from urlparse import urljoin
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise
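
# Hypothetical usage sketch: update_podcasts is a generator, so it only does
# work when consumed, e.g.
#
#   for podcast in update_podcasts(['http://example.com/feed.xml']):
#       print podcast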


def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        logger.exception('Error while fetching response from feedservice')
        return

    except NoEpisodesException as nee:
        logger.warn('No episodes found while parsing podcast')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p


def verify_podcast_url(podcast_url):
    parsed = _fetch_feed(podcast_url)
    _validate_parsed(parsed)
    return True


def _fetch_feed(podcast_url):
    params = {'url': podcast_url}
    headers = {
        'Accept': 'application/json',
    }
    # markdown and other parameters?
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)
    return r.json()[0]
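
# The feedservice is assumed to respond with a JSON list containing one
# object per parsed feed (hence the [0] above). The example below only
# illustrates the shape this module relies on; it is not an authoritative
# schema:
#
#   [{"title": "Example Podcast",
#     "content_types": ["audio"],
#     "episodes": [{"files": [{"urls": ["http://example.com/e1.mp3"],
#                              "mimetype": "audio/mpeg",
#                              "filesize": 123456}]}]}]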


def _validate_parsed(parsed):
    """ validates the parsed results and raises an exception if invalid

    feedparser parses pretty much everything. We reject anything that
    doesn't look like a feed"""

    if not parsed or not parsed.get('episodes', []):
        raise NoEpisodesException('no episodes found')
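
# Example: _validate_parsed({}) and _validate_parsed({'episodes': []}) both
# raise NoEpisodesException; any result with a non-empty 'episodes' list
# passes validation.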


def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
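    # content_types arrives from the feedservice as a list of strings
    # (e.g. ['audio']) while the model field stores a comma-separated
    # string, hence the join; guarding with ``or []`` assumes the key may
    # be missing or None in the parsed response.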
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
                            podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warn('subscribing to hub failed: %s', str(se))

    if not podcast.slug:
        slug = PodcastSlug(podcast).get_slug()
        if slug:
            podcast.add_slug(slug)

    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)


def _update_categories(podcast, prev_timestamp):
    """ checks some practical requirements and updates a category """

    max_timestamp = datetime.utcnow() + timedelta(days=1)

    # no episodes at all
    if not podcast.latest_episode_timestamp:
        return

    # no new episode
    if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
        return

    # too far in the future
    if podcast.latest_episode_timestamp > max_timestamp:
        return

    # not enough subscribers
    if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
        return

    update_category(podcast)


def _update_episodes(podcast, parsed_episodes):

    pid = podcast.get_id()

    # episodes that have been created or updated in this run
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(episodes_to_update))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

    return updated_episodes


@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode) """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
        })\
        .order_by('-has_released', '-released', 'pk')\
        .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1
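
# Example: for a podcast with three episodes released at t1 < t2 < t3, the
# newest episode gets order 2, the middle 1, the oldest 0, and
# _order_episodes() returns 2. Episodes without a release date sort last
# and therefore receive the lowest order values.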


def _save_podcast_logo(cover_art):
    if not cover_art:
        return

    try:
        image_sha1 = hashlib.sha1(cover_art).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file
        if os.path.exists(filename):
            with open(filename) as f:
                old_hash = file_hash(f).digest()
        else:
            old_hash = ''

        logger.info('Logo %s', cover_art)

        # save new cover art
        with open(filename, 'w') as fp:
            fp.write(urllib2.urlopen(cover_art).read())

        # get hash of new file
        with open(filename) as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for f in thumbnails:
                os.unlink(f)

        return cover_art

    except (urllib2.HTTPError, urllib2.URLError, ValueError,
            httplib.BadStatusLine, socket.error, IOError) as e:
        logger.warn('Exception while updating podcast logo: %s', str(e))


def _mark_outdated(podcast, msg=''):
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
    _update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.get('files', []):
        if f.get('urls', []):
            return f['urls'][0]
    return None
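
# Example: for a parsed episode such as
#   {'files': [{'urls': ['http://example.com/e1.mp3']}]}
# get_episode_url() returns 'http://example.com/e1.mp3'; it returns None
# if no file entry carries a URL.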


def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    episode.filesize = parsed_episode['files'][0]['filesize']
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    episode.mimetypes = ','.join(list(set(
        filter(None, [f['mimetype'] for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
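
# Worked example: 10 episodes whose earliest release was 30 days (720 hours)
# ago give int(720 / 10) = 72, i.e. a 72-hour update interval, which is then
# clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].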


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name