mygpo/data/feeddownloader.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3
   4 import os.path
   5 import urllib.request, urllib.error, urllib.parse
   6 from urllib.parse import urljoin
   7 import http.client
   8 import hashlib
   9 from datetime import datetime, timedelta
  10 from itertools import chain, islice
  11 import socket
  12 import requests
  13
  14 from django.db import transaction
  15 from django.conf import settings
  16
  17 from mygpo.podcasts.models import Podcast, Episode
  18 from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
  19 from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
  20     MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
  21 from mygpo.utils import file_hash, to_maxlength
  22 from mygpo.web.logo import CoverArt
  23 from mygpo.data.podcast import subscribe_at_hub
  24 from mygpo.data.tasks import update_related_podcasts
  25 from mygpo.pubsub.models import SubscriptionError
  26 from mygpo.directory.tags import update_category
  27
  28 import logging
  29 logger = logging.getLogger(__name__)
  30
  31 MAX_EPISODES_UPDATE = 200
  32
  33
  34 class UpdatePodcastException(Exception):
  35     pass
  36
  37
  38 class NoPodcastCreated(Exception):
  39     """ raised when no podcast obj was created for a new URL """
  40
  41
  42 class NoEpisodesException(Exception):
  43     """ raised when parsing something that doesn't contain any episodes """
  44
  45
  46 def update_podcasts(queue):
  47     """ Fetch data for the URLs supplied as the queue iterable """
  48
  49     for n, podcast_url in enumerate(queue, 1):
  50         logger.info('Update %d - %s', n, podcast_url)
  51         try:
  52             yield update_podcast(podcast_url)
  53
  54         except NoPodcastCreated as npc:
  55             logger.info('No podcast created: %s', npc)
  56
  57         except:
  58             logger.exception('Error while updating podcast "%s"',
  59                              podcast_url)
  60             raise
  61
  62
  63 def update_podcast(podcast_url):
  64     """ Update the podcast for the supplied URL """
  65
  66     try:
  67         parsed = _fetch_feed(podcast_url)
  68         _validate_parsed(parsed)
  69
  70     except requests.exceptions.RequestException as re:
  71         logging.exception('Error while fetching response from feedservice')
  72
  73         # if we fail to parse the URL, we don't even create the
  74         # podcast object
  75         try:
  76             p = Podcast.objects.get(urls__url=podcast_url)
  77             # if it exists already, we mark it as outdated
  78             _mark_outdated(p, 'error while fetching feed: %s' % str(re))
  79             p.last_update = datetime.utcnow()
  80             p.save()
  81             return p
  82
  83         except Podcast.DoesNotExist:
  84             raise NoPodcastCreated(re)
  85
  86     except NoEpisodesException as nee:
  87         logging.warn('No episode found while parsing podcast')
  88
  89         # if we fail to parse the URL, we don't even create the
  90         # podcast object
  91         try:
  92             p = Podcast.objects.get(urls__url=podcast_url)
  93             # if it exists already, we mark it as outdated
  94             _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
  95             return p
  96
  97         except Podcast.DoesNotExist:
  98             raise NoPodcastCreated(nee)
  99
 100     assert parsed, 'fetch_feed must return something'
 101     p = Podcast.objects.get_or_create_for_url(podcast_url)
 102     episodes = _update_episodes(p, parsed.get('episodes', []))
 103     p.refresh_from_db()
 104     p.episode_count = Episode.objects.filter(podcast=p).count()
 105     p.save()
 106     max_episode_order = _order_episodes(p)
 107     _update_podcast(p, parsed, episodes, max_episode_order)
 108     return p
 109
 110
 111 def verify_podcast_url(podcast_url):
 112     parsed = _fetch_feed(podcast_url)
 113     _validate_parsed(parsed)
 114     return True
 115
 116
 117 def _fetch_feed(podcast_url):
 118     params = {
 119         'url': podcast_url,
 120         'process_text': 'markdown',
 121     }
 122     headers = {
 123         'Accept': 'application/json',
 124     }
 125     url = urljoin(settings.FEEDSERVICE_URL, 'parse')
 126     r = requests.get(url, params=params, headers=headers, timeout=10)
 127
 128     if r.status_code != 200:
 129         logger.error('Feed-service status code for "%s" was %s', podcast_url,
 130                      r.status_code)
 131         return None
 132
 133     try:
 134         return r.json()[0]
 135     except ValueError:
 136         logger.exception(
 137             'Feed-service error while parsing response for url "%s": %s',
 138             podcast_url, r.text,
 139         )
 140         raise
 141
 142
 143 def _validate_parsed(parsed):
 144     """ validates the parsed results and raises an exception if invalid
 145
 146     feedparser parses pretty much everything. We reject anything that
 147     doesn't look like a feed"""
 148
 149     if not parsed or not parsed.get('episodes', []):
 150         raise NoEpisodesException('no episodes found')
 151
 152
 153 def _update_podcast(podcast, parsed, episodes, max_episode_order):
 154     """ updates a podcast according to new parser results """
 155
 156     # we need that later to decide if we can "bump" a category
 157     prev_latest_episode_timestamp = podcast.latest_episode_timestamp
 158
 159     podcast.title = parsed.get('title') or podcast.title
 160     podcast.description = parsed.get('description') or podcast.description
 161     podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
 162     podcast.link = parsed.get('link') or podcast.link
 163     podcast.logo_url = parsed.get('logo') or podcast.logo_url
 164     podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
 165                                   podcast.author)
 166     podcast.language = to_maxlength(Podcast, 'language',
 167                                     parsed.get('language') or podcast.language)
 168     podcast.content_types = ','.join(parsed.get('content_types')) or \
 169                                      podcast.content_types
 170     #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
 171     podcast.common_episode_title = to_maxlength(
 172         Podcast,
 173         'common_episode_title',
 174         parsed.get('common_title') or podcast.common_episode_title)
 175     podcast.new_location = parsed.get('new_location') or podcast.new_location
 176     podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
 177                                       parsed.get('flattr') or
 178                                       podcast.flattr_url)
 179     podcast.hub = parsed.get('hub') or podcast.hub
 180     podcast.license = parsed.get('license') or podcast.license
 181     podcast.max_episode_order = max_episode_order
 182
 183     podcast.add_missing_urls(parsed.get('urls', []))
 184
 185     if podcast.new_location:
 186         try:
 187             new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
 188             if new_podcast != podcast:
 189                 _mark_outdated(podcast, 'redirected to different podcast')
 190                 return
 191         except Podcast.DoesNotExist:
 192             podcast.set_url(podcast.new_location)
 193
 194     # latest episode timestamp
 195     episodes = Episode.objects.filter(podcast=podcast,
 196                                       released__isnull=False)\
 197                               .order_by('released')
 198
 199     podcast.update_interval = get_update_interval(episodes)
 200
 201     latest_episode = episodes.last()
 202     if latest_episode:
 203         podcast.latest_episode_timestamp = latest_episode.released
 204
 205     # podcast.episode_count is not update here on purpose. It is, instead,
 206     # continuously updated when creating new episodes in
 207     # EpisodeManager.get_or_create_for_url
 208
 209     _update_categories(podcast, prev_latest_episode_timestamp)
 210
 211     # try to download the logo and reset logo_url to None on http errors
 212     found = _save_podcast_logo(podcast.logo_url)
 213     if not found:
 214         podcast.logo_url = None
 215
 216     # The podcast is always saved (not just when there are changes) because
 217     # we need to record the last update
 218     logger.info('Saving podcast.')
 219     podcast.last_update = datetime.utcnow()
 220     podcast.save()
 221
 222     try:
 223         subscribe_at_hub(podcast)
 224     except SubscriptionError as se:
 225         logger.warn('subscribing to hub failed: %s', str(se))
 226
 227     assign_slug(podcast)
 228     assign_missing_episode_slugs(podcast)
 229     update_related_podcasts.delay(podcast)
 230
 231
 232 def assign_slug(podcast):
 233     if podcast.slug:
 234         return
 235
 236     for slug in PodcastSlugs(podcast):
 237         try:
 238             with transaction.atomic():
 239                 podcast.add_slug(slug)
 240             break
 241
 242         except:
 243             continue
 244
 245
 246 def assign_missing_episode_slugs(podcast):
 247     common_title = podcast.get_common_episode_title()
 248
 249     episodes = Episode.objects.filter(podcast=podcast, slugs__isnull=True)
 250
 251     for episode in episodes:
 252
 253         for slug in EpisodeSlugs(episode, common_title):
 254             try:
 255                 with transaction.atomic():
 256                     episode.set_slug(slug)
 257                 break
 258
 259             except:
 260                 continue
 261
 262
 263 def _update_categories(podcast, prev_timestamp):
 264     """ checks some practical requirements and updates a category """
 265
 266     max_timestamp = datetime.utcnow() + timedelta(days=1)
 267
 268     # no episodes at all
 269     if not podcast.latest_episode_timestamp:
 270         return
 271
 272     # no new episode
 273     if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
 274         return
 275
 276     # too far in the future
 277     if podcast.latest_episode_timestamp > max_timestamp:
 278         return
 279
 280     # not enough subscribers
 281     if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
 282         return
 283
 284     update_category(podcast)
 285
 286
 287 def _update_episodes(podcast, parsed_episodes):
 288
 289     pid = podcast.get_id()
 290
 291     # list of (obj, fun) where fun is the function to update obj
 292     updated_episodes = []
 293     episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
 294     logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
 295                 len(episodes_to_update))
 296
 297     logger.info('Updating %d episodes', len(episodes_to_update))
 298     for n, parsed in enumerate(episodes_to_update, 1):
 299
 300         url = get_episode_url(parsed)
 301         if not url:
 302             logger.info('Skipping episode %d for missing URL', n)
 303             continue
 304
 305         logger.info('Updating episode %d / %d', n, len(parsed_episodes))
 306
 307         episode = Episode.objects.get_or_create_for_url(podcast, url)
 308
 309         update_episode(parsed, episode, podcast)
 310         updated_episodes.append(episode)
 311
 312     # and mark the remaining ones outdated
 313     current_episodes = Episode.objects.filter(podcast=podcast,
 314                                               outdated=False)[:500]
 315     outdated_episodes = set(current_episodes) - set(updated_episodes)
 316
 317     logger.info('Marking %d episodes as outdated', len(outdated_episodes))
 318     for episode in outdated_episodes:
 319         mark_outdated(episode)
 320
 321
 322 @transaction.atomic
 323 def _order_episodes(podcast):
 324     """ Reorder the podcast's episode according to release timestamp
 325
 326     Returns the highest order value (corresponding to the most recent
 327     episode) """
 328
 329     num_episodes = podcast.episode_count
 330     if not num_episodes:
 331         return 0
 332
 333     episodes = podcast.episode_set.all().extra(select={
 334         'has_released': 'released IS NOT NULL',
 335         })\
 336         .order_by('-has_released', '-released', 'pk')\
 337         .only('pk')
 338
 339     for n, episode in enumerate(episodes.iterator(), 1):
 340         # assign ``order`` from higher (most recent) to 0 (oldest)
 341         # None means "unknown"
 342         new_order = num_episodes - n
 343
 344         # optimize for new episodes that are newer than all existing
 345         if episode.order == new_order:
 346             continue
 347
 348         logger.info('Updating order from {} to {}'.format(episode.order,
 349                                                           new_order))
 350         episode.order = new_order
 351         episode.save()
 352
 353     return num_episodes - 1
 354
 355
 356 def _save_podcast_logo(cover_art):
 357     if not cover_art:
 358         return
 359
 360     try:
 361         image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
 362         prefix = CoverArt.get_prefix(image_sha1)
 363
 364         filename = CoverArt.get_original(prefix, image_sha1)
 365         dirname = CoverArt.get_dir(filename)
 366
 367         # get hash of existing file
 368         if os.path.exists(filename):
 369             with open(filename, 'rb') as f:
 370                 old_hash = file_hash(f).digest()
 371         else:
 372             old_hash = ''
 373
 374         logger.info('Logo %s', cover_art)
 375
 376         # save new cover art
 377         with open(filename, 'wb') as fp:
 378             fp.write(urllib.request.urlopen(cover_art).read())
 379
 380         # get hash of new file
 381         with open(filename, 'rb') as f:
 382             new_hash = file_hash(f).digest()
 383
 384         # remove thumbnails if cover changed
 385         if old_hash != new_hash:
 386             thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
 387             logger.info('Removing %d thumbnails', len(thumbnails))
 388             for f in thumbnails:
 389                 os.unlink(f)
 390
 391         return cover_art
 392
 393     except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
 394             http.client.HTTPException, socket.error, IOError) as e:
 395         logger.warn('Exception while updating podcast logo: %s', str(e))
 396
 397
 398 def _mark_outdated(podcast, msg=''):
 399     logger.info('marking podcast outdated: %s', msg)
 400     podcast.outdated = True
 401     podcast.last_update = datetime.utcnow()
 402     podcast.save()
 403     _update_episodes(podcast, [])
 404
 405
 406 def get_episode_url(parsed_episode):
 407     """ returns the URL of a parsed episode """
 408     for f in parsed_episode.get('files', []):
 409         if f.get('urls', []):
 410             return f['urls'][0]
 411     return None
 412
 413
 414 def update_episode(parsed_episode, episode, podcast):
 415     """ updates "episode" with the data from "parsed_episode" """
 416
 417     # TODO: check if there have been any changes, to avoid unnecessary updates
 418     episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
 419                                 episode.guid)
 420     episode.description = parsed_episode.get('description') or \
 421         episode.description
 422     episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
 423     episode.content = parsed_episode.get('content') or \
 424         parsed_episode.get('description') or episode.content
 425     episode.link = to_maxlength(Episode, 'link',
 426                                 parsed_episode.get('link') or episode.link)
 427     episode.released = datetime.utcfromtimestamp(
 428         parsed_episode.get('released')) if parsed_episode.get('released') \
 429         else episode.released
 430     episode.author = to_maxlength(Episode, 'author',
 431                                   parsed_episode.get('author') or
 432                                   episode.author)
 433     episode.duration = parsed_episode.get('duration') or episode.duration
 434     episode.filesize = parsed_episode['files'][0]['filesize']
 435     episode.language = parsed_episode.get('language') or \
 436         episode.language or podcast.language
 437     episode.mimetypes = ','.join(list(set(
 438         filter(None, [f['mimetype'] for f in parsed_episode.get('files', [])])
 439     )))
 440     episode.flattr_url = to_maxlength(Episode, 'flattr_url',
 441                                       parsed_episode.get('flattr') or
 442                                       episode.flattr_url)
 443     episode.license = parsed_episode.get('license') or episode.license
 444
 445     episode.title = to_maxlength(Episode, 'title',
 446                                  parsed_episode.get('title') or
 447                                  episode.title or
 448                                  file_basename_no_extension(episode.url))
 449
 450     episode.last_update = datetime.utcnow()
 451     episode.save()
 452
 453     parsed_urls = list(chain.from_iterable(
 454         f.get('urls', []) for f in parsed_episode.get('files', [])))
 455     episode.add_missing_urls(parsed_urls)
 456
 457
 458 def mark_outdated(obj):
 459     """ marks obj outdated if its not already """
 460     if obj.outdated:
 461         return None
 462
 463     obj.outdated = True
 464     obj.last_update = datetime.utcnow()
 465     obj.save()
 466
 467
 468 def get_update_interval(episodes):
 469     """ calculates the avg interval between new episodes """
 470
 471     count = len(episodes)
 472     if not count:
 473         logger.info('no episodes, using default interval of %dh',
 474                     DEFAULT_UPDATE_INTERVAL)
 475         return DEFAULT_UPDATE_INTERVAL
 476
 477     earliest = episodes[0]
 478     now = datetime.utcnow()
 479
 480     timespan_s = (now - earliest.released).total_seconds()
 481     timespan_h = timespan_s / 60 / 60
 482
 483     interval = int(timespan_h / count)
 484     logger.info('%d episodes in %d days => %dh interval', count,
 485                 timespan_h / 24, interval)
 486
 487     # place interval between {MIN,MAX}_UPDATE_INTERVAL
 488     interval = max(interval, MIN_UPDATE_INTERVAL)
 489     interval = min(interval, MAX_UPDATE_INTERVAL)
 490
 491     return interval
 492
 493
 494 def file_basename_no_extension(filename):
 495     """ Returns filename without extension
 496
 497     >>> file_basename_no_extension('/home/me/file.txt')
 498     'file'
 499
 500     >>> file_basename_no_extension('file')
 501     'file'
 502     """
 503     base = os.path.basename(filename)
 504     name, extension = os.path.splitext(base)
 505     return name