Modify Podcast.update_interval with factor
mygpo/data/feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os.path
import urllib.request
import urllib.error
import urllib.parse
from urllib.parse import urljoin
import http.client
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, Episode
from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category
from mygpo.search import get_index_fields

from . import models

import logging
logger = logging.getLogger(__name__)


MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        if not podcast_url:
            logger.warning('Podcast URL empty, skipping')
            continue

        try:
            updater = PodcastUpdater(podcast_url)
            yield updater.update_podcast()

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise
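
# NOTE (illustrative, not part of the original module): update_podcasts is a
# generator, so it only performs work while being consumed; a hypothetical
# caller would drain it, e.g.
#
#     urls = ['http://example.com/feed.xml']  # hypothetical feed URL
#     for podcast in update_podcasts(urls):
#         print(podcast)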


class PodcastUpdater(object):
    """ Updates the podcast specified by the podcast_url """

    def __init__(self, podcast_url):
        self.podcast_url = podcast_url

    def update_podcast(self):
        """ Update the podcast """

        with models.PodcastUpdateResult(podcast_url=self.podcast_url) as res:

            parsed, podcast, created = self.parse_feed()

            if not podcast:
                res.podcast_created = False
                res.error_message = '"{}" could not be parsed'.format(
                    self.podcast_url)

                return

            res.podcast = podcast
            res.podcast_created = created

            res.episodes_added = 0
            episode_updater = MultiEpisodeUpdater(podcast, res)

            if not parsed:
                # if it exists already, we mark it as outdated
                self._mark_outdated(
                    podcast,
                    'error while fetching feed',
                    episode_updater)
                return

            episode_updater.update_episodes(parsed.get('episodes', []))

            podcast.refresh_from_db()
            podcast.episode_count = episode_updater.count_episodes()
            podcast.save()

            episode_updater.order_episodes()

            self._update_podcast(podcast, parsed, episode_updater)

            return podcast

    def parse_feed(self):
        try:
            parsed = self._fetch_feed()
            self._validate_parsed(parsed)

        except (requests.exceptions.RequestException,
                NoEpisodesException) as ex:
            logger.exception('Error while fetching/parsing feed')

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=self.podcast_url)
                return (None, p, False)

            except Podcast.DoesNotExist as pdne:
                raise NoPodcastCreated(ex) from pdne

        # Parsing went well, get podcast
        podcast, created = Podcast.objects.get_or_create_for_url(
            self.podcast_url)

        return (parsed, podcast, created)

    def _fetch_feed(self):
        params = {
            'url': self.podcast_url,
            'process_text': 'markdown',
        }
        headers = {
            'Accept': 'application/json',
        }
        url = urljoin(settings.FEEDSERVICE_URL, 'parse')
        r = requests.get(url, params=params, headers=headers, timeout=30)

        if r.status_code != 200:
            logger.error('Feed-service status code for "{}" was {}'.format(
                self.podcast_url, r.status_code))
            return None

        try:
            return r.json()[0]
        except ValueError:
            logger.exception(
                'Feed-service error while parsing response for url "%s": %s',
                self.podcast_url, r.text,
            )
            raise
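
    # For reference: the request above amounts to
    #     GET <FEEDSERVICE_URL>/parse?url=<podcast_url>&process_text=markdown
    # with an 'Accept: application/json' header. The response is assumed to
    # be a JSON list with one parsed-feed object per requested URL, which is
    # why only r.json()[0] is used here.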

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.get('episodes', []):
            raise NoEpisodesException('no episodes found')

    def _update_podcast(self, podcast, parsed, episode_updater):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # will later be used to see whether the index is outdated
        old_index_fields = get_index_fields(podcast)

        podcast.title = parsed.get('title') or podcast.title
        podcast.description = parsed.get('description') or podcast.description
        podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
        podcast.link = parsed.get('link') or podcast.link
        podcast.logo_url = parsed.get('logo') or podcast.logo_url

        podcast.author = to_maxlength(
            Podcast, 'author',
            parsed.get('author') or podcast.author)

        podcast.language = to_maxlength(
            Podcast, 'language',
            parsed.get('language') or podcast.language)

        # default to an empty list, as ','.join(None) would raise a TypeError
        podcast.content_types = (','.join(parsed.get('content_types', [])) or
                                 podcast.content_types)

        # podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])

        podcast.common_episode_title = to_maxlength(
            Podcast,
            'common_episode_title',
            parsed.get('common_title') or podcast.common_episode_title)

        podcast.new_location = (parsed.get('new_location') or
                                podcast.new_location)
        podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                          parsed.get('flattr') or
                                          podcast.flattr_url)
        podcast.hub = parsed.get('hub') or podcast.hub
        podcast.license = parsed.get('license') or podcast.license
        podcast.max_episode_order = episode_updater.max_episode_order

        podcast.add_missing_urls(parsed.get('urls', []))

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(
                    urls__url=podcast.new_location,
                )
                if new_podcast != podcast:
                    self._mark_outdated(
                        podcast,
                        'redirected to different podcast',
                        episode_updater,
                    )
                    return
            except Podcast.DoesNotExist:
                podcast.set_url(podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(podcast=podcast,
                                          released__isnull=False)\
                                  .order_by('released')

        # Determine update interval

        # Update interval is based on intervals between episodes
        podcast.update_interval = episode_updater.get_update_interval(episodes)

        # factor is increased / decreased depending on whether the latest
        # update has returned episodes
        if episode_updater.episodes_added == 0:  # no episodes, incr factor
            podcast.update_interval_factor *= 1.2
        elif episode_updater.episodes_added > 1:  # new episodes, decr factor
            newfactor = podcast.update_interval_factor / 1.2
            podcast.update_interval_factor = max(1, newfactor)  # never below 1
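
        # Illustrative numbers (not from the original code): starting at
        # factor 1.0, two consecutive updates without new episodes yield
        # 1.0 * 1.2 * 1.2 = 1.44; two updates with multiple new episodes
        # bring it back down via 1.44 / 1.2 / 1.2 = 1.0, and max(1, ...)
        # keeps the factor from ever dropping below 1. Exactly one new
        # episode leaves the factor unchanged. The scheduler presumably
        # delays the next fetch by update_interval * update_interval_factor
        # hours.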

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        # podcast.episode_count is not updated here on purpose. It is,
        # instead, continuously updated when creating new episodes in
        # EpisodeManager.get_or_create_for_url

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # check if search index should be considered out of date
        new_index_fields = get_index_fields(podcast)
        if list(old_index_fields.items()) != list(new_index_fields.items()):
            podcast.search_index_uptodate = False

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warning('subscribing to hub failed: %s', str(se))

        self.assign_slug(podcast)
        episode_updater.assign_missing_episode_slugs()
        update_related_podcasts.delay(podcast.pk)

    def assign_slug(self, podcast):
        if podcast.slug:
            return

        for slug in PodcastSlugs(podcast):
            try:
                with transaction.atomic():
                    podcast.add_slug(slug)
                break

            except Exception:
                continue

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
           (podcast.latest_episode_timestamp <= prev_timestamp):
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'wb') as fp:
                fp.write(urllib.request.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
                http.client.HTTPException, socket.error, IOError) as e:
            logger.warning('Exception while updating podcast logo: %s',
                           str(e))

    def _mark_outdated(self, podcast, msg, episode_updater):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        # updating with an empty episode list also marks all of the
        # podcast's remaining episodes as outdated
        episode_updater.update_episodes([])


class MultiEpisodeUpdater(object):

    def __init__(self, podcast, update_result):
        self.podcast = podcast
        self.update_result = update_result
        self.updated_episodes = []
        self.max_episode_order = None

    def update_episodes(self, parsed_episodes):

        episodes_to_update = list(islice(parsed_episodes, 0,
                                         MAX_EPISODES_UPDATE))
        logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                    len(episodes_to_update))

        logger.info('Updating %d episodes', len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = self.get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n,
                        len(episodes_to_update))

            episode, created = Episode.objects.get_or_create_for_url(
                self.podcast, url)

            if created:
                self.update_result.episodes_added += 1

            updater = EpisodeUpdater(episode, self.podcast)
            updater.update_episode(parsed)

            self.updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=self.podcast,
                                                  outdated=False)[:500]
        outdated_episodes = set(current_episodes) - set(self.updated_episodes)

        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        for episode in outdated_episodes:
            updater = EpisodeUpdater(episode, self.podcast)
            updater.mark_outdated()

    @transaction.atomic
    def order_episodes(self):
        """ Reorder the podcast's episodes according to release timestamp

        The highest order value (corresponding to the most recent episode)
        is stored in self.max_episode_order """

        num_episodes = self.podcast.episode_count
        if not num_episodes:
            return 0

        episodes = self.podcast.episode_set.all().extra(select={
            'has_released': 'released IS NOT NULL',
        }).order_by('-has_released', '-released', 'pk')\
          .only('pk')

        for n, episode in enumerate(episodes.iterator(), 1):
            # assign ``order`` from higher (most recent) to 0 (oldest)
            # None means "unknown"
            new_order = num_episodes - n

            # optimize for new episodes that are newer than all existing
            if episode.order == new_order:
                continue

            logger.info('Updating order from {} to {}'.format(episode.order,
                                                              new_order))
            episode.order = new_order
            episode.save()

        self.max_episode_order = num_episodes - 1
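
        # Worked example (illustrative): with episode_count == 3, the loop
        # assigns order 2 to the most recently released episode, then 1,
        # and finally 0 to the oldest; max_episode_order ends up as 2.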

    def get_episode_url(self, parsed_episode):
        """ returns the URL of a parsed episode """
        for f in parsed_episode.get('files', []):
            if f.get('urls', []):
                return f['urls'][0]
        return None
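
    # The accessors used in this module imply a parsed-episode shape roughly
    # like the following (an assumption derived from the keys accessed here,
    # not a documented schema):
    #
    #     {
    #         'guid': '...', 'title': '...', 'released': 1420070400,
    #         'files': [{'urls': ['http://example.com/ep1.mp3'],
    #                    'mimetype': 'audio/mpeg', 'filesize': 12345}],
    #     }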

    def count_episodes(self):
        return Episode.objects.filter(podcast=self.podcast).count()

    def get_update_interval(self, episodes):
        """ calculates the avg interval between new episodes """

        count = episodes.count()
        if not count:
            logger.info('no episodes, using default interval of %dh',
                        DEFAULT_UPDATE_INTERVAL)
            return DEFAULT_UPDATE_INTERVAL

        earliest = episodes.first()
        now = datetime.utcnow()

        timespan_s = (now - earliest.released).total_seconds()
        timespan_h = timespan_s / 60 / 60

        interval = int(timespan_h / count)
        logger.info('%d episodes in %d days => %dh interval', count,
                    timespan_h / 24, interval)

        # place interval between {MIN,MAX}_UPDATE_INTERVAL
        interval = max(interval, MIN_UPDATE_INTERVAL)
        interval = min(interval, MAX_UPDATE_INTERVAL)

        return interval
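
    # Worked example (illustrative): 10 episodes whose earliest release was
    # 30 days (720h) ago give int(720 / 10) = 72, i.e. a 72h interval, which
    # is then clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].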

    def assign_missing_episode_slugs(self):
        common_title = self.podcast.get_common_episode_title()

        episodes = Episode.objects.filter(podcast=self.podcast,
                                          slugs__isnull=True)

        for episode in episodes:

            for slug in EpisodeSlugs(episode, common_title):
                try:
                    with transaction.atomic():
                        episode.set_slug(slug)
                    break

                except Exception:
                    continue


class EpisodeUpdater(object):
    """ Updates an individual episode """

    def __init__(self, episode, podcast):
        self.episode = episode
        self.podcast = podcast

    def update_episode(self, parsed_episode):
        """ updates "episode" with the data from "parsed_episode" """

        # TODO: check if there have been any changes, to
        # avoid unnecessary updates
        self.episode.guid = to_maxlength(
            Episode, 'guid',
            parsed_episode.get('guid') or self.episode.guid)

        self.episode.description = (parsed_episode.get('description') or
                                    self.episode.description)

        self.episode.subtitle = (parsed_episode.get('subtitle') or
                                 self.episode.subtitle)

        self.episode.content = (parsed_episode.get('content') or
                                parsed_episode.get('description') or
                                self.episode.content)

        self.episode.link = to_maxlength(
            Episode, 'link',
            parsed_episode.get('link') or self.episode.link)

        self.episode.released = (datetime.utcfromtimestamp(
            parsed_episode.get('released')) if parsed_episode.get('released')
            else self.episode.released)

        self.episode.author = to_maxlength(
            Episode, 'author',
            parsed_episode.get('author') or self.episode.author)

        self.episode.duration = (parsed_episode.get('duration') or
                                 self.episode.duration)

        # 'files' is non-empty here (update_episode is only called when
        # get_episode_url found a URL); 'filesize' may still be missing
        self.episode.filesize = (parsed_episode['files'][0].get('filesize') or
                                 self.episode.filesize)

        self.episode.language = (parsed_episode.get('language') or
                                 self.episode.language or
                                 self.podcast.language)

        mimetypes = [f.get('mimetype') for f in
                     parsed_episode.get('files', [])]
        self.episode.mimetypes = ','.join(list(set(filter(None, mimetypes))))

        self.episode.flattr_url = to_maxlength(
            Episode, 'flattr_url',
            parsed_episode.get('flattr') or self.episode.flattr_url)

        self.episode.license = (parsed_episode.get('license') or
                                self.episode.license)

        self.episode.title = to_maxlength(
            Episode, 'title',
            parsed_episode.get('title') or self.episode.title or
            file_basename_no_extension(self.episode.url))

        self.episode.last_update = datetime.utcnow()
        self.episode.save()

        parsed_urls = list(chain.from_iterable(
            f.get('urls', []) for f in parsed_episode.get('files', [])))
        self.episode.add_missing_urls(parsed_urls)

    def mark_outdated(self):
        """ marks the episode outdated if it's not already """
        if self.episode.outdated:
            return None

        self.episode.outdated = True
        self.episode.last_update = datetime.utcnow()
        self.episode.save()


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name


def verify_podcast_url(podcast_url):
    updater = PodcastUpdater(podcast_url)
    parsed = updater._fetch_feed()
    updater._validate_parsed(parsed)
    return True