#!/usr/bin/python
# -*- coding: utf-8 -*-

import os.path
import urllib.request
import urllib.error
from urllib.parse import urljoin
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, Episode
from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
from mygpo.podcasts.models import (
    DEFAULT_UPDATE_INTERVAL,
    MIN_UPDATE_INTERVAL,
    MAX_UPDATE_INTERVAL,
)
from mygpo.utils import to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category
from mygpo.search import get_index_fields

from . import models

import logging

logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """raised when no podcast obj was created for a new URL"""


class NoEpisodesException(Exception):
    """raised when parsing something that doesn't contain any episodes"""


def update_podcasts(queue):
    """Fetch data for the URLs supplied as the queue iterable"""

    for n, podcast_url in enumerate(queue, 1):
        logger.info("Update %d - %s", n, podcast_url)
        if not podcast_url:
            logger.warning("Podcast URL empty, skipping")
            continue

        try:
            updater = PodcastUpdater(podcast_url)
            yield updater.update_podcast()

        except NoPodcastCreated as npc:
            logger.info("No podcast created: %s", npc)

        except NoEpisodesException:
            logger.info("No episodes found when parsing %s", podcast_url)
            continue

        except GeneratorExit:
            pass

        except Exception:
            logger.exception('Error while updating podcast "%s"', podcast_url)
            raise
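

# Usage sketch (illustrative; in production the periodic update tasks drive
# this): update_podcasts() is a generator, so feeds are only fetched while it
# is being consumed, e.g.
#
#     for podcast in update_podcasts(["http://example.com/feed.xml"]):
#         print(podcast)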


class PodcastUpdater(object):
    """Updates the podcast specified by the podcast_url"""

    def __init__(self, podcast_url):
        self.podcast_url = (
            (podcast_url[:2046] + "..") if len(podcast_url) > 2048 else podcast_url
        )

    def update_podcast(self):
        """Update the podcast"""

        with models.PodcastUpdateResult(podcast_url=self.podcast_url) as res:

            parsed, podcast, created = self.parse_feed()

            if not podcast:
                res.podcast_created = False
                res.error_message = '"{}" could not be parsed'.format(self.podcast_url)
                return

            res.podcast = podcast
            res.podcast_created = created

            res.episodes_added = 0
            episode_updater = MultiEpisodeUpdater(podcast, res)

            if not parsed:
                # if it exists already, we mark it as outdated
                self._mark_outdated(
                    podcast, "error while fetching feed", episode_updater
                )
                return

            episode_updater.update_episodes(parsed.get("episodes", []))

            podcast.refresh_from_db()
            podcast.episode_count = episode_updater.count_episodes()
            podcast.save()

            episode_updater.order_episodes()

            self._update_podcast(podcast, parsed, episode_updater, res)

        return podcast

    def parse_feed(self):
        try:
            parsed = self._fetch_feed()
            self._validate_parsed(parsed)

        except (requests.exceptions.RequestException, NoEpisodesException) as ex:
            logger.warning("Error while fetching/parsing feed", exc_info=True)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=self.podcast_url)
                return (None, p, False)

            except Podcast.DoesNotExist as pdne:
                raise NoPodcastCreated(ex) from pdne

        # parsing went well; get or create the podcast
        podcast, created = Podcast.objects.get_or_create_for_url(self.podcast_url)

        return (parsed, podcast, created)

    def _fetch_feed(self):
        params = {"url": self.podcast_url, "process_text": "markdown"}
        headers = {"Accept": "application/json"}
        url = urljoin(settings.FEEDSERVICE_URL, "parse")
        r = requests.get(url, params=params, headers=headers, timeout=30)

        if r.status_code != 200:
            logger.error(
                'Feed-service status code for "{}" was {}'.format(url, r.status_code)
            )
            return None

        try:
            return r.json()[0]
        except ValueError:
            logger.exception(
                'Feed-service error while parsing response for url "%s": %s',
                self.podcast_url,
                r.text,
            )
            raise

    def _validate_parsed(self, parsed):
        """validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.get("episodes", []):
            raise NoEpisodesException("no episodes found")
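
    # A minimal feed-service response that would pass _validate_parsed()
    # (shape inferred from the fields read in this module, not an
    # authoritative schema):
    #
    #     {
    #         "title": "Example Podcast",
    #         "episodes": [
    #             {"guid": "e1", "files": [{"urls": ["http://example.com/1.mp3"]}]},
    #         ],
    #     }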

    def _update_podcast(self, podcast, parsed, episode_updater, update_result):
        """updates a podcast according to new parser results"""

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # will later be used to see whether the index is outdated
        old_index_fields = get_index_fields(podcast)

        podcast.title = parsed.get("title") or podcast.title
        podcast.description = parsed.get("description") or podcast.description
        podcast.subtitle = parsed.get("subtitle") or podcast.subtitle
        podcast.link = parsed.get("link") or podcast.link
        podcast.logo_url = parsed.get("logo") or podcast.logo_url

        podcast.author = to_maxlength(
            Podcast, "author", parsed.get("author") or podcast.author
        )

        podcast.language = to_maxlength(
            Podcast, "language", parsed.get("language") or podcast.language
        )

        podcast.content_types = (
            ",".join(parsed.get("content_types")) or podcast.content_types
        )

        # podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])

        podcast.common_episode_title = to_maxlength(
            Podcast,
            "common_episode_title",
            parsed.get("common_title") or podcast.common_episode_title,
        )

        podcast.new_location = parsed.get("new_location") or podcast.new_location
        podcast.flattr_url = to_maxlength(
            Podcast, "flattr_url", parsed.get("flattr") or podcast.flattr_url
        )
        podcast.hub = parsed.get("hub") or podcast.hub
        podcast.license = parsed.get("license") or podcast.license
        podcast.max_episode_order = episode_updater.max_episode_order

        podcast.add_missing_urls(parsed.get("urls", []))

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(urls__url=podcast.new_location)

                if new_podcast != podcast:
                    self._mark_outdated(
                        podcast, "redirected to different podcast", episode_updater
                    )
                    return
            except Podcast.DoesNotExist:
                podcast.set_url(podcast.new_location)

        # episodes with a release timestamp, oldest first
        episodes = Episode.objects.filter(
            podcast=podcast, released__isnull=False
        ).order_by("released")

        # the update interval is based on the intervals between episodes
        podcast.update_interval = episode_updater.get_update_interval(episodes)

        # the factor is increased / decreased depending on whether the latest
        # update has returned episodes
        if update_result.episodes_added == 0:  # no episodes, incr factor
            newfactor = podcast.update_interval_factor * 1.2
            podcast.update_interval_factor = min(1000, newfactor)  # never above 1000
        elif update_result.episodes_added > 1:  # new episodes, decr factor
            newfactor = podcast.update_interval_factor / 1.2
            podcast.update_interval_factor = max(1, newfactor)  # never below 1
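
        # Note (assumption about the scheduler, which lives outside this
        # module): the effective pause before the next update is roughly
        # update_interval * update_interval_factor, so podcasts that keep
        # returning no new episodes back off exponentially
        # (1.0 -> 1.2 -> 1.44 -> ..., capped at 1000).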

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        # podcast.episode_count is not updated here on purpose. It is,
        # instead, continuously updated when creating new episodes in
        # EpisodeManager.get_or_create_for_url

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = CoverArt.save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # check if search index should be considered out of date
        new_index_fields = get_index_fields(podcast)
        if list(old_index_fields.items()) != list(new_index_fields.items()):
            podcast.search_index_uptodate = False

        # the podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info("Saving podcast.")
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warning("subscribing to hub failed: %s", str(se))

        self.assign_slug(podcast)
        episode_updater.assign_missing_episode_slugs()
        update_related_podcasts.delay(podcast.pk)

    def assign_slug(self, podcast):
        if podcast.slug:
            return

        for slug in PodcastSlugs(podcast):
            try:
                with transaction.atomic():
                    podcast.add_slug(slug)
                break

            except Exception:
                # e.g. the slug is already taken; try the next candidate
                continue

    def _update_categories(self, podcast, prev_timestamp):
        """checks some practical requirements and updates a category"""

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and (podcast.latest_episode_timestamp <= prev_timestamp):
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _mark_outdated(self, podcast, msg, episode_updater):
        logger.info("marking podcast outdated: %s", msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        episode_updater.update_episodes([])


class MultiEpisodeUpdater(object):
    def __init__(self, podcast, update_result):
        self.podcast = podcast
        self.update_result = update_result
        self.updated_episodes = []
        self.max_episode_order = None

    def update_episodes(self, parsed_episodes):

        episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
        logger.info(
            "Parsed %d (%d) episodes", len(parsed_episodes), len(episodes_to_update)
        )

        logger.info("Updating %d episodes", len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = self.get_episode_url(parsed)
            if not url:
                logger.info("Skipping episode %d for missing URL", n)
                continue

            logger.info("Updating episode %d / %d", n, len(parsed_episodes))

            episode, created = Episode.objects.get_or_create_for_url(self.podcast, url)

            if created:
                self.update_result.episodes_added += 1

            updater = EpisodeUpdater(episode, self.podcast)
            updater.update_episode(parsed)

            self.updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=self.podcast, outdated=False)[
            :500
        ]
        outdated_episodes = set(current_episodes) - set(self.updated_episodes)

        logger.info("Marking %d episodes as outdated", len(outdated_episodes))
        for episode in outdated_episodes:
            updater = EpisodeUpdater(episode, self.podcast)
            updater.mark_outdated()

    @transaction.atomic
    def order_episodes(self):
        """Reorder the podcast's episodes according to release timestamp

        Stores the highest order value (corresponding to the most recent
        episode) in self.max_episode_order"""

        num_episodes = self.podcast.episode_count
        if not num_episodes:
            return 0

        episodes = (
            self.podcast.episode_set.all()
            .extra(select={"has_released": "released IS NOT NULL"})
            .order_by("-has_released", "-released", "pk")
            .only("pk")
        )

        for n, episode in enumerate(episodes.iterator(), 1):
            # assign ``order`` from higher (most recent) to 0 (oldest)
            # None means "unknown"
            new_order = num_episodes - n

            # optimize for new episodes that are newer than all existing
            if episode.order == new_order:
                continue

            logger.info("Updating order from %s to %s", episode.order, new_order)
            episode.order = new_order
            episode.save()

        self.max_episode_order = num_episodes - 1
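
        # Worked example (hypothetical data): with episode_count == 3 and
        # episodes C (newest), B, A (oldest), the loop assigns C -> 2,
        # B -> 1, A -> 0, and max_episode_order becomes 2.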

    def get_episode_url(self, parsed_episode):
        """returns the URL of a parsed episode"""
        for f in parsed_episode.get("files", []):
            if f.get("urls", []):
                return f["urls"][0]
        return None

    def count_episodes(self):
        return Episode.objects.filter(podcast=self.podcast).count()

    def get_update_interval(self, episodes):
        """calculates the avg interval between new episodes"""

        count = episodes.count()
        if not count:
            logger.info(
                "no episodes, using default interval of %dh", DEFAULT_UPDATE_INTERVAL
            )
            return DEFAULT_UPDATE_INTERVAL

        earliest = episodes.first()
        now = datetime.utcnow()

        timespan_s = (now - earliest.released).total_seconds()
        timespan_h = timespan_s / 60 / 60

        interval = int(timespan_h / count)
        logger.info(
            "%d episodes in %d days => %dh interval", count, timespan_h / 24, interval
        )

        # clamp interval to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL]
        interval = max(interval, MIN_UPDATE_INTERVAL)
        interval = min(interval, MAX_UPDATE_INTERVAL)

        return interval
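
    # Worked example for get_update_interval (illustrative numbers): 60
    # released episodes whose earliest release was 360 days ago span
    # 8640 hours, so the average gap is int(8640 / 60) == 144h; the result
    # is then clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].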

    def assign_missing_episode_slugs(self):
        common_title = self.podcast.get_common_episode_title()

        episodes = Episode.objects.filter(podcast=self.podcast, slugs__isnull=True)

        for episode in episodes:

            for slug in EpisodeSlugs(episode, common_title):
                try:
                    with transaction.atomic():
                        episode.set_slug(slug)
                    break

                except Exception:
                    # e.g. the slug is already taken; try the next candidate
                    continue


class EpisodeUpdater(object):
    """Updates an individual episode"""

    def __init__(self, episode, podcast):
        self.episode = episode
        self.podcast = podcast

    def update_episode(self, parsed_episode):
        """updates "episode" with the data from "parsed_episode" """

        # TODO: check if there have been any changes, to
        # avoid unnecessary updates
        self.episode.guid = to_maxlength(
            Episode, "guid", parsed_episode.get("guid") or self.episode.guid
        )

        self.episode.description = (
            parsed_episode.get("description") or self.episode.description
        )

        self.episode.subtitle = parsed_episode.get("subtitle") or self.episode.subtitle

        self.episode.content = (
            parsed_episode.get("content")
            or parsed_episode.get("description")
            or self.episode.content
        )

        self.episode.link = to_maxlength(
            Episode, "link", parsed_episode.get("link") or self.episode.link
        )

        self.episode.released = (
            datetime.utcfromtimestamp(parsed_episode.get("released"))
            if parsed_episode.get("released")
            else self.episode.released
        )

        self.episode.author = to_maxlength(
            Episode, "author", parsed_episode.get("author") or self.episode.author
        )

        self.episode.duration = parsed_episode.get("duration") or self.episode.duration

        self.episode.filesize = parsed_episode["files"][0]["filesize"]

        self.episode.language = (
            parsed_episode.get("language")
            or self.episode.language
            or self.podcast.language
        )

        mimetypes = [f["mimetype"] for f in parsed_episode.get("files", [])]
        self.episode.mimetypes = ",".join(list(set(filter(None, mimetypes))))

        self.episode.flattr_url = to_maxlength(
            Episode,
            "flattr_url",
            parsed_episode.get("flattr") or self.episode.flattr_url,
        )

        self.episode.license = parsed_episode.get("license") or self.episode.license

        self.episode.title = to_maxlength(
            Episode,
            "title",
            parsed_episode.get("title")
            or self.episode.title
            or file_basename_no_extension(self.episode.url),
        )

        self.episode.last_update = datetime.utcnow()
        self.episode.save()

        parsed_urls = list(
            chain.from_iterable(
                f.get("urls", []) for f in parsed_episode.get("files", [])
            )
        )
        self.episode.add_missing_urls(parsed_urls)

    def mark_outdated(self):
        """marks the episode outdated if it's not already"""
        if self.episode.outdated:
            return None

        self.episode.outdated = True
        self.episode.last_update = datetime.utcnow()
        self.episode.save()


def file_basename_no_extension(filename):
    """Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name


def verify_podcast_url(url):
    updater = PodcastUpdater(url)
    parsed = updater._fetch_feed()
    updater._validate_parsed(parsed)
    return True
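

# Note: verify_podcast_url() reuses the private fetch/validate helpers and
# lets their exceptions propagate, so callers should treat any requests
# exception or NoEpisodesException as "not a usable podcast feed". A quick
# manual check (illustrative; requires a configured Django environment):
#
#     python manage.py shell -c \
#         "from mygpo.data.feeddownloader import verify_podcast_url; \
#          print(verify_podcast_url('http://example.com/feed.xml'))"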