Fix generating candidate Slugs for podcast update
[mygpo.git] / mygpo / data / feeddownloader.py
blob 34639ea8a35294b8f390c078ee893aec6937dbfe
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#
import os.path
import urllib.request, urllib.error, urllib.parse
from urllib.parse import urljoin
import http.client
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, Episode
from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)


MAX_EPISODES_UPDATE = 200

class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """

def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except Exception:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise

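
# Usage sketch (not part of the module's API): update_podcasts() is a
# generator, so feeds are only fetched as results are consumed. The feed URL
# below is a hypothetical placeholder.
#
#     for podcast in update_podcasts(['http://example.com/feed.xml']):
#         logger.info('updated %s', podcast)
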
def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        logger.exception('Error while fetching response from feedservice')

        # if we fail to fetch the feed, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(re))
            p.last_update = datetime.utcnow()
            p.save()
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(re)

    except NoEpisodesException as nee:
        logger.warning('No episode found while parsing podcast')

        # if the feed parses but contains no episodes, we don't create
        # the podcast object either
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'no episodes found while parsing feed: %s'
                           % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    p.refresh_from_db()
    p.episode_count = Episode.objects.filter(podcast=p).count()
    p.save()
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p

def verify_podcast_url(podcast_url):
    parsed = _fetch_feed(podcast_url)
    _validate_parsed(parsed)
    return True

def _fetch_feed(podcast_url):
    params = {
        'url': podcast_url,
        'process_text': 'markdown',
    }
    headers = {
        'Accept': 'application/json',
    }
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)

    if r.status_code != 200:
        logger.error('Feed-service status code for "%s" was %s', podcast_url,
                     r.status_code)
        return None

    try:
        return r.json()[0]
    except ValueError:
        logger.exception(
            'Feed-service error while parsing response for url "%s": %s',
            podcast_url, r.text,
        )
        raise

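
# The feedservice at settings.FEEDSERVICE_URL is assumed to respond with a
# JSON list of parsed feeds, one entry per requested URL. An abridged,
# hypothetical example of the shape this module relies on:
#
#     [{'title': 'Example Cast',
#       'urls': ['http://example.com/feed.xml'],
#       'episodes': [{'guid': 'ep-1',
#                     'released': 1400000000,
#                     'files': [{'urls': ['http://example.com/ep1.mp3'],
#                                'mimetype': 'audio/mpeg',
#                                'filesize': 1234567}]}]}]
#
# _fetch_feed() returns only the first entry, matching the single URL sent.
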
def _validate_parsed(parsed):
    """ validates the parsed results and raises an exception if invalid

    feedparser parses pretty much everything. We reject anything that
    doesn't look like a feed"""

    if not parsed or not parsed.get('episodes', []):
        raise NoEpisodesException('no episodes found')

def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
        podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warning('subscribing to hub failed: %s', str(se))

    assign_slug(podcast)
    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)

def assign_slug(podcast):
    if podcast.slug:
        return

    for slug in PodcastSlugs(podcast):
        try:
            with transaction.atomic():
                podcast.add_slug(slug)
            break

        except Exception:
            continue

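
# PodcastSlugs is assumed to yield candidate slugs in order of preference,
# e.g. a title-derived slug followed by disambiguated variants (hypothetical
# values: 'my-podcast', 'my-podcast-2', ...). The loop above tries each
# candidate until one inserts cleanly; wrapping add_slug() in atomic() keeps
# a failed (e.g. uniqueness-violating) insert from breaking the surrounding
# transaction, so the next candidate can still be tried.
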
def assign_missing_episode_slugs(podcast):
    common_title = podcast.get_common_episode_title()

    episodes = Episode.objects.filter(podcast=podcast, slugs__isnull=True)

    for episode in episodes:

        for slug in EpisodeSlugs(episode, common_title):
            try:
                with transaction.atomic():
                    episode.set_slug(slug)
                break

            except Exception:
                continue

def _update_categories(podcast, prev_timestamp):
    """ checks some practical requirements and updates a category """

    max_timestamp = datetime.utcnow() + timedelta(days=1)

    # no episodes at all
    if not podcast.latest_episode_timestamp:
        return

    # no new episode
    if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
        return

    # too far in the future
    if podcast.latest_episode_timestamp > max_timestamp:
        return

    # not enough subscribers
    if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
        return

    update_category(podcast)

def _update_episodes(podcast, parsed_episodes):

    pid = podcast.get_id()

    # list of episodes that were updated from the parsed data
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(parsed_episodes))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

    return updated_episodes

@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode) """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
    }).order_by('-has_released', '-released', 'pk')\
      .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1

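
# Worked example (hypothetical data): with podcast.episode_count == 3 and
# episodes A, B, C released in that order, the query yields [C, B, A] and
# the loop assigns C.order = 2, B.order = 1, A.order = 0; the function then
# returns 2 (num_episodes - 1), the order of the most recent episode.
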
def _save_podcast_logo(cover_art):
    if not cover_art:
        return

    try:
        image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                old_hash = file_hash(f).digest()
        else:
            old_hash = b''

        logger.info('Logo %s', cover_art)

        # save new cover art
        with open(filename, 'wb') as fp:
            fp.write(urllib.request.urlopen(cover_art).read())

        # get hash of new file
        with open(filename, 'rb') as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for f in thumbnails:
                os.unlink(f)

        return cover_art

    except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
            http.client.HTTPException, socket.error, IOError) as e:
        logger.warning('Exception while updating podcast logo: %s', str(e))

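
# Note: the storage location is derived from the SHA-1 of the logo *URL*,
# not of the image bytes; the old/new file hashes above only decide whether
# cached thumbnails must be invalidated. Hypothetical example:
#
#     image_sha1 = hashlib.sha1(
#         'http://example.com/logo.png'.encode('utf-8')).hexdigest()
#     # -> a 40-character hex digest, bucketed by CoverArt.get_prefix()
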
def _mark_outdated(podcast, msg=''):
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
    _update_episodes(podcast, [])

def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.get('files', []):
        if f.get('urls', []):
            return f['urls'][0]
    return None

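
# Example (hypothetical parser output): the first URL of the first file
# entry that has any URLs wins.
#
#     get_episode_url({'files': [{'urls': []},
#                                {'urls': ['http://example.com/ep1.mp3']}]})
#     # -> 'http://example.com/ep1.mp3'
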
def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    episode.filesize = parsed_episode['files'][0].get('filesize') or \
        episode.filesize
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    episode.mimetypes = ','.join(list(set(
        filter(None, [f.get('mimetype')
                      for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)

def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()

def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # clamp interval to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL]
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval

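
# Worked example (hypothetical numbers): 100 episodes whose earliest release
# was 600 hours ago give int(600 / 100) == 6, i.e. a 6h update interval,
# which is then clamped into [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].
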
def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name