Handle exception when updating podcast
mygpo/data/feeddownloader.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import os.path
import urllib.request, urllib.error, urllib.parse
from urllib.parse import urljoin
import http.client
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)
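
# cap on the number of feed entries processed per update; entries beyond
# this limit are skipped in the current run (see _update_episodes)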
MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise


def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        logger.exception('Error while fetching response from feedservice')

        # if we fail to fetch the feed, we don't create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(re))
            p.last_update = datetime.utcnow()
            p.save()
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(re)

    except NoEpisodesException as nee:
        logger.warning('No episode found while parsing podcast')

        # if the feed contains no episodes, we don't create the
        # podcast object either
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'no episodes while parsing feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'

    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    p.refresh_from_db()
    p.episode_count = Episode.objects.filter(podcast=p).count()
    p.save()
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p


def verify_podcast_url(podcast_url):
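    """ Verifies that a podcast URL can be fetched and parsed

    Returns True on success; fetch and parse errors propagate as
    exceptions. """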
    parsed = _fetch_feed(podcast_url)
    _validate_parsed(parsed)
    return True


def _fetch_feed(podcast_url):
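    """ Fetches the parsed feed from the feedservice's parse endpoint """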
    params = {
        'url': podcast_url,
        'process_text': 'markdown',
    }
    headers = {
        'Accept': 'application/json',
    }
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)

    if r.status_code != 200:
        logger.error('Feed-service status code for "%s" was %s', podcast_url,
                     r.status_code)
        return None

    try:
        return r.json()[0]
    except ValueError:
        logger.exception(
            'Feed-service error while parsing response for url "%s": %s',
            podcast_url, r.text,
        )
        raise


def _validate_parsed(parsed):
    """ validates the parsed results and raises an exception if invalid

    feedparser parses pretty much everything. We reject anything that
    doesn't look like a feed"""

    if not parsed or not parsed.get('episodes', []):
        raise NoEpisodesException('no episodes found')


def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
        podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))
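
    # handle feed-level redirects: if the new location already belongs to a
    # different podcast, this one is a duplicate and is marked outdated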
    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warning('subscribing to hub failed: %s', str(se))

    if not podcast.slug:
        slug = PodcastSlug(podcast).get_slug()
        if slug:
            podcast.add_slug(slug)

    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)


def _update_categories(podcast, prev_timestamp):
    """ checks some practical requirements and updates a category """

    max_timestamp = datetime.utcnow() + timedelta(days=1)

    # no episodes at all
    if not podcast.latest_episode_timestamp:
        return

    # no new episode
    if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
        return

    # too far in the future
    if podcast.latest_episode_timestamp > max_timestamp:
        return

    # not enough subscribers
    if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
        return

    update_category(podcast)


def _update_episodes(podcast, parsed_episodes):
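    """ Creates or updates the podcast's episodes from parsed feed data

    Returns the list of episodes that were created or updated. """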

    pid = podcast.get_id()

    # list of episodes that were created or updated in this run
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(parsed_episodes))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
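    # (only up to 500 current episodes are considered, to bound the amount
    # of work done per update)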
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

    return updated_episodes


@transaction.atomic
def _order_episodes(podcast):
    """ Reorders the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode) """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0
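
    # annotate each episode with a has_released flag so that episodes
    # without a release date sort after all dated ones; newest come first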
    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
        })\
        .order_by('-has_released', '-released', 'pk')\
        .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1


def _save_podcast_logo(cover_art):
    if not cover_art:
        return

    try:
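        # the logo file is stored under a name derived from the SHA1 of the
        # logo URL, so each distinct URL maps to a stable path on disk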
        image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                old_hash = file_hash(f).digest()
        else:
            old_hash = b''

        logger.info('Logo %s', cover_art)

        # save new cover art
        with open(filename, 'wb') as fp:
            fp.write(urllib.request.urlopen(cover_art).read())

        # get hash of new file
        with open(filename, 'rb') as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for f in thumbnails:
                os.unlink(f)

        return cover_art

    except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
            http.client.HTTPException, socket.error, IOError) as e:
        logger.warning('Exception while updating podcast logo: %s', str(e))


def _mark_outdated(podcast, msg=''):
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
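    # updating with an empty episode list marks all of the podcast's
    # current episodes as outdated as well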
    _update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.get('files', []):
        if f.get('urls', []):
            return f['urls'][0]
    return None


def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """
    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    episode.filesize = parsed_episode['files'][0]['filesize']
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    episode.mimetypes = ','.join(list(set(
        filter(None, [f['mimetype'] for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60
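
    # average gap between episodes, e.g. 12 episodes over the last 30 days
    # (720h) yield a 60h interval, clamped to the allowed range below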
    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name