#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#
import os.path
import urllib.request, urllib.error, urllib.parse
from urllib.parse import urljoin
import http.client
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """

def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except Exception:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise
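
# ``update_podcasts`` is a generator, so callers have to consume it for any
# updates to actually run. A minimal usage sketch (the URL is illustrative):
#
#     for podcast in update_podcasts(['http://example.com/feed.xml']):
#         print(podcast)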

def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        logger.exception('Error while fetching response from feedservice')
        return

    except NoEpisodesException as nee:
        logger.warning('No episodes found while parsing podcast')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, '_fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p

def verify_podcast_url(podcast_url):
    parsed = _fetch_feed(podcast_url)
    _validate_parsed(parsed)
    return True

def _fetch_feed(podcast_url):
    params = {
        'url': podcast_url,
        'process_text': 'markdown',
    }
    headers = {
        'Accept': 'application/json',
    }
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)
    try:
        return r.json()[0]
    except ValueError:
        logger.exception('Error while parsing response: %s', r.text)
        raise
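
# The feedservice responds with a JSON list of parsed feeds, of which only
# the first entry is used. An illustrative shape, inferred from the fields
# accessed in this module (not an exhaustive or authoritative schema):
#
#     [{
#         'title': 'Example Podcast',
#         'episodes': [
#             {'guid': 'tag:example.com,2016:ep1',
#              'released': 1454025600,
#              'files': [{'urls': ['http://example.com/ep1.mp3'],
#                         'filesize': 1234567,
#                         'mimetype': 'audio/mpeg'}]},
#         ],
#     }]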

def _validate_parsed(parsed):
    """ validates the parsed results and raises an exception if invalid

    feedparser parses pretty much everything. We reject anything that
    doesn't look like a feed"""

    if not parsed or not parsed.get('episodes', []):
        raise NoEpisodesException('no episodes found')

def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or
                                    podcast.language)
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
        podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warning('subscribing to hub failed: %s', str(se))

    if not podcast.slug:
        slug = PodcastSlug(podcast).get_slug()
        if slug:
            podcast.add_slug(slug)

    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)

def _update_categories(podcast, prev_timestamp):
    """ checks some practical requirements and updates a category """

    max_timestamp = datetime.utcnow() + timedelta(days=1)

    # no episodes at all
    if not podcast.latest_episode_timestamp:
        return

    # no new episode
    if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
        return

    # too far in the future
    if podcast.latest_episode_timestamp > max_timestamp:
        return

    # not enough subscribers
    if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
        return

    update_category(podcast)

def _update_episodes(podcast, parsed_episodes):

    pid = podcast.get_id()

    # episodes that have been updated from the parsed feed
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d due to missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(parsed_episodes))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode) """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
        })\
        .order_by('-has_released', '-released', 'pk')\
        .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from highest (most recent) down to 0 (oldest);
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from %s to %s', episode.order, new_order)
        episode.order = new_order
        episode.save()

    return num_episodes - 1
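
# For example (values illustrative): a podcast with episode_count == 3 and
# releases at t3 > t2 > t1 gets orders 2, 1 and 0 respectively, and
# _order_episodes returns 2. Episodes without a release date sort last
# because of the '-has_released' ordering above, so they receive the lowest
# order values.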

def _save_podcast_logo(cover_art):
    if not cover_art:
        return

    try:
        image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file; the image is binary, so open in 'rb'
        # mode and use an empty bytestring when there is no previous file
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                old_hash = file_hash(f).digest()
        else:
            old_hash = b''

        logger.info('Logo %s', cover_art)

        # save new cover art
        with open(filename, 'wb') as fp:
            fp.write(urllib.request.urlopen(cover_art).read())

        # get hash of new file
        with open(filename, 'rb') as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for f in thumbnails:
                os.unlink(f)

        return cover_art

    except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
            http.client.HTTPException, socket.error, IOError) as e:
        logger.warning('Exception while updating podcast logo: %s', str(e))

def _mark_outdated(podcast, msg=''):
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
    _update_episodes(podcast, [])

def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.get('files', []):
        if f.get('urls', []):
            return f['urls'][0]
    return None
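
# For example (data illustrative), given
#
#     parsed_episode = {'files': [
#         {'urls': []},
#         {'urls': ['http://example.com/ep1.mp3',
#                   'http://example.com/mirror/ep1.mp3']},
#     ]}
#
# get_episode_url returns 'http://example.com/ep1.mp3': the first URL of
# the first file that has any URLs at all.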

def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    episode.filesize = parsed_episode['files'][0]['filesize']
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    episode.mimetypes = ','.join(list(set(filter(None, [
        f['mimetype'] for f in parsed_episode.get('files', [])]))))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)

def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()

def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
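
# Worked example (numbers illustrative): 12 episodes whose earliest release
# was 30 days (720 hours) ago yield int(720 / 12) == 60, i.e. a 60h update
# interval, which is then clamped to the range
# [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].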

def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name