Fix calculation of episode_count during podcast updates
[mygpo.git] / mygpo / data / feeddownloader.py
blobcbd8dd9b8f2aa18be750599ec70f0f2ef83fcec1
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib.request, urllib.error, urllib.parse
22 from urllib.parse import urljoin
23 import http.client
24 import hashlib
25 from datetime import datetime, timedelta
26 from itertools import chain, islice
27 import socket
28 import requests
30 from django.db import transaction
31 from django.conf import settings
33 from mygpo.podcasts.models import Podcast, URL, Slug, Episode
34 from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
35 from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
36 MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
37 from mygpo.utils import file_hash, to_maxlength
38 from mygpo.web.logo import CoverArt
39 from mygpo.data.podcast import subscribe_at_hub
40 from mygpo.data.tasks import update_related_podcasts
41 from mygpo.pubsub.models import SubscriptionError
42 from mygpo.directory.tags import update_category
44 import logging
45 logger = logging.getLogger(__name__)
# upper bound on the number of episodes processed per feed update
# (see _update_episodes, which islices the parsed episode list)
MAX_EPISODES_UPDATE = 200
class UpdatePodcastException(Exception):
    """ raised when updating a podcast fails """
    pass
class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL

    Raised by update_podcast() when the feed contains no episodes and no
    Podcast exists for the URL yet; the original parse error is passed
    as the exception argument. """
class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes

    Raised by _validate_parsed() for empty or episode-less parse results. """
def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable

    Yields the result of update_podcast() for each URL.  URLs for which
    no podcast could be created are logged and skipped; any other error
    is logged and re-raised. """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except Exception:
            # was a bare ``except:``; narrowed so that e.g.
            # KeyboardInterrupt propagates without being logged here
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise
def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL

    Returns the updated Podcast object, or None when the feedservice
    could not be reached.  Raises NoPodcastCreated when the feed has no
    episodes and no podcast exists for the URL yet. """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException:
        # use the module-level logger, not the root logger
        logger.exception('Error while fetching response from feedservice')
        return

    except NoEpisodesException as nee:
        logger.warning('No episode found while parsing podcast')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))

    # episode_count may have changed while creating/updating episodes;
    # re-read the row and store the authoritative count
    p.refresh_from_db()
    p.episode_count = Episode.objects.filter(podcast=p).count()
    p.save()

    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p
def verify_podcast_url(podcast_url):
    """ Check that *podcast_url* points to a parseable feed.

    Fetch/validation errors propagate to the caller; returns True
    when the feed was fetched and validated successfully. """
    _validate_parsed(_fetch_feed(podcast_url))
    return True
def _fetch_feed(podcast_url):
    """ Fetch parsed feed data for *podcast_url* from the feedservice.

    Returns the first entry of the feedservice's JSON response.  Raises
    a requests exception on transport errors and ValueError when the
    response body is not valid JSON. """
    params = {
        'url': podcast_url,
        'process_text': 'markdown',
    }
    headers = {
        'Accept': 'application/json',
    }
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)
    try:
        return r.json()[0]
    except ValueError:
        # logging uses %-style lazy formatting; the original '{}' was
        # never interpolated, so r.text was silently dropped
        logger.exception('Error while parsing response: %s', r.text)
        raise
138 def _validate_parsed(parsed):
139 """ validates the parsed results and raises an exception if invalid
141 feedparser parses pretty much everything. We reject anything that
142 doesn't look like a feed"""
144 if not parsed or not parsed.get('episodes', []):
145 raise NoEpisodesException('no episodes found')
def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results

    Merges the parsed feed data into *podcast* (missing fields keep
    their current value), handles feed relocation, refreshes the update
    interval and latest-episode timestamp, stores the logo, saves the
    podcast and kicks off slug/related-podcast maintenance. """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    # default to an empty list so a feed without content_types doesn't
    # crash the join; the empty string then keeps the current value
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
        podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here; it is recalculated in
    # update_podcast() after all episodes have been created/updated

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        # Logger.warn is deprecated; use warning()
        logger.warning('subscribing to hub failed: %s', str(se))

    if not podcast.slug:
        slug = PodcastSlug(podcast).get_slug()
        if slug:
            podcast.add_slug(slug)

    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)
231 def _update_categories(podcast, prev_timestamp):
232 """ checks some practical requirements and updates a category """
234 max_timestamp = datetime.utcnow() + timedelta(days=1)
236 # no episodes at all
237 if not podcast.latest_episode_timestamp:
238 return
240 # no new episode
241 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
242 return
244 # too far in the future
245 if podcast.latest_episode_timestamp > max_timestamp:
246 return
248 # not enough subscribers
249 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
250 return
252 update_category(podcast)
def _update_episodes(podcast, parsed_episodes):
    """ Create/update the podcast's episodes from parsed feed data.

    Only the first MAX_EPISODES_UPDATE parsed episodes are processed.
    Existing non-outdated episodes (up to 500) that did not appear in
    the feed are marked outdated.  Returns the list of episodes that
    were created or updated — the caller (update_podcast) consumes it,
    so the return statement is required. """

    # episodes that were created or updated during this run
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(parsed_episodes))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

    return updated_episodes
@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode), or 0 when the podcast has no episodes. """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    # sort episodes with a known release date first, newest first;
    # 'pk' breaks ties deterministically
    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
    }).order_by('-has_released', '-released', 'pk')\
      .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1
324 def _save_podcast_logo(cover_art):
325 if not cover_art:
326 return
328 try:
329 image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
330 prefix = CoverArt.get_prefix(image_sha1)
332 filename = CoverArt.get_original(prefix, image_sha1)
333 dirname = CoverArt.get_dir(filename)
335 # get hash of existing file
336 if os.path.exists(filename):
337 with open(filename) as f:
338 old_hash = file_hash(f).digest()
339 else:
340 old_hash = ''
342 logger.info('Logo %s', cover_art)
344 # save new cover art
345 with open(filename, 'wb') as fp:
346 fp.write(urllib.request.urlopen(cover_art).read())
348 # get hash of new file
349 with open(filename) as f:
350 new_hash = file_hash(f).digest()
352 # remove thumbnails if cover changed
353 if old_hash != new_hash:
354 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
355 logger.info('Removing %d thumbnails', len(thumbnails))
356 for f in thumbnails:
357 os.unlink(f)
359 return cover_art
361 except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
362 http.client.HTTPException, socket.error, IOError) as e:
363 logger.warn('Exception while updating podcast logo: %s', str(e))
def _mark_outdated(podcast, msg=''):
    """ Flag *podcast* as outdated and clear out its episode list.

    *msg* is only used for logging the reason. """
    logger.info('marking podcast outdated: %s', msg)
    podcast.last_update = datetime.utcnow()
    podcast.outdated = True
    podcast.save()
    # passing no parsed episodes marks every current episode outdated
    _update_episodes(podcast, [])
def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for file_entry in parsed_episode.get('files', []):
        urls = file_entry.get('urls', [])
        if urls:
            return urls[0]
    return None
def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode"

    Fields missing from the parsed data keep their current value.
    Saves the episode and registers any file URLs not yet known. """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    # NOTE(review): assumes at least one file with a 'filesize' key;
    # get_episode_url() only guarantees a file with URLs — confirm
    episode.filesize = parsed_episode['files'][0]['filesize']
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    # collect the distinct, non-empty mimetypes of all files;
    # .get avoids a KeyError for files without a mimetype
    episode.mimetypes = ','.join(list(set(
        filter(None, [f.get('mimetype')
                      for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)
def mark_outdated(obj):
    """ Mark *obj* as outdated unless it already is.

    Saves the object after flagging it; does nothing (returns None)
    when the object is already outdated. """
    if obj.outdated:
        return None

    obj.last_update = datetime.utcnow()
    obj.outdated = True
    obj.save()
def get_update_interval(episodes):
    """ Calculate the average interval (in hours) between episodes.

    Falls back to DEFAULT_UPDATE_INTERVAL when *episodes* is empty; the
    result is clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].
    *episodes* must be ordered by release date, oldest first. """

    num_episodes = len(episodes)
    if not num_episodes:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    # hours elapsed from the earliest release until now
    oldest = episodes[0]
    hours = (datetime.utcnow() - oldest.released).total_seconds() / 60 / 60

    raw_interval = int(hours / num_episodes)
    logger.info('%d episodes in %d days => %dh interval', num_episodes,
                hours / 24, raw_interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    return min(max(raw_interval, MIN_UPDATE_INTERVAL), MAX_UPDATE_INTERVAL)
def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name