[Feeds] fix variable name in feeddownloader
[mygpo.git] / mygpo/data/feeddownloader.py
blob 6f24fb1bd572ce86897826bbf581389c20388da8
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import os.path
import urllib2
from urlparse import urljoin
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket
import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise
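
# A rough usage sketch (the URL is made up); update_podcasts is a generator,
# so feeds are only fetched as the caller iterates over the results:
#
#     for podcast in update_podcasts(['http://example.com/feed.xml']):
#         logger.info('updated %s', podcast)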


def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        logger.exception('Error while fetching response from feedservice')
        raise NoPodcastCreated(re)

    except NoEpisodesException as nee:
        logger.warn('No episode found while parsing podcast')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p


def verify_podcast_url(podcast_url):
    parsed = _fetch_feed(podcast_url)
    _validate_parsed(parsed)
    return True


def _fetch_feed(podcast_url):
    params = {'url': podcast_url}
    headers = {
        'Accept': 'application/json',
    }
    # markdown and other parameters?
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)
    return r.json()[0]
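
# The feedservice 'parse' endpoint responds with a JSON list containing one
# parsed-feed dict per requested URL, hence the [0] above. The sketch below
# only shows the fields this module actually reads; it is an assumption
# about the response shape, not the full feedservice schema:
#
#     [{
#         'title': '...',
#         'urls': ['...'],
#         'episodes': [
#             {'guid': '...', 'title': '...', 'released': 1409419200,
#              'files': [{'urls': ['...'], 'filesize': 0, 'mimetype': '...'}]},
#         ],
#     }]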


def _validate_parsed(parsed):
    """ validates the parsed results and raises an exception if invalid

    feedparser parses pretty much everything. We reject anything that
    doesn't look like a feed"""

    if not parsed or not parsed.get('episodes', []):
        raise NoEpisodesException('no episodes found')
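
# For illustration: _validate_parsed({}) and _validate_parsed({'episodes': []})
# both raise NoEpisodesException, while any dict with a non-empty 'episodes'
# list passes.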


def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    podcast.content_types = ','.join(parsed.get('content_types') or
                                     podcast.content_types)
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warn('subscribing to hub failed: %s', str(se))

    if not podcast.slug:
        slug = PodcastSlug(podcast).get_slug()
        if slug:
            podcast.add_slug(slug)

    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)


def _update_categories(podcast, prev_timestamp):
    """ checks some practical requirements and updates a category """

    max_timestamp = datetime.utcnow() + timedelta(days=1)

    # no episodes at all
    if not podcast.latest_episode_timestamp:
        return

    # no new episode
    if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
        return

    # too far in the future
    if podcast.latest_episode_timestamp > max_timestamp:
        return

    # not enough subscribers
    if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
        return

    update_category(podcast)


def _update_episodes(podcast, parsed_episodes):

    pid = podcast.get_id()

    # list of episode objects that were updated from the parsed feed
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(parsed_episodes))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)


@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode) """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
        })\
        .order_by('-has_released', '-released', 'pk')\
        .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1
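
# Worked example: for a podcast with episode_count == 3, the episodes are
# iterated newest-first and assigned order 2, 1, 0, and the function returns
# num_episodes - 1 == 2. Episodes without a release date sort last because
# of the '-has_released' ordering.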


def _save_podcast_logo(cover_art):
    if not cover_art:
        return

    try:
        image_sha1 = hashlib.sha1(cover_art).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file
        if os.path.exists(filename):
            with open(filename) as f:
                old_hash = file_hash(f).digest()
        else:
            old_hash = ''

        logger.info('Logo %s', cover_art)

        # save new cover art
        with open(filename, 'w') as fp:
            fp.write(urllib2.urlopen(cover_art).read())

        # get hash of new file
        with open(filename) as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for f in thumbnails:
                os.unlink(f)

        return cover_art

    except (urllib2.HTTPError, urllib2.URLError, ValueError,
            httplib.BadStatusLine, socket.error, IOError) as e:
        logger.warn('Exception while updating podcast logo: %s', str(e))
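
# Note for callers: _save_podcast_logo returns the cover_art URL on success
# and falls through to an implicit None on a missing URL or any of the
# errors above, which is what lets _update_podcast reset logo_url when the
# download fails.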


def _mark_outdated(podcast, msg=''):
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
    _update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.get('files', []):
        if f.get('urls', []):
            return f['urls'][0]
    return None
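
# For illustration (values made up), in the doctest style used at the bottom
# of this file:
#
#     >>> get_episode_url({'files': [{'urls': ['http://example.com/e1.mp3']}]})
#     'http://example.com/e1.mp3'
#
# A parsed episode whose files carry no URLs yields None.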


def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    episode.filesize = parsed_episode['files'][0]['filesize']
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    episode.mimetypes = ','.join(list(set(
        filter(None, [f['mimetype'] for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
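
# Worked example (made-up numbers): 12 episodes whose earliest release was
# 30 days (720 hours) ago give int(720 / 12) = 60, i.e. a 60h update
# interval, which is then clamped into [MIN_UPDATE_INTERVAL,
# MAX_UPDATE_INTERVAL].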


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name