handle socket.error in feeddownloader
mygpo/data/feeddownloader.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime
from itertools import chain, islice
import socket

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from mygpo.core.models import DEFAULT_UPDATE_INTERVAL, MIN_UPDATE_INTERVAL, \
    MAX_UPDATE_INTERVAL
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from feedservice.parse.vimeo import VimeoError
from mygpo.utils import file_hash, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.pubsub.models import SubscriptionError
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_current, episode_count_for_podcast
from mygpo.db.couchdb.podcast import podcast_for_url, reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database, bulk_save_retry

import logging
logger = logging.getLogger(__name__)

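# cap on the number of episodes processed per feed update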
MAX_EPISODES_UPDATE = 200


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)

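    # a minimal usage sketch (the feed URL is hypothetical):
    #
    #   updater = PodcastUpdater()
    #   for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #       ...
    #
    # update_queue is a generator, so each podcast is only updated when its
    # result is consumed
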
    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error) as ex:
            # ValueError is raised by feedservice for invalid IPv6 addresses

            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
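        # feedservice does the fetching and parsing; ConvertMarkdown is a
        # text processor that (as the name suggests) converts the feed's
        # HTML text fields to Markdown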
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        old_json = copy.deepcopy(podcast.to_json())

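        # merge in the parsed values; the "x or y" idiom keeps the previous
        # value whenever the parser did not return one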
        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)
            if new_podcast and new_podcast != podcast:
                self._mark_outdated(podcast,
                                    'redirected to different podcast')
                return

            elif not new_podcast:
                podcast.urls.insert(0, podcast.new_location)

        logger.info('Retrieved %d episodes in total', len(episodes))

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)

        podcast.update_interval = get_update_interval(eps)

        if eps:
            podcast.latest_episode_timestamp = eps[-1].released

        podcast.episode_count = episode_count_for_podcast(podcast)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warn('subscribing to hub failed: %s', str(se))

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        from datetime import timedelta

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # list of (obj, fun) where fun is the function to update obj
        changes = []

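        # only the first MAX_EPISODES_UPDATE parsed episodes are considered;
        # any remaining ones are ignored in this run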
        episodes_to_update = list(islice(parsed_episodes, 0,
                                         MAX_EPISODES_UPDATE))
        logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                    len(episodes_to_update))

        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))
            episode = episode_for_podcast_id_url(pid, url, create=True)

            update_episode = get_episode_update_function(parsed, episode,
                                                         podcast)
            changes.append((episode, update_episode))

        # determine which episodes have been found
        updated_episodes = [e for (e, f) in changes]
        logger.info('Updating %d episodes with new data',
                    len(updated_episodes))

        # and mark the remaining ones outdated
        current_episodes = set(episodes_for_podcast_current(podcast,
                                                            limit=500))
        outdated_episodes = current_episodes - set(updated_episodes)
        logger.info('Marking %d episodes as outdated',
                    len(outdated_episodes))
        changes.extend((e, mark_outdated) for e in outdated_episodes)

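        # each entry in changes is a (doc, update function) pair;
        # bulk_save_retry applies the function to the doc and saves the
        # result, retrying on write conflicts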
        logger.info('Saving %d changes', len(changes))
        bulk_save_retry(changes, self.db)

        return updated_episodes

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

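        # cover_art is the logo URL; the storage path is derived from the
        # SHA1 of that URL, so every URL maps to a stable filename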
        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix,
                                                              filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error) as e:
            logger.warn('Exception while updating podcast logo: %s', str(e))

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
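        # an empty episode list causes all current episodes of the podcast
        # to be marked outdated as well (see _update_episodes)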
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None


def get_episode_update_function(parsed_episode, episode, podcast):
    """ returns an update function that can be passed to bulk_save_retry """

    def update_episode(episode):
        """ updates "episode" with the data from "parsed_episode" """

        # copy the json so we can determine if there have been any changes
        old_json = copy.deepcopy(episode.to_json())

        episode.guid = parsed_episode.guid or episode.guid
        episode.description = parsed_episode.description or \
            episode.description
        episode.subtitle = parsed_episode.subtitle or episode.subtitle
        episode.content = parsed_episode.content or \
            parsed_episode.description or episode.content
        episode.link = parsed_episode.link or episode.link
        if parsed_episode.released:
            episode.released = \
                datetime.utcfromtimestamp(parsed_episode.released)
        episode.author = parsed_episode.author or episode.author
        episode.duration = parsed_episode.duration or episode.duration
        episode.filesize = parsed_episode.files[0].filesize
        episode.language = parsed_episode.language or episode.language or \
            podcast.language
        episode.mimetypes = list(set(filter(None,
            [f.mimetype for f in parsed_episode.files])))
        episode.flattr_url = parsed_episode.flattr or episode.flattr_url
        episode.license = parsed_episode.license or episode.license

        urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
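        # sorting by length puts the shortest URL variant first; this
        # presumably makes it the episode's primary URL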
        episode.urls = sorted(set(episode.urls + urls), key=len)

        episode.title = parsed_episode.title or episode.title or \
            file_basename_no_extension(episode.url)

        # if nothing changed we return None to indicate no required action
        if deep_eq(old_json, episode.to_json()):
            return None

        # set last_update only if there have been changes above
        episode.last_update = datetime.utcnow()
        return episode

    return update_episode


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    return obj


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

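    # the caller passes the episodes sorted by release date, so the first
    # entry is the earliest one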
    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name