#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from mygpo.core.models import DEFAULT_UPDATE_INTERVAL, MIN_UPDATE_INTERVAL, \
    MAX_UPDATE_INTERVAL
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from feedservice.parse.vimeo import VimeoError
from mygpo.utils import file_hash, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.pubsub.models import SubscriptionError
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_current, episode_count_for_podcast
from mygpo.db.couchdb.podcast import podcast_for_url, reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database, bulk_save_retry

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)
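
    # Usage sketch (the feed URL below is a placeholder, not a real feed):
    #
    #     updater = PodcastUpdater()
    #     for podcast in updater.update_queue(
    #             ['http://example.com/feed.xml']):
    #         logger.info('updated %s', podcast)
    #
    # URLs for which no podcast could be created are logged and skipped,
    # so the loop only sees successfully updated podcast objects.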

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error, urllib2.HTTPError) as ex:
            # ValueError is raised by feedservice for invalid IPv6 addresses

            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        # temporarily lower the global socket timeout for the fetch;
        # try/finally ensures it is restored even when parse_feed raises
        timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(10)
        try:
            return parse_feed(podcast_url, text_processor=ConvertMarkdown())
        finally:
            socket.setdefaulttimeout(timeout)

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed """

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            if not new_podcast:
                # the new URL is not known as a podcast yet; record it
                podcast.urls.insert(0, podcast.new_location)

            elif new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

        logger.info('Retrieved %d episodes in total', len(episodes))

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)

        podcast.update_interval = get_update_interval(eps)

        if eps:
            podcast.latest_episode_timestamp = eps[-1].released

        podcast.episode_count = episode_count_for_podcast(podcast)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warn('subscribing to hub failed: %s', str(se))

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # list of (obj, fun) where fun is the function to update obj
        changes = []
        episodes_to_update = list(islice(parsed_episodes, 0,
                                         MAX_EPISODES_UPDATE))
        logger.info('Parsed %d episodes, updating %d',
                    len(parsed_episodes), len(episodes_to_update))

        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n,
                        len(episodes_to_update))
            episode = episode_for_podcast_id_url(pid, url, create=True)

            update_episode = get_episode_update_function(parsed, episode,
                                                         podcast)
            changes.append((episode, update_episode))

        # determine which episodes have been found
        updated_episodes = [e for (e, f) in changes]
        logger.info('Updating %d episodes with new data',
                    len(updated_episodes))

        # and mark the remaining ones outdated
        current_episodes = set(episodes_for_podcast_current(podcast,
                                                            limit=500))
        outdated_episodes = current_episodes - set(updated_episodes)
        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        changes.extend((e, mark_outdated) for e in outdated_episodes)

        logger.info('Saving %d changes', len(changes))
        bulk_save_retry(changes, self.db)

        return updated_episodes

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art; use binary mode so the image data is not
            # mangled on platforms that translate line endings
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix,
                                                              filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error, IOError) as e:
            logger.warn('Exception while updating podcast logo: %s', str(e))

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
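
# Behaviour sketch for get_episode_url (with hypothetical parser objects):
# an episode whose first file has urls == ['http://example.com/e1.mp3']
# returns that URL; an episode whose files carry no URLs returns None and
# is skipped by _update_episodes.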


def get_episode_update_function(parsed_episode, episode, podcast):
    """ returns an update function that can be passed to bulk_save_retry """

    def update_episode(episode):
        """ updates "episode" with the data from "parsed_episode" """

        # copy the json so we can determine if there have been any changes
        old_json = copy.deepcopy(episode.to_json())

        episode.guid = parsed_episode.guid or episode.guid
        episode.description = parsed_episode.description or \
            episode.description
        episode.subtitle = parsed_episode.subtitle or episode.subtitle
        episode.content = parsed_episode.content or \
            parsed_episode.description or episode.content
        episode.link = parsed_episode.link or episode.link
        if parsed_episode.released:
            episode.released = datetime.utcfromtimestamp(
                parsed_episode.released)
        episode.author = parsed_episode.author or episode.author
        episode.duration = parsed_episode.duration or episode.duration
        episode.filesize = parsed_episode.files[0].filesize
        episode.language = parsed_episode.language or episode.language or \
            podcast.language
        episode.mimetypes = list(set(filter(None,
            [f.mimetype for f in parsed_episode.files])))
        episode.flattr_url = parsed_episode.flattr or episode.flattr_url
        episode.license = parsed_episode.license or episode.license

        urls = list(chain.from_iterable(f.urls
                                        for f in parsed_episode.files))
        episode.urls = sorted(set(episode.urls + urls), key=len)

        episode.title = parsed_episode.title or episode.title or \
            file_basename_no_extension(episode.url)

        # if nothing changed we return None to indicate no required action
        if deep_eq(old_json, episode.to_json()):
            return None

        # set last_update only if there have been changes above
        episode.last_update = datetime.utcnow()
        return episode

    return update_episode
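
# The returned closure is paired with its episode in the "changes" list
# built by _update_episodes; bulk_save_retry then invokes it (presumably
# retrying on write conflicts, as the name suggests), and a None result
# means there is nothing to save.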


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    return obj


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
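
# Worked example for get_update_interval (illustrative numbers): 20
# episodes whose earliest release was 60 days (= 1440h) ago yield
# int(1440 / 20) = 72, i.e. a 72h update interval, which is then
# clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].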


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name