reduce log severity for errors during podcast updates
mygpo/data/feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from mygpo.utils import file_hash, split_list, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_uncached
from mygpo.db.couchdb.podcast import podcast_for_url
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database

import logging
logger = logging.getLogger(__name__)


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        self.db = get_main_database()


    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)
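
    # A minimal usage sketch (hypothetical driver code; the real callers
    # live elsewhere in mygpo):
    #
    #     updater = PodcastUpdater()
    #     for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #         ...
    #
    # note that update_queue is a generator, so feeds are only fetched as
    # the results are consumed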


    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException) as ex:

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p)
                return

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        self._update_podcast(p, parsed)
        return p


    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True


    def _fetch_feed(self, podcast_url):
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())
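
    # the parsed object returned by parse_feed is used below with (at
    # least) these attributes -- inferred from the usage in this module,
    # not from the feedservice API docs: title, urls, description, link,
    # logo, author, language, content_types, tags, common_title,
    # new_location, flattr, hub and episodes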


    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')
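
    # repeat_on_conflict (from mygpo.decorators) is assumed to re-run the
    # decorated method with a freshly loaded copy of the named argument
    # whenever CouchDB reports a write conflict; roughly (illustrative
    # sketch only, not the real decorator):
    #
    #     def repeat_on_conflict(names):
    #         def decorator(f):
    #             def wrapper(*args, **kwargs):
    #                 while True:
    #                     try:
    #                         return f(*args, **kwargs)
    #                     except ResourceConflict:
    #                         pass  # reload the args named in `names`, retry
    #             return wrapper
    #         return decorator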


    @repeat_on_conflict(['podcast'])
    def _update_podcast(self, podcast, parsed):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        old_json = copy.deepcopy(podcast.to_json())

        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
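
        # if the feed was redirected, there are three cases: the new
        # location already belongs to this podcast (nothing to do), it
        # belongs to a different podcast (this one is outdated), or no
        # podcast exists for it yet (adopt the URL as this podcast's
        # primary one)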
        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            if new_podcast is not None and new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

            elif new_podcast is None:
                podcast.urls.insert(0, podcast.new_location)

        episodes = self._update_episodes(podcast, parsed.episodes)

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)
        if eps:
            podcast.latest_episode_timestamp = eps[-1].released
        podcast.episode_count = len(eps)


        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()


        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)


    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)


    @repeat_on_conflict(['podcast'])
    def _update_episodes(self, podcast, parsed_episodes):

        all_episodes = set(episodes_for_podcast_uncached(podcast))
        remaining = list(all_episodes)
        updated_episodes = []

        for parsed_episode in parsed_episodes:

            url = None

            for f in parsed_episode.files:
                if f.urls:
                    url = f.urls[0]

            if not url:
                continue

            guid = parsed_episode.guid

            # pop matching episodes out of the "existing" list
            matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
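
            # split_list (from mygpo.utils) is assumed to partition a
            # list by a predicate, roughly like this illustrative sketch:
            #
            #     def split_list(l, cond):
            #         matching = [x for x in l if cond(x)]
            #         non_matching = [x for x in l if not cond(x)]
            #         return matching, non_matching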

            if not matching:
                new_episode = episode_for_podcast_id_url(podcast.get_id(),
                        url, create=True)
                matching = [new_episode]
                all_episodes.add(new_episode)


            for episode in matching:
                old_json = copy.deepcopy(episode.to_json())

                episode.guid = parsed_episode.guid or episode.guid
                episode.title = parsed_episode.title or episode.title
                episode.description = parsed_episode.description or episode.description
                episode.content = parsed_episode.content or parsed_episode.description or episode.content
                episode.link = parsed_episode.link or episode.link
                episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released
                episode.author = parsed_episode.author or episode.author
                episode.duration = parsed_episode.duration or episode.duration
                episode.filesize = parsed_episode.files[0].filesize
                episode.language = parsed_episode.language or episode.language
                episode.mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
                episode.flattr_url = parsed_episode.flattr or episode.flattr_url

                urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
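                # sorting by length keeps the shortest URL first,
                # presumably the most canonical variant (an assumption)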
                episode.urls = sorted(set(episode.urls + urls), key=len)

                if not deep_eq(old_json, episode.to_json()):
                    episode.last_update = datetime.utcnow()
                    updated_episodes.append(episode)


        outdated_episodes = all_episodes - set(updated_episodes)

        # set episodes to be outdated, where necessary
        for e in filter(lambda e: not e.outdated, outdated_episodes):
            e.outdated = True
            updated_episodes.append(e)


        if updated_episodes:
            logger.info('Updating %d episodes', len(updated_episodes))
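            # save_docs is assumed to be couchdbkit's bulk save, writing
            # all modified episodes in one request instead of one per doc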
            self.db.save_docs(updated_episodes)

        return all_episodes


    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)
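            # dirname is not used below; CoverArt.get_dir presumably
            # creates the directory as a side effect (an assumption)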

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''
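
            # file_hash (from mygpo.utils) is assumed to return a
            # hashlib-style object computed over the file contents,
            # roughly like this illustrative sketch:
            #
            #     def file_hash(f, h=hashlib.md5, block_size=2**20):
            #         hash_obj = h()
            #         for chunk in iter(lambda: f.read(block_size), ''):
            #             hash_obj.update(chunk)
            #         return hash_obj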

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while updating podcast logo: %s', str(e))


    @repeat_on_conflict(['podcast'])
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])