mygpo/data/feeddownloader.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from mygpo.utils import file_hash, split_list, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_uncached
from mygpo.db.couchdb.podcast import podcast_for_url, podcast_by_id_uncached, \
    reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database

import logging
logger = logging.getLogger(__name__)


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        """ Sets up the updater with the main database """
        self.db = get_main_database()


    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)


    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException) as ex:

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p)
                return

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        self._update_podcast(p, parsed)
        return p


    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True


    def _fetch_feed(self, podcast_url):
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())


    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')


    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        old_json = copy.deepcopy(podcast.to_json())
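
        # parsed values take precedence; an empty parser result never
        # erases existing data (hence the `or` fallbacks below)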
        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license
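
        # the feed reported a new location: either it already belongs to
        # a different podcast, or this podcast should claim the new URL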
        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            if new_podcast and new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

            elif not new_podcast:
                podcast.urls.insert(0, podcast.new_location)

        episodes = self._update_episodes(podcast, parsed.episodes)

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)
        if eps:
            podcast.latest_episode_timestamp = eps[-1].released
        podcast.episode_count = len(eps)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()
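
        # hub subscription and slug assignment run on every update,
        # not only when the podcast document has changed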
        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)


    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)


    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_episodes(self, podcast, parsed_episodes):

        all_episodes = set(episodes_for_podcast_uncached(podcast))
        remaining = list(all_episodes)
        updated_episodes = []
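
        # each parsed episode is matched against the existing ones,
        # by GUID first and by enclosure URL second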
        for parsed_episode in parsed_episodes:

            url = None

            for f in parsed_episode.files:
                if f.urls:
                    url = f.urls[0]

            if not url:
                continue

            guid = parsed_episode.guid

            # pop matching episodes out of the "existing" list
            matching, remaining = split_list(remaining,
                lambda e: (e.guid and e.guid == guid) or url in e.urls)

            if not matching:
                new_episode = episode_for_podcast_id_url(podcast.get_id(),
                        url, create=True)
                matching = [new_episode]
                all_episodes.add(new_episode)

            for episode in matching:
                old_json = copy.deepcopy(episode.to_json())

                episode.guid = parsed_episode.guid or episode.guid
                episode.title = parsed_episode.title or episode.title
                episode.description = parsed_episode.description or episode.description
                episode.subtitle = parsed_episode.subtitle or episode.subtitle
                episode.content = parsed_episode.content or parsed_episode.description or episode.content
                episode.link = parsed_episode.link or episode.link
                episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released
                episode.author = parsed_episode.author or episode.author
                episode.duration = parsed_episode.duration or episode.duration
                episode.filesize = parsed_episode.files[0].filesize
                episode.language = parsed_episode.language or episode.language
                episode.mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
                episode.flattr_url = parsed_episode.flattr or episode.flattr_url
                episode.license = parsed_episode.license or episode.license

                urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
                episode.urls = sorted(set(episode.urls + urls), key=len)

                if not deep_eq(old_json, episode.to_json()):
                    episode.last_update = datetime.utcnow()
                    updated_episodes.append(episode)

        outdated_episodes = all_episodes - set(updated_episodes)

        # set episodes to be outdated, where necessary
        for e in filter(lambda e: not e.outdated, outdated_episodes):
            e.outdated = True
            updated_episodes.append(e)

        if updated_episodes:
            logger.info('Updating %d episodes', len(updated_episodes))
            self.db.save_docs(updated_episodes)

        return all_episodes


    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return
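
        # cover art is stored under a path derived from the SHA1 of its
        # URL; comparing file hashes below detects whether the image changed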
        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while updating podcast: %s', str(e))


    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])
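

# A minimal usage sketch (an illustrative addition, not part of the original
# module): feed URLs would normally come from a maintenance command or an
# update queue rather than the command line.
if __name__ == '__main__':
    import sys

    logging.basicConfig(level=logging.INFO)

    updater = PodcastUpdater()

    # update_queue() is a generator, so iterate over it to trigger the
    # actual fetching and updating; update() yields None for URLs whose
    # podcast was only marked outdated
    for podcast in updater.update_queue(sys.argv[1:]):
        if podcast:
            logger.info('updated %s', podcast)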