# NOTE(review): the following lines were gitweb page residue fused into the
# source during extraction; kept here as comments so the module parses.
# [mygpo.git] / mygpo / data / feeddownloader.py
# blob 6b94eb13e353b36cdbf2750024ed1f80f3c4aaa7
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import copy
21 import os.path
22 import urllib2
23 import httplib
24 import hashlib
25 from datetime import datetime
26 from itertools import chain
28 from django.conf import settings
30 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
31 PodcastSlug
32 from feedservice.parse import parse_feed, FetchFeedException
33 from feedservice.parse.text import ConvertMarkdown
34 from feedservice.parse.models import ParserException
35 from mygpo.utils import file_hash, split_list, deep_eq
36 from mygpo.web.logo import CoverArt
37 from mygpo.data.podcast import subscribe_at_hub
38 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
39 episodes_for_podcast_uncached
40 from mygpo.db.couchdb.podcast import podcast_for_url
41 from mygpo.directory.tags import update_category
42 from mygpo.decorators import repeat_on_conflict
43 from mygpo.db.couchdb import get_main_database
45 import logging
46 logger = logging.getLogger(__name__)
class NoPodcastCreated(Exception):
    """ Raised when a new URL yields no podcast object.

    update() raises this instead of returning None so that
    update_queue() can log and skip the offending URL. """
class NoEpisodesException(Exception):
    """ Raised when a parsed document contains no episodes.

    Used by _validate_parsed() to reject documents that feedparser
    accepted but that do not look like an actual podcast feed. """
class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds

    Feed fetching/parsing is delegated to the feedservice library; this
    class merges the parsed data into the podcast/episode documents and
    performs related housekeeping (categories, slugs, logos, hub
    subscription).
    """

    def __init__(self):
        """ Queue is an iterable of podcast objects """
        # episodes are batch-saved through the main database
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable

        Yields the updated podcast for each URL; URLs for which no
        podcast could be created are logged and skipped. """

        for n, podcast_url in enumerate(queue):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL

        Returns the updated podcast object. Raises NoPodcastCreated when
        the feed cannot be fetched/parsed and no podcast exists yet for
        the URL; an existing podcast is marked outdated instead. """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException,
                NoEpisodesException) as ex:
            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p)
                return
            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        self._update_podcast(p, parsed)
        return p

    def verify_podcast_url(self, podcast_url):
        """ Returns True if the URL points to a parseable podcast feed

        Raises the fetch/parse exceptions otherwise. """
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        """ fetches and parses the feed at the given URL """
        # Markdown in feed texts is converted by the feedservice
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    @repeat_on_conflict(['podcast'])
    def _update_podcast(self, podcast, parsed):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # snapshot the document so we only save when something changed
        old_json = copy.deepcopy(podcast.to_json())

        # parsed values win; fall back to the current value when missing
        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = (parsed.common_title or
                                        podcast.common_episode_title)
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            # BUGFIX: the None-check must come first. Previously
            # ``new_podcast != podcast`` was evaluated first, so a missing
            # (None) podcast at the new location also marked this podcast
            # outdated, and the URL-insert branch was unreachable.
            if new_podcast is None:
                # nothing exists at the new location yet; record it as an
                # additional (preferred) URL of this podcast
                podcast.urls.insert(0, podcast.new_location)

            elif new_podcast != podcast:
                # the feed moved to a URL owned by a different podcast
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

        episodes = self._update_episodes(podcast, parsed.episodes)

        # latest episode timestamp (only episodes with a release date count)
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)
        if eps:
            podcast.latest_episode_timestamp = eps[-1].released
        podcast.episode_count = len(eps)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()

        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        from datetime import timedelta

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    @repeat_on_conflict(['podcast'])
    def _update_episodes(self, podcast, parsed_episodes):
        """ merges parsed episodes into the podcast's episode documents

        Returns the set of all episodes of the podcast; episodes that the
        feed no longer references are marked as outdated. """

        all_episodes = set(episodes_for_podcast_uncached(podcast))
        remaining = list(all_episodes)
        updated_episodes = []

        for parsed_episode in parsed_episodes:

            url = None

            # use a file URL as the episode's identifying URL
            for f in parsed_episode.files:
                if f.urls:
                    url = f.urls[0]

            if not url:
                continue

            guid = parsed_episode.guid

            # pop matching episodes out of the "existing" list
            matching, remaining = split_list(remaining, lambda e:
                    (e.guid and e.guid == guid) or url in e.urls)

            if not matching:
                new_episode = episode_for_podcast_id_url(podcast.get_id(),
                        url, create=True)
                matching = [new_episode]
                all_episodes.add(new_episode)

            for episode in matching:
                old_json = copy.deepcopy(episode.to_json())

                episode.guid = parsed_episode.guid or episode.guid
                episode.title = parsed_episode.title or episode.title
                episode.description = (parsed_episode.description or
                                       episode.description)
                episode.subtitle = (parsed_episode.subtitle or
                                    episode.subtitle)
                episode.content = (parsed_episode.content or
                                   parsed_episode.description or
                                   episode.content)
                episode.link = parsed_episode.link or episode.link
                episode.released = (
                    datetime.utcfromtimestamp(parsed_episode.released)
                    if parsed_episode.released else episode.released)
                episode.author = parsed_episode.author or episode.author
                episode.duration = (parsed_episode.duration or
                                    episode.duration)
                episode.filesize = parsed_episode.files[0].filesize
                episode.language = (parsed_episode.language or
                                    episode.language)
                episode.mimetypes = list(set(filter(None,
                    [f.mimetype for f in parsed_episode.files])))
                episode.flattr_url = (parsed_episode.flattr or
                                      episode.flattr_url)
                episode.license = parsed_episode.license or episode.license

                urls = list(chain.from_iterable(f.urls for f in
                    parsed_episode.files))
                episode.urls = sorted(set(episode.urls + urls), key=len)

                # only save episodes that actually changed
                if not deep_eq(old_json, episode.to_json()):
                    episode.last_update = datetime.utcnow()
                    updated_episodes.append(episode)

        # BUGFIX: episodes no longer referenced by the feed are exactly
        # those left unmatched in ``remaining``. The previous computation
        # (all_episodes - set(updated_episodes)) also marked episodes that
        # are still in the feed but merely unchanged as outdated.
        outdated_episodes = set(remaining)

        # set episodes to be outdated, where necessary
        for e in filter(lambda e: not e.outdated, outdated_episodes):
            e.outdated = True
            updated_episodes.append(e)

        if updated_episodes:
            logger.info('Updating %d episodes', len(updated_episodes))
            self.db.save_docs(updated_episodes)

        return all_episodes

    def _save_podcast_logo(self, cover_art):
        """ downloads and stores the podcast logo

        Returns the cover-art URL on success, None when there is no URL
        or the download failed. Existing thumbnails are removed when the
        image content changed. """

        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            # presumably ensures the target directory exists — kept for
            # its side effect; TODO confirm against CoverArt.get_dir
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file; binary mode because this is
            # image data, not text
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art (binary mode, see above)
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix,
                                                              filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while updating podcast: %s', str(e))

    @repeat_on_conflict(['podcast'])
    def _mark_outdated(self, podcast, msg=''):
        """ marks the podcast and all of its episodes as outdated """
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        # an empty episode list makes _update_episodes outdate everything
        self._update_episodes(podcast, [])