set episode language from podcast as fallback
[mygpo.git] / mygpo / data / feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from mygpo.utils import file_hash, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_current
from mygpo.db.couchdb.podcast import podcast_for_url, reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database, bulk_save_retry

import logging
logger = logging.getLogger(__name__)


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        """ Initializes the updater with the main database connection """
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)
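
    # A minimal usage sketch (hypothetical driver code, not part of this
    # module). update_queue() is a generator, so updates only happen while
    # it is being consumed:
    #
    #     updater = PodcastUpdater()
    #     for podcast in updater.update_queue(feed_urls):  # any URL iterable
    #         pass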

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException,
                NoEpisodesException) as ex:

            # if we fail to fetch or parse the feed, we don't even create
            # the podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                    str(ex))
                return p

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed """

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')
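
    # Note on repeat_on_conflict below: the decorator is assumed to catch a
    # CouchDB save conflict, reload the podcast via reload_podcast and re-run
    # the method, so its body must be safe to execute repeatedly.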

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # copy the json so we can determine afterwards if anything changed
        old_json = copy.deepcopy(podcast.to_json())

        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            # if the new location is not associated with any podcast yet,
            # claim it for this one; if it already belongs to a different
            # podcast, this one is outdated
            if not new_podcast:
                podcast.urls.insert(0, podcast.new_location)

            elif new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

        logger.info('Retrieved %d episodes in total', len(episodes))

        # determine the timestamp of the latest released episode
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)
        if eps:
            podcast.latest_episode_timestamp = eps[-1].released
        podcast.episode_count = len(eps)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # save the podcast only if it actually changed
        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()

        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # list of (obj, fun) where fun is the function to update obj
        changes = []
        logger.info('Parsed %d episodes', len(parsed_episodes))

        for n, parsed in enumerate(parsed_episodes, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d: no URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))
            episode = episode_for_podcast_id_url(pid, url, create=True)

            update_episode = get_episode_update_function(parsed, episode,
                                                         podcast)
            changes.append((episode, update_episode))

        # determine which episodes have been found
        updated_episodes = [e for (e, f) in changes]
        logger.info('Updating %d episodes with new data',
                    len(updated_episodes))

        # and mark the remaining ones outdated
        current_episodes = set(episodes_for_podcast_current(podcast,
                                                            limit=100))
        outdated_episodes = current_episodes - set(updated_episodes)
        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        changes.extend((e, mark_outdated) for e in outdated_episodes)

        logger.info('Saving %d changes', len(changes))
        bulk_save_retry(changes, self.db)

        return updated_episodes
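
    # The (obj, fun) pairs above reflect the contract assumed for
    # bulk_save_retry: each function receives a (possibly reloaded) document
    # and returns the modified document to save, or None for "no change".
    # Roughly:
    #
    #     changes = [(episode, update_episode), (old_episode, mark_outdated)]
    #     bulk_save_retry(changes, db)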

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while saving podcast logo: %s', str(e))

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()

        # updating with an empty episode list marks all current episodes
        # as outdated, too
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
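
# Illustration with hypothetical parser objects: for an episode whose files
# are [File(urls=['http://example.com/e1.mp3'])], get_episode_url returns
# 'http://example.com/e1.mp3'; files without URLs are skipped, and an
# episode with no usable file yields None.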


def get_episode_update_function(parsed_episode, episode, podcast):
    """ returns an update function that can be passed to bulk_save_retry """

    def update_episode(episode):
        """ updates "episode" with the data from "parsed_episode" """

        # copy the json so we can determine if there have been any changes
        old_json = copy.deepcopy(episode.to_json())

        episode.guid = parsed_episode.guid or episode.guid
        episode.title = parsed_episode.title or episode.title
        episode.description = parsed_episode.description or \
            episode.description
        episode.subtitle = parsed_episode.subtitle or episode.subtitle
        episode.content = parsed_episode.content or \
            parsed_episode.description or episode.content
        episode.link = parsed_episode.link or episode.link
        episode.released = (datetime.utcfromtimestamp(parsed_episode.released)
                            if parsed_episode.released else episode.released)
        episode.author = parsed_episode.author or episode.author
        episode.duration = parsed_episode.duration or episode.duration
        episode.filesize = parsed_episode.files[0].filesize
        # fall back to the podcast's language if neither the parsed nor the
        # stored episode specifies one
        episode.language = parsed_episode.language or episode.language or \
            podcast.language
        episode.mimetypes = list(set(filter(None,
            [f.mimetype for f in parsed_episode.files])))
        episode.flattr_url = parsed_episode.flattr or episode.flattr_url
        episode.license = parsed_episode.license or episode.license

        urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
        episode.urls = sorted(set(episode.urls + urls), key=len)

        # if nothing changed we return None to indicate no required action
        if deep_eq(old_json, episode.to_json()):
            return None

        # set last_update only if there have been changes above
        episode.last_update = datetime.utcnow()
        return episode

    return update_episode
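
# Both update_episode above and mark_outdated below follow the same
# convention assumed by bulk_save_retry: returning None signals that the
# document is unchanged and need not be saved.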


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    return obj