Update podcasts based on avg update interval
[mygpo.git] / mygpo / data / feeddownloader.py
blob 81b525726106aa95dd2dd7944938b97c56809b76
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from mygpo.core.models import DEFAULT_UPDATE_INTERVAL, MIN_UPDATE_INTERVAL, \
    MAX_UPDATE_INTERVAL
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from mygpo.utils import file_hash, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_current
from mygpo.db.couchdb.podcast import podcast_for_url, reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database, bulk_save_retry

import logging
logger = logging.getLogger(__name__)


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        """ Sets up the database connection used when saving updates """
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied in the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)
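
    # Usage sketch (hypothetical; in mygpo the queue of feed URLs is
    # normally supplied by a management command). update_queue is a
    # generator, so podcasts are updated lazily as it is consumed:
    #
    #     updater = PodcastUpdater()
    #     for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #         print podcast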

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException) as ex:

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                        str(ex))
                return p

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed """

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        old_json = copy.deepcopy(podcast.to_json())

        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            # if the new URL already belongs to a different podcast,
            # this one is a duplicate and gets marked outdated
            if new_podcast and new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

            # if no podcast exists for the new URL yet, claim it
            elif not new_podcast:
                podcast.urls.insert(0, podcast.new_location)

        logger.info('Retrieved %d episodes in total', len(episodes))

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)

        podcast.update_interval = get_update_interval(eps)

        if eps:
            podcast.latest_episode_timestamp = eps[-1].released
            podcast.episode_count = len(eps)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()

        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # list of (obj, fun) where fun is the function to update obj
        changes = []
        logger.info('Parsed %d episodes', len(parsed_episodes))

        for n, parsed in enumerate(parsed_episodes, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d: missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))
            episode = episode_for_podcast_id_url(pid, url, create=True)

            update_episode = get_episode_update_function(parsed, episode,
                    podcast)
            changes.append((episode, update_episode))

        # determine which episodes have been found
        updated_episodes = [e for (e, f) in changes]
        logger.info('Updating %d episodes with new data', len(updated_episodes))

        # and mark the remaining ones outdated
        current_episodes = set(episodes_for_podcast_current(podcast, limit=100))
        outdated_episodes = current_episodes - set(updated_episodes)
        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        changes.extend((e, mark_outdated) for e in outdated_episodes)

        logger.info('Saving %d changes', len(changes))
        bulk_save_retry(changes, self.db)

        return updated_episodes

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while saving podcast logo: %s', str(e))

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
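
# A parsed episode is expected to carry a list of files, each with a list
# of candidate URLs; the first URL of the first file that has any becomes
# the episode's identifying URL. A sketch with hypothetical stand-in
# classes (the real objects come from feedservice.parse):
#
#     >>> class F(object):
#     ...     def __init__(self, urls): self.urls = urls
#     >>> class E(object):
#     ...     def __init__(self, files): self.files = files
#     >>> get_episode_url(E([F([]), F(['http://example.com/ep1.mp3'])]))
#     'http://example.com/ep1.mp3'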


def get_episode_update_function(parsed_episode, episode, podcast):
    """ returns an update function that can be passed to bulk_save_retry """

    def update_episode(episode):
        """ updates "episode" with the data from "parsed_episode" """

        # copy the json so we can determine if there have been any changes
        old_json = copy.deepcopy(episode.to_json())

        episode.guid = parsed_episode.guid or episode.guid
        episode.title = parsed_episode.title or episode.title
        episode.description = parsed_episode.description or episode.description
        episode.subtitle = parsed_episode.subtitle or episode.subtitle
        episode.content = parsed_episode.content or parsed_episode.description or episode.content
        episode.link = parsed_episode.link or episode.link
        episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released
        episode.author = parsed_episode.author or episode.author
        episode.duration = parsed_episode.duration or episode.duration
        episode.filesize = parsed_episode.files[0].filesize
        episode.language = parsed_episode.language or episode.language or \
            podcast.language
        episode.mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
        episode.flattr_url = parsed_episode.flattr or episode.flattr_url
        episode.license = parsed_episode.license or episode.license

        urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
        episode.urls = sorted(set(episode.urls + urls), key=len)

        # if nothing changed we return None to indicate no required action
        if deep_eq(old_json, episode.to_json()):
            return None

        # set the last_update only if there have been changes above
        episode.last_update = datetime.utcnow()
        return episode

    return update_episode
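
# bulk_save_retry consumes the (obj, update_function) pairs built above:
# conceptually it applies each function to its object, bulk-saves the
# non-None results, and retries conflicting documents with freshly loaded
# copies. A rough sketch of that contract (the actual implementation lives
# in mygpo.db.couchdb and differs in detail):
#
#     def bulk_save_retry(changes, db):
#         docs = filter(None, (fun(obj) for obj, fun in changes))
#         db.save_docs(docs)  # conflicting docs are reloaded and retried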


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    return obj


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if count <= 1:
        logger.info('%d episodes, using default interval of %dh',
                count, DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    latest = episodes[-1]

    timespan_s = (latest.released - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    # count episodes span (count - 1) intervals
    interval = int(timespan_h / (count - 1))
    logger.info('%d episodes in %d days => %dh interval', count,
            timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
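
# Worked example: 11 episodes released over 30 days span 720 hours and 10
# intervals, so the average is int(720 / 10) = 72, i.e. the podcast is
# re-checked roughly every three days. Sketch with a hypothetical Episode
# stand-in, assuming 72h lies between MIN_UPDATE_INTERVAL and
# MAX_UPDATE_INTERVAL (their values are defined in mygpo.core.models):
#
#     >>> from collections import namedtuple
#     >>> Ep = namedtuple('Ep', 'released')
#     >>> start = datetime(2013, 1, 1)
#     >>> eps = [Ep(start + timedelta(days=3 * i)) for i in range(11)]
#     >>> get_update_interval(eps)
#     72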