move socket timeout to feed-downloader command
[mygpo.git] / mygpo / data / feeddownloader.py
blobad7050d73ad5aa2781ab6bd831fd97ff658f4832
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain
27 from django.conf import settings
29 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
30 PodcastSlug
31 from feedservice.parse import parse_feed, FetchFeedException
32 from feedservice.parse.text import ConvertMarkdown
33 from feedservice.parse.models import ParserException
34 from mygpo.utils import file_hash, split_list
35 from mygpo.web.logo import CoverArt
36 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
37 episodes_for_podcast_uncached
38 from mygpo.db.couchdb.podcast import podcast_for_url
39 from mygpo.directory.tags import update_category
40 from mygpo.decorators import repeat_on_conflict
41 from mygpo.db.couchdb import get_main_database
class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL

    Raised by PodcastUpdater.update() when the feed could not be
    fetched/parsed and no podcast object exists for the URL yet. """
class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes

    Used to reject parser results that do not look like an actual
    podcast feed (see PodcastUpdater._validate_parsed). """
52 class PodcastUpdater(object):
53 """ Updates a number of podcasts with data from their feeds """
55 def __init__(self):
56 """ Queue is an iterable of podcast objects """
57 self.db = get_main_database()
60 def update_queue(self, queue):
61 """ Fetch data for the URLs supplied as the queue iterable """
63 for n, podcast_url in enumerate(queue):
64 print n, podcast_url
65 try:
66 yield self.update(podcast_url)
68 except NoPodcastCreated as npc:
69 print 'no podcast created:', npc
71 print
74 def update(self, podcast_url):
75 """ Update the podcast for the supplied URL """
77 try:
78 parsed = self._fetch_feed(podcast_url)
79 self._validate_parsed(parsed)
81 except (ParserException, FetchFeedException, NoEpisodesException) as ex:
83 # if we fail to parse the URL, we don't even create the
84 # podcast object
85 p = podcast_for_url(podcast_url, create=False)
86 if p:
87 # if it exists already, we mark it as outdated
88 self._mark_outdated(p)
89 return
91 else:
92 raise NoPodcastCreated(ex)
94 assert parsed, 'fetch_feed must return something'
95 p = podcast_for_url(podcast_url, create=True)
96 self._update_podcast(p, parsed)
97 return p
100 def verify_podcast_url(self, podcast_url):
101 parsed = self._fetch_feed(podcast_url)
102 self._validate_parsed(parsed)
103 return True
106 def _fetch_feed(self, podcast_url):
107 return parse_feed(podcast_url, text_processor=ConvertMarkdown())
111 def _validate_parsed(self, parsed):
112 """ validates the parsed results and raises an exception if invalid
114 feedparser parses pretty much everything. We reject anything that
115 doesn't look like a feed"""
117 if not parsed or not parsed.episodes:
118 raise NoEpisodesException('no episodes found')
121 @repeat_on_conflict(['podcast'])
122 def _update_podcast(self, podcast, parsed):
123 """ updates a podcast according to new parser results """
125 changed = False
127 # we need that later to decide if we can "bump" a category
128 prev_latest_episode_timestamp = podcast.latest_episode_timestamp
130 changed |= update_a(podcast, 'title', parsed.title or podcast.title)
131 changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
132 changed |= update_a(podcast, 'description', parsed.description or podcast.description)
133 changed |= update_a(podcast, 'link', parsed.link or podcast.link)
134 changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
135 changed |= update_a(podcast, 'author', parsed.author or podcast.author)
136 changed |= update_a(podcast, 'language', parsed.language or podcast.language)
137 changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
138 changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
139 changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
140 changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)
141 changed |= update_a(podcast, 'flattr_url', parsed.flattr)
144 if podcast.new_location:
145 new_podcast = podcast_for_url(podcast.new_location)
146 if new_podcast != podcast:
147 self._mark_outdated(podcast, 'redirected to different podcast')
148 return
150 elif not new_podcast:
151 podcast.urls.insert(0, podcast.new_location)
152 changed = True
155 episodes = self._update_episodes(podcast, parsed.episodes)
157 # latest episode timestamp
158 eps = filter(lambda e: bool(e.released), episodes)
159 eps = sorted(eps, key=lambda e: e.released)
160 if eps:
161 changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)
162 changed |= update_a(podcast, 'episode_count', len(eps))
165 self._update_categories(podcast, prev_latest_episode_timestamp)
167 # try to download the logo and reset logo_url to None on http errors
168 found = self._save_podcast_logo(podcast.logo_url)
169 if not found:
170 changed |= update_a(podcast, 'logo_url', None)
172 if changed:
173 print 'saving podcast'
174 podcast.last_update = datetime.utcnow()
175 podcast.save()
178 assign_slug(podcast, PodcastSlug)
179 assign_missing_episode_slugs(podcast)
182 def _update_categories(self, podcast, prev_timestamp):
183 """ checks some practical requirements and updates a category """
185 from datetime import timedelta
187 max_timestamp = datetime.utcnow() + timedelta(days=1)
189 # no episodes at all
190 if not podcast.latest_episode_timestamp:
191 return
193 # no new episode
194 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
195 return
197 # too far in the future
198 if podcast.latest_episode_timestamp > max_timestamp:
199 return
201 # not enough subscribers
202 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
203 return
205 update_category(podcast)
208 @repeat_on_conflict(['podcast'])
209 def _update_episodes(self, podcast, parsed_episodes):
211 all_episodes = set(episodes_for_podcast_uncached(podcast))
212 remaining = list(all_episodes)
213 updated_episodes = []
215 for parsed_episode in parsed_episodes:
217 url = None
219 for f in parsed_episode.files:
220 if f.urls:
221 url = f.urls[0]
223 if not url:
224 continue
226 guid = parsed_episode.guid
228 # pop matchin episodes out of the "existing" list
229 matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
231 if not matching:
232 new_episode = episode_for_podcast_id_url(podcast.get_id(),
233 url, create=True)
234 matching = [new_episode]
235 all_episodes.add(new_episode)
238 for episode in matching:
239 changed = False
240 changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
241 changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
242 changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
243 changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
244 changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
245 changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released)
246 changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
247 changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
248 changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
249 changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
250 changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))
251 changed |= update_a(episode, 'flattr_url', parsed_episode.flattr)
253 urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
254 changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))
256 if changed:
257 episode.last_update = datetime.utcnow()
258 updated_episodes.append(episode)
261 outdated_episodes = all_episodes - set(updated_episodes)
263 # set episodes to be outdated, where necessary
264 for e in filter(lambda e: not e.outdated, outdated_episodes):
265 e.outdated = True
266 updated_episodes.append(e)
269 if updated_episodes:
270 print 'Updating', len(updated_episodes), 'episodes'
271 self.db.save_docs(updated_episodes)
273 return all_episodes
276 def _save_podcast_logo(self, cover_art):
277 if not cover_art:
278 return
280 try:
281 image_sha1 = hashlib.sha1(cover_art).hexdigest()
282 prefix = CoverArt.get_prefix(image_sha1)
284 filename = CoverArt.get_original(prefix, image_sha1)
285 dirname = CoverArt.get_dir(filename)
287 # get hash of existing file
288 if os.path.exists(filename):
289 with open(filename) as f:
290 old_hash = file_hash(f).digest()
291 else:
292 old_hash = ''
294 print 'LOGO @', cover_art
296 # save new cover art
297 with open(filename, 'w') as fp:
298 fp.write(urllib2.urlopen(cover_art).read())
300 # get hash of new file
301 with open(filename) as f:
302 new_hash = file_hash(f).digest()
304 # remove thumbnails if cover changed
305 if old_hash != new_hash:
306 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
307 print 'Removing %d thumbnails' % len(thumbnails)
308 for f in thumbnails:
309 os.unlink(f)
311 return cover_art
313 except (urllib2.HTTPError, urllib2.URLError, ValueError,
314 httplib.BadStatusLine) as e:
315 print e
318 @repeat_on_conflict(['podcast'])
319 def _mark_outdated(self, podcast, msg=''):
320 print 'mark outdated', msg
321 podcast.outdated = True
322 podcast.last_update = datetime.utcnow()
323 podcast.save()
324 self._update_episodes(podcast, [])
# Sentinel distinguishing "attribute/key is missing" from any real value,
# including None.
_none = object()


def update_a(obj, attrib, value):
    """ Assigns value to obj.attrib; returns True iff this was a change """
    previous = getattr(obj, attrib, _none)
    setattr(obj, attrib, value)
    return previous != value


def update_i(obj, item, value):
    """ Assigns value to obj[item]; returns True iff this was a change """
    previous = obj.get(item, _none)
    obj[item] = value
    return previous != value