try to subscribe to pubsubhubbub hubs given in feeds
[mygpo.git] / mygpo / data / feeddownloader.py
blob8930d49a70effc11f4c48d1a9a580b50add46462
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain
27 from django.conf import settings
29 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
30 PodcastSlug
31 from feedservice.parse import parse_feed, FetchFeedException
32 from feedservice.parse.text import ConvertMarkdown
33 from feedservice.parse.models import ParserException
34 from mygpo.utils import file_hash, split_list
35 from mygpo.web.logo import CoverArt
36 from mygpo.data.podcast import subscribe_at_hub
37 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
38 episodes_for_podcast_uncached
39 from mygpo.db.couchdb.podcast import podcast_for_url
40 from mygpo.directory.tags import update_category
41 from mygpo.decorators import repeat_on_conflict
42 from mygpo.db.couchdb import get_main_database
class NoPodcastCreated(Exception):
    """ Raised when no podcast object could be created for a new URL """
class NoEpisodesException(Exception):
    """ Raised when the parsed document does not contain any episodes """
53 class PodcastUpdater(object):
54 """ Updates a number of podcasts with data from their feeds """
56 def __init__(self):
57 """ Queue is an iterable of podcast objects """
58 self.db = get_main_database()
61 def update_queue(self, queue):
62 """ Fetch data for the URLs supplied as the queue iterable """
64 for n, podcast_url in enumerate(queue):
65 print n, podcast_url
66 try:
67 yield self.update(podcast_url)
69 except NoPodcastCreated as npc:
70 print 'no podcast created:', npc
72 print
75 def update(self, podcast_url):
76 """ Update the podcast for the supplied URL """
78 try:
79 parsed = self._fetch_feed(podcast_url)
80 self._validate_parsed(parsed)
82 except (ParserException, FetchFeedException, NoEpisodesException) as ex:
84 # if we fail to parse the URL, we don't even create the
85 # podcast object
86 p = podcast_for_url(podcast_url, create=False)
87 if p:
88 # if it exists already, we mark it as outdated
89 self._mark_outdated(p)
90 return
92 else:
93 raise NoPodcastCreated(ex)
95 assert parsed, 'fetch_feed must return something'
96 p = podcast_for_url(podcast_url, create=True)
97 self._update_podcast(p, parsed)
98 return p
101 def verify_podcast_url(self, podcast_url):
102 parsed = self._fetch_feed(podcast_url)
103 self._validate_parsed(parsed)
104 return True
107 def _fetch_feed(self, podcast_url):
108 return parse_feed(podcast_url, text_processor=ConvertMarkdown())
112 def _validate_parsed(self, parsed):
113 """ validates the parsed results and raises an exception if invalid
115 feedparser parses pretty much everything. We reject anything that
116 doesn't look like a feed"""
118 if not parsed or not parsed.episodes:
119 raise NoEpisodesException('no episodes found')
122 @repeat_on_conflict(['podcast'])
123 def _update_podcast(self, podcast, parsed):
124 """ updates a podcast according to new parser results """
126 changed = False
128 # we need that later to decide if we can "bump" a category
129 prev_latest_episode_timestamp = podcast.latest_episode_timestamp
131 changed |= update_a(podcast, 'title', parsed.title or podcast.title)
132 changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
133 changed |= update_a(podcast, 'description', parsed.description or podcast.description)
134 changed |= update_a(podcast, 'link', parsed.link or podcast.link)
135 changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
136 changed |= update_a(podcast, 'author', parsed.author or podcast.author)
137 changed |= update_a(podcast, 'language', parsed.language or podcast.language)
138 changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
139 changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
140 changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
141 changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)
142 changed |= update_a(podcast, 'flattr_url', parsed.flattr)
143 changed |= update_a(podcast, 'hub', parsed.hub)
146 if podcast.new_location:
147 new_podcast = podcast_for_url(podcast.new_location)
148 if new_podcast != podcast:
149 self._mark_outdated(podcast, 'redirected to different podcast')
150 return
152 elif not new_podcast:
153 podcast.urls.insert(0, podcast.new_location)
154 changed = True
157 episodes = self._update_episodes(podcast, parsed.episodes)
159 # latest episode timestamp
160 eps = filter(lambda e: bool(e.released), episodes)
161 eps = sorted(eps, key=lambda e: e.released)
162 if eps:
163 changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)
164 changed |= update_a(podcast, 'episode_count', len(eps))
167 self._update_categories(podcast, prev_latest_episode_timestamp)
169 # try to download the logo and reset logo_url to None on http errors
170 found = self._save_podcast_logo(podcast.logo_url)
171 if not found:
172 changed |= update_a(podcast, 'logo_url', None)
174 if changed:
175 print 'saving podcast'
176 podcast.last_update = datetime.utcnow()
177 podcast.save()
180 subscribe_at_hub(podcast)
182 assign_slug(podcast, PodcastSlug)
183 assign_missing_episode_slugs(podcast)
186 def _update_categories(self, podcast, prev_timestamp):
187 """ checks some practical requirements and updates a category """
189 from datetime import timedelta
191 max_timestamp = datetime.utcnow() + timedelta(days=1)
193 # no episodes at all
194 if not podcast.latest_episode_timestamp:
195 return
197 # no new episode
198 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
199 return
201 # too far in the future
202 if podcast.latest_episode_timestamp > max_timestamp:
203 return
205 # not enough subscribers
206 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
207 return
209 update_category(podcast)
212 @repeat_on_conflict(['podcast'])
213 def _update_episodes(self, podcast, parsed_episodes):
215 all_episodes = set(episodes_for_podcast_uncached(podcast))
216 remaining = list(all_episodes)
217 updated_episodes = []
219 for parsed_episode in parsed_episodes:
221 url = None
223 for f in parsed_episode.files:
224 if f.urls:
225 url = f.urls[0]
227 if not url:
228 continue
230 guid = parsed_episode.guid
232 # pop matchin episodes out of the "existing" list
233 matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
235 if not matching:
236 new_episode = episode_for_podcast_id_url(podcast.get_id(),
237 url, create=True)
238 matching = [new_episode]
239 all_episodes.add(new_episode)
242 for episode in matching:
243 changed = False
244 changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
245 changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
246 changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
247 changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
248 changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
249 changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released)
250 changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
251 changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
252 changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
253 changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
254 changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))
255 changed |= update_a(episode, 'flattr_url', parsed_episode.flattr)
257 urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
258 changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))
260 if changed:
261 episode.last_update = datetime.utcnow()
262 updated_episodes.append(episode)
265 outdated_episodes = all_episodes - set(updated_episodes)
267 # set episodes to be outdated, where necessary
268 for e in filter(lambda e: not e.outdated, outdated_episodes):
269 e.outdated = True
270 updated_episodes.append(e)
273 if updated_episodes:
274 print 'Updating', len(updated_episodes), 'episodes'
275 self.db.save_docs(updated_episodes)
277 return all_episodes
280 def _save_podcast_logo(self, cover_art):
281 if not cover_art:
282 return
284 try:
285 image_sha1 = hashlib.sha1(cover_art).hexdigest()
286 prefix = CoverArt.get_prefix(image_sha1)
288 filename = CoverArt.get_original(prefix, image_sha1)
289 dirname = CoverArt.get_dir(filename)
291 # get hash of existing file
292 if os.path.exists(filename):
293 with open(filename) as f:
294 old_hash = file_hash(f).digest()
295 else:
296 old_hash = ''
298 print 'LOGO @', cover_art
300 # save new cover art
301 with open(filename, 'w') as fp:
302 fp.write(urllib2.urlopen(cover_art).read())
304 # get hash of new file
305 with open(filename) as f:
306 new_hash = file_hash(f).digest()
308 # remove thumbnails if cover changed
309 if old_hash != new_hash:
310 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
311 print 'Removing %d thumbnails' % len(thumbnails)
312 for f in thumbnails:
313 os.unlink(f)
315 return cover_art
317 except (urllib2.HTTPError, urllib2.URLError, ValueError,
318 httplib.BadStatusLine) as e:
319 print e
322 @repeat_on_conflict(['podcast'])
323 def _mark_outdated(self, podcast, msg=''):
324 print 'mark outdated', msg
325 podcast.outdated = True
326 podcast.last_update = datetime.utcnow()
327 podcast.save()
328 self._update_episodes(podcast, [])
# sentinel for "attribute/key not present" (distinct from any real value)
_none = object()


def update_a(obj, attrib, value):
    """ Sets obj.attrib to value; returns whether the value changed """
    previous = getattr(obj, attrib, _none)
    setattr(obj, attrib, value)
    return previous != value
def update_i(obj, item, value):
    """ Sets obj[item] to value; returns whether the value changed """
    missing = object()
    previous = obj.get(item, missing)
    obj[item] = value
    return previous != value