move get_main_database() to mygpo.db.couchdb
[mygpo.git] / mygpo / data / feeddownloader.py
blobab9f850bb0cae476b78990e053aa1574d2589bdd
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain
27 from django.conf import settings
29 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
30 PodcastSlug
31 from feedservice.parse import parse_feed, FetchFeedException
32 from feedservice.parse.text import ConvertMarkdown
33 from feedservice.parse.models import ParserException
34 from mygpo.utils import file_hash, split_list
35 from mygpo.web.logo import CoverArt
36 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
37 episodes_for_podcast_uncached
38 from mygpo.db.couchdb.podcast import podcast_for_url
39 from mygpo.directory.tags import update_category
40 from mygpo.decorators import repeat_on_conflict
41 from mygpo.db.couchdb import get_main_database
43 import socket
44 socket.setdefaulttimeout(300)
class NoPodcastCreated(Exception):
    """ Raised when no podcast object could be created for a new URL """
class NoEpisodesException(Exception):
    """ Raised when a parsed document does not contain any episodes """
55 class PodcastUpdater(object):
56 """ Updates a number of podcasts with data from their feeds """
58 def __init__(self):
59 """ Queue is an iterable of podcast objects """
60 self.db = get_main_database()
63 def update_queue(self, queue):
64 """ Fetch data for the URLs supplied as the queue iterable """
66 for n, podcast_url in enumerate(queue):
67 print n, podcast_url
68 try:
69 self.update(podcast_url)
71 except NoPodcastCreated as npc:
72 print 'no podcast created:', npc
74 print
77 def update(self, podcast_url):
78 """ Update the podcast for the supplied URL """
80 try:
81 parsed = self._fetch_feed(podcast_url)
82 self._validate_parsed(parsed)
84 except (ParserException, FetchFeedException, NoEpisodesException) as ex:
86 # if we fail to parse the URL, we don't even create the
87 # podcast object
88 p = podcast_for_url(podcast_url, create=False)
89 if p:
90 # if it exists already, we mark it as outdated
91 self._mark_outdated(p)
92 return
94 else:
95 raise NoPodcastCreated(ex)
97 assert parsed, 'fetch_feed must return something'
98 p = podcast_for_url(podcast_url, create=True)
99 self._update_podcast(p, parsed)
100 return p
103 def verify_podcast_url(self, podcast_url):
104 parsed = self._fetch_feed(podcast_url)
105 self._validate_parsed(parsed)
106 return True
109 def _fetch_feed(self, podcast_url):
110 return parse_feed(podcast_url, text_processor=ConvertMarkdown())
114 def _validate_parsed(self, parsed):
115 """ validates the parsed results and raises an exception if invalid
117 feedparser parses pretty much everything. We reject anything that
118 doesn't look like a feed"""
120 if not parsed or not parsed.episodes:
121 raise NoEpisodesException('no episodes found')
124 @repeat_on_conflict(['podcast'])
125 def _update_podcast(self, podcast, parsed):
126 """ updates a podcast according to new parser results """
128 changed = False
130 # we need that later to decide if we can "bump" a category
131 prev_latest_episode_timestamp = podcast.latest_episode_timestamp
133 changed |= update_a(podcast, 'title', parsed.title or podcast.title)
134 changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
135 changed |= update_a(podcast, 'description', parsed.description or podcast.description)
136 changed |= update_a(podcast, 'link', parsed.link or podcast.link)
137 changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
138 changed |= update_a(podcast, 'author', parsed.author or podcast.author)
139 changed |= update_a(podcast, 'language', parsed.language or podcast.language)
140 changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
141 changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
142 changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
143 changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)
144 changed |= update_a(podcast, 'flattr_url', parsed.flattr)
147 if podcast.new_location:
148 new_podcast = podcast_for_url(podcast.new_location)
149 if new_podcast != podcast:
150 self._mark_outdated(podcast, 'redirected to different podcast')
151 return
153 elif not new_podcast:
154 podcast.urls.insert(0, podcast.new_location)
155 changed = True
158 episodes = self._update_episodes(podcast, parsed.episodes)
160 # latest episode timestamp
161 eps = filter(lambda e: bool(e.released), episodes)
162 eps = sorted(eps, key=lambda e: e.released)
163 if eps:
164 changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)
165 changed |= update_a(podcast, 'episode_count', len(eps))
168 self._update_categories(podcast, prev_latest_episode_timestamp)
170 # try to download the logo and reset logo_url to None on http errors
171 found = self._save_podcast_logo(podcast.logo_url)
172 if not found:
173 changed |= update_a(podcast, 'logo_url', None)
175 if changed:
176 print 'saving podcast'
177 podcast.last_update = datetime.utcnow()
178 podcast.save()
181 assign_slug(podcast, PodcastSlug)
182 assign_missing_episode_slugs(podcast)
185 def _update_categories(self, podcast, prev_timestamp):
186 """ checks some practical requirements and updates a category """
188 from datetime import timedelta
190 max_timestamp = datetime.utcnow() + timedelta(days=1)
192 # no episodes at all
193 if not podcast.latest_episode_timestamp:
194 return
196 # no new episode
197 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
198 return
200 # too far in the future
201 if podcast.latest_episode_timestamp > max_timestamp:
202 return
204 # not enough subscribers
205 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
206 return
208 update_category(podcast)
211 @repeat_on_conflict(['podcast'])
212 def _update_episodes(self, podcast, parsed_episodes):
214 all_episodes = set(episodes_for_podcast_uncached(podcast))
215 remaining = list(all_episodes)
216 updated_episodes = []
218 for parsed_episode in parsed_episodes:
220 url = None
222 for f in parsed_episode.files:
223 if f.urls:
224 url = f.urls[0]
226 if not url:
227 continue
229 guid = parsed_episode.guid
231 # pop matchin episodes out of the "existing" list
232 matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
234 if not matching:
235 new_episode = episode_for_podcast_id_url(podcast.get_id(),
236 url, create=True)
237 matching = [new_episode]
238 all_episodes.add(new_episode)
241 for episode in matching:
242 changed = False
243 changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
244 changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
245 changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
246 changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
247 changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
248 changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released)
249 changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
250 changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
251 changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
252 changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
253 changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))
254 changed |= update_a(episode, 'flattr_url', parsed_episode.flattr)
256 urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
257 changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))
259 if changed:
260 episode.last_update = datetime.utcnow()
261 updated_episodes.append(episode)
264 outdated_episodes = all_episodes - set(updated_episodes)
266 # set episodes to be outdated, where necessary
267 for e in filter(lambda e: not e.outdated, outdated_episodes):
268 e.outdated = True
269 updated_episodes.append(e)
272 if updated_episodes:
273 print 'Updating', len(updated_episodes), 'episodes'
274 self.db.save_docs(updated_episodes)
276 return all_episodes
279 def _save_podcast_logo(self, cover_art):
280 if not cover_art:
281 return
283 try:
284 image_sha1 = hashlib.sha1(cover_art).hexdigest()
285 prefix = CoverArt.get_prefix(image_sha1)
287 filename = CoverArt.get_original(prefix, image_sha1)
288 dirname = CoverArt.get_dir(filename)
290 # get hash of existing file
291 if os.path.exists(filename):
292 with open(filename) as f:
293 old_hash = file_hash(f).digest()
294 else:
295 old_hash = ''
297 print 'LOGO @', cover_art
299 # save new cover art
300 with open(filename, 'w') as fp:
301 fp.write(urllib2.urlopen(cover_art).read())
303 # get hash of new file
304 with open(filename) as f:
305 new_hash = file_hash(f).digest()
307 # remove thumbnails if cover changed
308 if old_hash != new_hash:
309 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
310 print 'Removing %d thumbnails' % len(thumbnails)
311 for f in thumbnails:
312 os.unlink(f)
314 return cover_art
316 except (urllib2.HTTPError, urllib2.URLError, ValueError,
317 httplib.BadStatusLine) as e:
318 print e
321 @repeat_on_conflict(['podcast'])
322 def _mark_outdated(self, podcast, msg=''):
323 print 'mark outdated', msg
324 podcast.outdated = True
325 podcast.last_update = datetime.utcnow()
326 podcast.save()
327 self._update_episodes(podcast, [])
# sentinel distinguishing "attribute missing" from any real value
_none = object()

def update_a(obj, attrib, value):
    """ Assign `value` to `obj.attrib`; return True if it changed """
    previous = getattr(obj, attrib, _none)
    setattr(obj, attrib, value)
    return previous != value
def update_i(obj, item, value):
    """ Assign `value` to `obj[item]`; return True if it changed """
    missing = object()  # sentinel: distinguishes absent key from any value
    previous = obj.get(item, missing)
    obj[item] = value
    return previous != value