feed-downloader fixes
[mygpo.git] / mygpo / data / feeddownloader.py
blob 13ac802c481134a6df2fd7047617bc7ba4243c15
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain
27 from django.conf import settings
29 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
30 PodcastSlug
31 from feedservice.parse import parse_feed, FetchFeedException
32 from feedservice.parse.text import ConvertMarkdown
33 from feedservice.parse.models import ParserException
34 from mygpo.utils import file_hash, split_list
35 from mygpo.web.logo import CoverArt
36 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
37 episodes_for_podcast_uncached
38 from mygpo.db.couchdb.podcast import podcast_for_url
39 from mygpo.directory.tags import update_category
40 from mygpo.decorators import repeat_on_conflict
41 from mygpo.couch import get_main_database
import socket
# Some feed servers respond extremely slowly or hang; set a global
# default timeout (5 minutes) so feed fetches and logo downloads
# eventually abort instead of blocking the updater forever.
socket.setdefaulttimeout(300)
class NoPodcastCreated(Exception):
    """ Signals that a new URL did not yield a podcast object

    Raised by PodcastUpdater.update() when a feed cannot be fetched or
    parsed and no podcast exists yet for the URL. """
class NoEpisodesException(Exception):
    """ Signals that parsed feed data contains no episodes

    Raised during validation when the parser result does not look like
    a usable podcast feed. """
55 class PodcastUpdater(object):
56 """ Updates a number of podcasts with data from their feeds """
58 def __init__(self):
59 """ Queue is an iterable of podcast objects """
60 self.db = get_main_database()
63 def update_queue(self, queue):
64 """ Fetch data for the URLs supplied as the queue iterable """
66 for n, podcast_url in enumerate(queue):
67 print n, podcast_url
68 try:
69 self.update(podcast_url)
71 except NoPodcastCreated as npc:
72 print 'no podcast created:', npc
74 print
77 def update(self, podcast_url):
78 """ Update the podcast for the supplied URL """
80 try:
81 parsed = self._fetch_feed(podcast_url)
82 self._validate_parsed(parsed)
84 except (ParserException, FetchFeedException, NoEpisodesException) as ex:
86 # if we fail to parse the URL, we don't even create the
87 # podcast object
88 p = podcast_for_url(podcast_url, create=False)
89 if p:
90 # if it exists already, we mark it as outdated
91 self._mark_outdated(p)
92 return
94 else:
95 raise NoPodcastCreated(ex)
97 assert parsed, 'fetch_feed must return something'
98 p = podcast_for_url(podcast_url, create=True)
99 self._update_podcast(p, parsed)
100 return p
103 def verify_podcast_url(self, podcast_url):
104 parsed = self._fetch_feed(podcast_url)
105 self._validate_parsed(parsed)
106 return True
109 def _fetch_feed(self, podcast_url):
110 return parse_feed(podcast_url, text_processor=ConvertMarkdown())
114 def _validate_parsed(self, parsed):
115 """ validates the parsed results and raises an exception if invalid
117 feedparser parses pretty much everything. We reject anything that
118 doesn't look like a feed"""
120 if not parsed or not parsed.episodes:
121 raise NoEpisodesException('no episodes found')
124 @repeat_on_conflict(['podcast'])
125 def _update_podcast(self, podcast, parsed):
126 """ updates a podcast according to new parser results """
128 changed = False
130 # we need that later to decide if we can "bump" a category
131 prev_latest_episode_timestamp = podcast.latest_episode_timestamp
133 changed |= update_a(podcast, 'title', parsed.title or podcast.title)
134 changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
135 changed |= update_a(podcast, 'description', parsed.description or podcast.description)
136 changed |= update_a(podcast, 'link', parsed.link or podcast.link)
137 changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
138 changed |= update_a(podcast, 'author', parsed.author or podcast.author)
139 changed |= update_a(podcast, 'language', parsed.language or podcast.language)
140 changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
141 changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
142 changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
143 changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)
146 if podcast.new_location:
147 new_podcast = podcast_for_url(podcast.new_location)
148 if new_podcast != podcast:
149 self._mark_outdated(podcast, 'redirected to different podcast')
150 return
152 elif not new_podcast:
153 podcast.urls.insert(0, podcast.new_location)
154 changed = True
157 episodes = self._update_episodes(podcast, parsed.episodes)
159 # latest episode timestamp
160 eps = filter(lambda e: bool(e.released), episodes)
161 eps = sorted(eps, key=lambda e: e.released)
162 if eps:
163 changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)
164 changed |= update_a(podcast, 'episode_count', len(eps))
167 self._update_categories(podcast, prev_latest_episode_timestamp)
169 # try to download the logo and reset logo_url to None on http errors
170 found = self._save_podcast_logo(podcast.logo_url)
171 if not found:
172 changed |= update_a(podcast, 'logo_url', None)
174 if changed:
175 print 'saving podcast'
176 podcast.last_update = datetime.utcnow()
177 podcast.save()
180 assign_slug(podcast, PodcastSlug)
181 assign_missing_episode_slugs(podcast)
184 def _update_categories(self, podcast, prev_timestamp):
185 """ checks some practical requirements and updates a category """
187 from datetime import timedelta
189 max_timestamp = datetime.utcnow() + timedelta(days=1)
191 # no episodes at all
192 if not podcast.latest_episode_timestamp:
193 return
195 # no new episode
196 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
197 return
199 # too far in the future
200 if podcast.latest_episode_timestamp > max_timestamp:
201 return
203 # not enough subscribers
204 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
205 return
207 update_category(podcast)
210 @repeat_on_conflict(['podcast'])
211 def _update_episodes(self, podcast, parsed_episodes):
213 all_episodes = set(episodes_for_podcast_uncached(podcast))
214 remaining = list(all_episodes)
215 updated_episodes = []
217 for parsed_episode in parsed_episodes:
219 url = None
221 for f in parsed_episode.files:
222 if f.urls:
223 url = f.urls[0]
225 if not url:
226 continue
228 guid = parsed_episode.guid
230 # pop matchin episodes out of the "existing" list
231 matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
233 if not matching:
234 new_episode = episode_for_podcast_id_url(podcast.get_id(),
235 url, create=True)
236 matching = [new_episode]
237 all_episodes.add(new_episode)
240 for episode in matching:
241 changed = False
242 changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
243 changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
244 changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
245 changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
246 changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
247 changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released)
248 changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
249 changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
250 changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
251 changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
252 changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))
254 urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
255 changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))
257 if changed:
258 episode.last_update = datetime.utcnow()
259 updated_episodes.append(episode)
262 outdated_episodes = all_episodes - set(updated_episodes)
264 # set episodes to be outdated, where necessary
265 for e in filter(lambda e: not e.outdated, outdated_episodes):
266 e.outdated = True
267 updated_episodes.append(e)
270 if updated_episodes:
271 print 'Updating', len(updated_episodes), 'episodes'
272 self.db.save_docs(updated_episodes)
274 return all_episodes
277 def _save_podcast_logo(self, cover_art):
278 if not cover_art:
279 return
281 try:
282 image_sha1 = hashlib.sha1(cover_art).hexdigest()
283 prefix = CoverArt.get_prefix(image_sha1)
285 filename = CoverArt.get_original(prefix, image_sha1)
286 dirname = CoverArt.get_dir(filename)
288 # get hash of existing file
289 if os.path.exists(filename):
290 with open(filename) as f:
291 old_hash = file_hash(f).digest()
292 else:
293 old_hash = ''
295 print 'LOGO @', cover_art
297 # save new cover art
298 with open(filename, 'w') as fp:
299 fp.write(urllib2.urlopen(cover_art).read())
301 # get hash of new file
302 with open(filename) as f:
303 new_hash = file_hash(f).digest()
305 # remove thumbnails if cover changed
306 if old_hash != new_hash:
307 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
308 print 'Removing %d thumbnails' % len(thumbnails)
309 for f in thumbnails:
310 os.unlink(f)
312 return cover_art
314 except (urllib2.HTTPError, urllib2.URLError, ValueError,
315 httplib.BadStatusLine) as e:
316 print e
319 @repeat_on_conflict(['podcast'])
320 def _mark_outdated(self, podcast, msg=''):
321 print 'mark outdated', msg
322 podcast.outdated = True
323 podcast.last_update = datetime.utcnow()
324 podcast.save()
325 self._update_episodes(podcast, [])
# sentinel: distinguishes "attribute/key missing" from any real value
_none = object()


def update_a(obj, attrib, value):
    """ Assigns value to obj.attrib

    Returns True if the new value differs from the previous one
    (a missing attribute counts as different). """
    previous = getattr(obj, attrib, _none)
    setattr(obj, attrib, value)
    return previous != value
def update_i(obj, item, value):
    """ Stores value under obj[item]

    Returns True if the new value differs from the previous one
    (a missing key counts as different). """
    previous = obj.get(item, _none)
    obj[item] = value
    return previous != value