add page to manually add missing podcasts
[mygpo.git] / mygpo / data / feeddownloader.py
blob871a7989542c958cf2a6b7259ecaea214b46aa55
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain
27 from django.conf import settings
29 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
30 PodcastSlug
31 from feedservice.parse import parse_feed, FetchFeedException
32 from feedservice.parse.text import ConvertMarkdown
33 from feedservice.parse.models import ParserException
34 from mygpo.utils import file_hash, split_list
35 from mygpo.web.logo import CoverArt
36 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
37 episodes_for_podcast_uncached
38 from mygpo.db.couchdb.podcast import podcast_for_url
39 from mygpo.directory.tags import update_category
40 from mygpo.decorators import repeat_on_conflict
41 from mygpo.couch import get_main_database
43 import socket
44 socket.setdefaulttimeout(30)
class NoPodcastCreated(Exception):
    """ Raised when no podcast object could be created for a new URL """
class NoEpisodesException(Exception):
    """ Raised when a parsed feed does not contain any episodes """
55 class PodcastUpdater(object):
56 """ Updates a number of podcasts with data from their feeds """
58 def __init__(self):
59 """ Queue is an iterable of podcast objects """
60 self.db = get_main_database()
63 def update_queue(self, queue):
64 """ Fetch data for the URLs supplied as the queue iterable """
66 for n, podcast_url in enumerate(queue):
67 print n, podcast_url
68 try:
69 self.update(podcast_url)
71 except NoPodcastCreated as npc:
72 print 'no podcast created:', npc
73 print
76 def update(self, podcast_url):
77 """ Update the podcast for the supplied URL """
79 try:
80 parsed = self._fetch_feed(podcast_url)
81 self._validate_parsed(parsed)
83 except (ParserException, FetchFeedException) as ex:
84 # if we fail to parse the URL, we don't even create the
85 # podcast object
86 p = podcast_for_url(podcast_url, create=False)
87 if p:
88 # if it exists already, we mark it as outdated
89 self._mark_outdated(p)
91 else:
92 raise NoPodcastCreated(ex)
94 assert parsed, 'fetch_feed must return something'
95 p = podcast_for_url(podcast_url, create=True)
96 self._update_podcast(p, parsed)
97 return p
100 def verify_podcast_url(self, podcast_url):
101 parsed = self._fetch_feed(podcast_url)
102 self._validate_parsed(parsed)
103 return True
106 def _fetch_feed(self, podcast_url):
107 return parse_feed(podcast_url, text_processor=ConvertMarkdown())
111 def _validate_parsed(self, parsed):
112 """ validates the parsed results and raises an exception if invalid
114 feedparser parses pretty much everything. We reject anything that
115 doesn't look like a feed"""
117 if not parsed.episodes:
118 raise NoEpisodesException('no episodes found')
121 @repeat_on_conflict(['podcast'])
122 def _update_podcast(self, podcast, parsed):
123 """ updates a podcast according to new parser results """
125 changed = False
127 # we need that later to decide if we can "bump" a category
128 prev_latest_episode_timestamp = podcast.latest_episode_timestamp
130 changed |= update_a(podcast, 'title', parsed.title or podcast.title)
131 changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
132 changed |= update_a(podcast, 'description', parsed.description or podcast.description)
133 changed |= update_a(podcast, 'link', parsed.link or podcast.link)
134 changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
135 changed |= update_a(podcast, 'author', parsed.author or podcast.author)
136 changed |= update_a(podcast, 'language', parsed.language or podcast.language)
137 changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
138 changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
139 changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
140 changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)
143 if podcast.new_location:
144 new_podcast = podcast_for_url(podcast.new_location)
145 if new_podcast != podcast:
146 self._mark_outdated(podcast, 'redirected to different podcast')
147 return
149 elif not new_podcast:
150 podcast.urls.insert(0, podcast.new_location)
151 changed = True
154 episodes = self._update_episodes(podcast, parsed.episodes)
156 # latest episode timestamp
157 eps = filter(lambda e: bool(e.released), episodes)
158 eps = sorted(eps, key=lambda e: e.released)
159 if eps:
160 changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)
161 changed |= update_a(podcast, 'episode_count', len(eps))
164 self._update_categories(podcast, prev_latest_episode_timestamp)
166 # try to download the logo and reset logo_url to None on http errors
167 found = self._save_podcast_logo(podcast.logo_url)
168 if not found:
169 changed |= update_a(podcast, 'logo_url', None)
171 if changed:
172 print 'saving podcast'
173 podcast.last_update = datetime.utcnow()
174 podcast.save()
177 assign_slug(podcast, PodcastSlug)
178 assign_missing_episode_slugs(podcast)
181 def _update_categories(self, podcast, prev_timestamp):
182 """ checks some practical requirements and updates a category """
184 from datetime import timedelta
186 max_timestamp = datetime.utcnow() + timedelta(days=1)
188 # no episodes at all
189 if not podcast.latest_episode_timestamp:
190 return
192 # no new episode
193 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
194 return
196 # too far in the future
197 if podcast.latest_episode_timestamp > max_timestamp:
198 return
200 # not enough subscribers
201 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
202 return
204 update_category(podcast)
207 @repeat_on_conflict(['podcast'])
208 def _update_episodes(self, podcast, parsed_episodes):
210 all_episodes = set(episodes_for_podcast_uncached(podcast))
211 remaining = list(all_episodes)
212 updated_episodes = []
214 for parsed_episode in parsed_episodes:
216 url = None
218 for f in parsed_episode.files:
219 if f.urls:
220 url = f.urls[0]
222 if not url:
223 continue
225 guid = parsed_episode.guid
227 # pop matchin episodes out of the "existing" list
228 matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
230 if not matching:
231 new_episode = episode_for_podcast_id_url(podcast.get_id(),
232 url, create=True)
233 matching = [new_episode]
234 all_episodes.add(new_episode)
237 for episode in matching:
238 changed = False
239 changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
240 changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
241 changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
242 changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
243 changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
244 changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released)
245 changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
246 changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
247 changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
248 changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
249 changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))
251 urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
252 changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))
254 if changed:
255 episode.last_update = datetime.utcnow()
256 updated_episodes.append(episode)
259 outdated_episodes = all_episodes - set(updated_episodes)
261 # set episodes to be outdated, where necessary
262 for e in filter(lambda e: not e.outdated, outdated_episodes):
263 e.outdated = True
264 updated_episodes.append(e)
267 if updated_episodes:
268 print 'Updating', len(updated_episodes), 'episodes'
269 self.db.save_docs(updated_episodes)
271 return all_episodes
274 def _save_podcast_logo(self, cover_art):
275 if not cover_art:
276 return
278 try:
279 image_sha1 = hashlib.sha1(cover_art).hexdigest()
280 prefix = CoverArt.get_prefix(image_sha1)
282 filename = CoverArt.get_original(prefix, image_sha1)
283 dirname = CoverArt.get_dir(filename)
285 # get hash of existing file
286 if os.path.exists(filename):
287 with open(filename) as f:
288 old_hash = file_hash(f).digest()
289 else:
290 old_hash = ''
292 print 'LOGO @', cover_art
294 # save new cover art
295 with open(filename, 'w') as fp:
296 fp.write(urllib2.urlopen(cover_art).read())
298 # get hash of new file
299 with open(filename) as f:
300 new_hash = file_hash(f).digest()
302 # remove thumbnails if cover changed
303 if old_hash != new_hash:
304 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
305 print 'Removing %d thumbnails' % len(thumbnails)
306 for f in thumbnails:
307 os.unlink(f)
309 return cover_art
311 except urllib2.HTTPError as e:
312 print e
314 except urllib2.URLError as e:
315 print e
318 @repeat_on_conflict(['podcast'])
319 def _mark_outdated(self, podcast, msg=''):
320 print 'mark outdated', msg
321 podcast.outdated = True
322 podcast.last_update = datetime.utcnow()
323 podcast.save()
324 self._update_episodes(podcast, [])
# sentinel: lets us distinguish "attribute/key missing" from any real value
_none = object()


def update_a(obj, attrib, value):
    """ Set obj.attrib to value.

    Returns True when this changed the stored value (including the case
    where the attribute did not exist before), False otherwise. """
    previous = getattr(obj, attrib, _none)
    setattr(obj, attrib, value)
    return previous != value
def update_i(obj, item, value):
    """ Set obj[item] to value.

    Returns True when this changed the stored value (including the case
    where the key did not exist before), False otherwise. """
    if item in obj:
        differs = obj[item] != value
    else:
        differs = True
    obj[item] = value
    return differs