mygpo/data/feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.

import os.path
import urllib2
import hashlib
from datetime import datetime, timedelta
from itertools import chain

from mygpo.core.models import Podcast, PodcastGroup
from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from feedservice.parse import parse_feed
from feedservice.parse.text import ConvertMarkdown
from mygpo.utils import file_hash, split_list
from mygpo.web.logo import CoverArt
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_uncached
from mygpo.db.couchdb.podcast import podcast_for_url
from mygpo.db.couchdb.directory import category_for_tag
from mygpo.directory.tags import update_category
from mygpo.couch import get_main_database


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self, queue):
        """ Queue is an iterable of podcast objects """
        self.queue = queue
        self.db = get_main_database()


    def update(self):
        """ Run the updates """

        for n, podcast in enumerate(self.queue):

            if isinstance(podcast, PodcastGroup):
                for m in range(len(podcast.podcasts)):
                    # re-fetch the group on each pass so that every
                    # podcast is read fresh from the database
                    pg = PodcastGroup.get(podcast._id)
                    p = pg.podcasts[m]
                    print '{:5d} {:s}'.format(n, p.url)
                    self.update_podcast(p)

            else:
                print '{:5d} {:s}'.format(n, podcast.url)
                self.update_podcast(podcast)

            print


    def update_podcast(self, podcast):
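        """ Updates a single podcast from its feed """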

        try:
            parsed = parse_feed(podcast.url, text_processor=ConvertMarkdown())

        except urllib2.HTTPError as e:
            if e.code in (404, 400):
                self.mark_outdated(podcast)
                return

            raise

        if not parsed:
            self.mark_outdated(podcast)
            return

        podcast = podcast_for_url(parsed.urls[0])
        changed = False

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # update the podcast's attributes; update_a() (defined below)
        # reports whether the attribute actually changed
        changed |= update_a(podcast, 'title', parsed.title or podcast.title)
        changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
        changed |= update_a(podcast, 'description', parsed.description or podcast.description)
        changed |= update_a(podcast, 'link', parsed.link or podcast.link)
        changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
        changed |= update_a(podcast, 'author', parsed.author or podcast.author)
        changed |= update_a(podcast, 'language', parsed.language or podcast.language)
        changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
        changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
        changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
        changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            # if a podcast already exists for the new URL, outdate this
            # one; otherwise adopt the new URL as the primary one
            if new_podcast:
                self.mark_outdated(podcast)
                return

            else:
                podcast.urls.insert(0, podcast.new_location)
                changed = True

        episodes = self.update_episodes(podcast, parsed.episodes)

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)
        if eps:
            changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)

        self.update_categories(podcast, prev_latest_episode_timestamp)

        if changed:
            print ' saving podcast'
            podcast.last_update = datetime.utcnow()
            podcast.save()

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

        self.save_podcast_logo(podcast.logo_url)


    def update_categories(self, podcast, prev_timestamp, min_subscribers=5):
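        """ Bumps the podcast's categories in the directory

        A category is only bumped when the podcast has a new, plausibly
        dated episode and at least min_subscribers subscribers. """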

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < min_subscribers:
            return

        update_category(podcast)


    def update_episodes(self, podcast, parsed_episodes):
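        """ Updates the podcast's episodes from the parsed feed entries

        Returns the set of all episodes of the podcast; episodes that no
        longer appear in the feed are marked as outdated. """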

        all_episodes = set(episodes_for_podcast_uncached(podcast))
        remaining = list(all_episodes)
        updated_episodes = []

        for parsed_episode in parsed_episodes:

            url = None

            for f in parsed_episode.files:
                if f.urls:
                    url = f.urls[0]

            if not url:
                continue

            guid = parsed_episode.guid

            # pop matching episodes out of the "existing" list
            matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)

            if not matching:
                new_episode = episode_for_podcast_id_url(podcast.get_id(),
                        url, create=True)
                matching = [new_episode]
                all_episodes.add(new_episode)

            for episode in matching:
                changed = False
                changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
                changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
                changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
                changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
                changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
                changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else None)
                changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
                changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
                changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
                changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
                changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))

                urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
                changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))

                if changed:
                    episode.last_update = datetime.utcnow()
                    updated_episodes.append(episode)

                #episode.content_types = None #TODO

        outdated_episodes = all_episodes - set(updated_episodes)

        # set episodes to be outdated, where necessary
        for e in filter(lambda e: not e.outdated, outdated_episodes):
            e.outdated = True
            updated_episodes.append(e)

        if updated_episodes:
            print ' Updating', len(updated_episodes), 'episodes'
            self.db.save_docs(updated_episodes)

        return all_episodes


    def save_podcast_logo(self, cover_art):
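        """ Fetches the logo from the cover_art URL and stores it

        Existing thumbnails are removed when the image has changed. """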
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            print ' LOGO @', cover_art

            # save new cover art
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                print ' Removing %d thumbnails' % len(thumbnails)
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except urllib2.HTTPError as e:
            print e

        except urllib2.URLError as e:
            print e


    def mark_outdated(self, podcast):
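        """ Marks the podcast and all of its episodes as outdated """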
        print ' mark outdated'
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self.update_episodes(podcast, [])


_none = object()


def update_a(obj, attrib, value):
    """ Sets obj.attrib to value; returns whether the value changed """
    changed = getattr(obj, attrib, _none) != value
    setattr(obj, attrib, value)
    return changed


def update_i(obj, item, value):
    """ Sets obj[item] to value; returns whether the value changed """
    changed = obj.get(item, _none) != value
    obj[item] = value
    return changed
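

# Example usage -- a minimal sketch, assuming a configured mygpo
# environment; `feed_urls` (hypothetical) is a list of feed URLs:
#
#     from mygpo.db.couchdb.podcast import podcast_for_url
#     from mygpo.data.feeddownloader import PodcastUpdater
#
#     queue = filter(None, (podcast_for_url(url) for url in feed_urls))
#     PodcastUpdater(queue).update()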