try to subscribe to pubsubhubbub hubs given in feeds
[mygpo.git] / mygpo / data / feeddownloader.py
blob8930d49a70effc11f4c48d1a9a580b50add46462
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain
27 from django.conf import settings
29 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
30 PodcastSlug
31 from feedservice.parse import parse_feed, FetchFeedException
32 from feedservice.parse.text import ConvertMarkdown
33 from feedservice.parse.models import ParserException
34 from mygpo.utils import file_hash, split_list
35 from mygpo.web.logo import CoverArt
36 from mygpo.data.podcast import subscribe_at_hub
37 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
38 episodes_for_podcast_uncached
39 from mygpo.db.couchdb.podcast import podcast_for_url
40 from mygpo.directory.tags import update_category
41 from mygpo.decorators import repeat_on_conflict
42 from mygpo.db.couchdb import get_main_database
class NoPodcastCreated(Exception):
    """ Raised when no podcast object could be created for a new URL """
class NoEpisodesException(Exception):
    """ Raised when the parsed document does not contain any episodes """
53 class PodcastUpdater(object):
54 """ Updates a number of podcasts with data from their feeds """
56 def __init__(self):
57 """ Queue is an iterable of podcast objects """
58 self.db = get_main_database()
61 def update_queue(self, queue):
62 """ Fetch data for the URLs supplied as the queue iterable """
64 for n, podcast_url in enumerate(queue):
65 print n, podcast_url
66 try:
67 yield self.update(podcast_url)
69 except NoPodcastCreated as npc:
70 print 'no podcast created:', npc
72 print
75 def update(self, podcast_url):
76 """ Update the podcast for the supplied URL """
78 try:
79 parsed = self._fetch_feed(podcast_url)
80 self._validate_parsed(parsed)
82 except (ParserException, FetchFeedException, NoEpisodesException) as ex:
84 # if we fail to parse the URL, we don't even create the
85 # podcast object
86 p = podcast_for_url(podcast_url, create=False)
87 if p:
88 # if it exists already, we mark it as outdated
89 self._mark_outdated(p)
90 return
92 else:
93 raise NoPodcastCreated(ex)
95 assert parsed, 'fetch_feed must return something'
96 p = podcast_for_url(podcast_url, create=True)
97 self._update_podcast(p, parsed)
98 return p
101 def verify_podcast_url(self, podcast_url):
102 parsed = self._fetch_feed(podcast_url)
103 self._validate_parsed(parsed)
104 return True
107 def _fetch_feed(self, podcast_url):
108 return parse_feed(podcast_url, text_processor=ConvertMarkdown())
112 def _validate_parsed(self, parsed):
113 """ validates the parsed results and raises an exception if invalid
115 feedparser parses pretty much everything. We reject anything that
116 doesn't look like a feed"""
118 if not parsed or not parsed.episodes:
119 raise NoEpisodesException('no episodes found')
122 @repeat_on_conflict(['podcast'])
123 def _update_podcast(self, podcast, parsed):
124 """ updates a podcast according to new parser results """
126 changed = False
128 # we need that later to decide if we can "bump" a category
129 prev_latest_episode_timestamp = podcast.latest_episode_timestamp
131 changed |= update_a(podcast, 'title', parsed.title or podcast.title)
132 changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
133 changed |= update_a(podcast, 'description', parsed.description or podcast.description)
134 changed |= update_a(podcast, 'link', parsed.link or podcast.link)
135 changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
136 changed |= update_a(podcast, 'author', parsed.author or podcast.author)
137 changed |= update_a(podcast, 'language', parsed.language or podcast.language)
138 changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
139 changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
140 changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
141 changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)
142 changed |= update_a(podcast, 'flattr_url', parsed.flattr)
143 changed |= update_a(podcast, 'hub', parsed.hub)
146 if podcast.new_location:
147 new_podcast = podcast_for_url(podcast.new_location)
148 if new_podcast != podcast:
149 self._mark_outdated(podcast, 'redirected to different podcast')
150 return
152 elif not new_podcast:
153 podcast.urls.insert(0, podcast.new_location)
154 changed = True
157 episodes = self._update_episodes(podcast, parsed.episodes)
159 # latest episode timestamp
160 eps = filter(lambda e: bool(e.released), episodes)
161 eps = sorted(eps, key=lambda e: e.released)
162 if eps:
163 changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)
164 changed |= update_a(podcast, 'episode_count', len(eps))
167 self._update_categories(podcast, prev_latest_episode_timestamp)
169 # try to download the logo and reset logo_url to None on http errors
170 found = self._save_podcast_logo(podcast.logo_url)
171 if not found:
172 changed |= update_a(podcast, 'logo_url', None)
174 if changed:
175 print 'saving podcast'
176 podcast.last_update = datetime.utcnow()
177 podcast.save()
180 subscribe_at_hub(podcast)
182 assign_slug(podcast, PodcastSlug)
183 assign_missing_episode_slugs(podcast)
186 def _update_categories(self, podcast, prev_timestamp):
187 """ checks some practical requirements and updates a category """
189 from datetime import timedelta
191 max_timestamp = datetime.utcnow() + timedelta(days=1)
193 # no episodes at all
194 if not podcast.latest_episode_timestamp:
195 return
197 # no new episode
198 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
199 return
201 # too far in the future
202 if podcast.latest_episode_timestamp > max_timestamp:
203 return
205 # not enough subscribers
206 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
207 return
209 update_category(podcast)
212 @repeat_on_conflict(['podcast'])
213 def _update_episodes(self, podcast, parsed_episodes):
215 all_episodes = set(episodes_for_podcast_uncached(podcast))
216 remaining = list(all_episodes)
217 updated_episodes = []
219 for parsed_episode in parsed_episodes:
221 url = None
223 for f in parsed_episode.files:
224 if f.urls:
225 url = f.urls[0]
227 if not url:
228 continue
230 guid = parsed_episode.guid
232 # pop matchin episodes out of the "existing" list
233 matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
235 if not matching:
236 new_episode = episode_for_podcast_id_url(podcast.get_id(),
237 url, create=True)
238 matching = [new_episode]
239 all_episodes.add(new_episode)
242 for episode in matching:
243 changed = False
244 changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
245 changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
246 changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
247 changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
248 changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
249 changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released)
250 changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
251 changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
252 changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
253 changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
254 changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))
255 changed |= update_a(episode, 'flattr_url', parsed_episode.flattr)
257 urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
258 changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))
260 if changed:
261 episode.last_update = datetime.utcnow()
262 updated_episodes.append(episode)
265 outdated_episodes = all_episodes - set(updated_episodes)
267 # set episodes to be outdated, where necessary
268 for e in filter(lambda e: not e.outdated, outdated_episodes):
269 e.outdated = True
270 updated_episodes.append(e)
273 if updated_episodes:
274 print 'Updating', len(updated_episodes), 'episodes'
275 self.db.save_docs(updated_episodes)
277 return all_episodes
280 def _save_podcast_logo(self, cover_art):
281 if not cover_art:
282 return
284 try:
285 image_sha1 = hashlib.sha1(cover_art).hexdigest()
286 prefix = CoverArt.get_prefix(image_sha1)
288 filename = CoverArt.get_original(prefix, image_sha1)
289 dirname = CoverArt.get_dir(filename)
291 # get hash of existing file
292 if os.path.exists(filename):
293 with open(filename) as f:
294 old_hash = file_hash(f).digest()
295 else:
296 old_hash = ''
298 print 'LOGO @', cover_art
300 # save new cover art
301 with open(filename, 'w') as fp:
302 fp.write(urllib2.urlopen(cover_art).read())
304 # get hash of new file
305 with open(filename) as f:
306 new_hash = file_hash(f).digest()
308 # remove thumbnails if cover changed
309 if old_hash != new_hash:
310 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
311 print 'Removing %d thumbnails' % len(thumbnails)
312 for f in thumbnails:
313 os.unlink(f)
315 return cover_art
317 except (urllib2.HTTPError, urllib2.URLError, ValueError,
318 httplib.BadStatusLine) as e:
319 print e
322 @repeat_on_conflict(['podcast'])
323 def _mark_outdated(self, podcast, msg=''):
324 print 'mark outdated', msg
325 podcast.outdated = True
326 podcast.last_update = datetime.utcnow()
327 podcast.save()
328 self._update_episodes(podcast, [])
# sentinel for "attribute/key not present" (distinct from any real value)
_none = object()


def update_a(obj, attrib, value):
    """ Sets obj.attrib to value; returns whether the value changed """
    previous = getattr(obj, attrib, _none)
    setattr(obj, attrib, value)
    return previous != value
def update_i(obj, item, value):
    """ Sets obj[item] to value; returns whether the value changed """
    missing = object()
    previous = obj.get(item, missing)
    obj[item] = value
    return previous != value