# NOTE(review): the following lines were gitweb page residue fused into the
# source during extraction; kept here as comments so the module parses.
# [mygpo.git] / mygpo / data / feeddownloader.py
# blob 6b94eb13e353b36cdbf2750024ed1f80f3c4aaa7
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import copy
21 import os.path
22 import urllib2
23 import httplib
24 import hashlib
25 from datetime import datetime
26 from itertools import chain
28 from django.conf import settings
30 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
31 PodcastSlug
32 from feedservice.parse import parse_feed, FetchFeedException
33 from feedservice.parse.text import ConvertMarkdown
34 from feedservice.parse.models import ParserException
35 from mygpo.utils import file_hash, split_list, deep_eq
36 from mygpo.web.logo import CoverArt
37 from mygpo.data.podcast import subscribe_at_hub
38 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
39 episodes_for_podcast_uncached
40 from mygpo.db.couchdb.podcast import podcast_for_url
41 from mygpo.directory.tags import update_category
42 from mygpo.decorators import repeat_on_conflict
43 from mygpo.db.couchdb import get_main_database
45 import logging
46 logger = logging.getLogger(__name__)
class NoPodcastCreated(Exception):
    """ Raised when a new URL yields no podcast object.

    update() raises this instead of returning None so that
    update_queue() can log and skip the offending URL. """
class NoEpisodesException(Exception):
    """ Raised when a parsed document contains no episodes.

    Used by _validate_parsed() to reject documents that feedparser
    accepted but that do not look like an actual podcast feed. """
class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds

    Feed fetching/parsing is delegated to the feedservice library; this
    class merges the parsed data into the podcast/episode documents and
    performs related housekeeping (categories, slugs, logos, hub
    subscription).
    """

    def __init__(self):
        """ Queue is an iterable of podcast objects """
        # episodes are batch-saved through the main database
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable

        Yields the updated podcast for each URL; URLs for which no
        podcast could be created are logged and skipped. """

        for n, podcast_url in enumerate(queue):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL

        Returns the updated podcast object. Raises NoPodcastCreated when
        the feed cannot be fetched/parsed and no podcast exists yet for
        the URL; an existing podcast is marked outdated instead. """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException,
                NoEpisodesException) as ex:
            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p)
                return
            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        self._update_podcast(p, parsed)
        return p

    def verify_podcast_url(self, podcast_url):
        """ Returns True if the URL points to a parseable podcast feed

        Raises the fetch/parse exceptions otherwise. """
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        """ fetches and parses the feed at the given URL """
        # Markdown in feed texts is converted by the feedservice
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    @repeat_on_conflict(['podcast'])
    def _update_podcast(self, podcast, parsed):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # snapshot the document so we only save when something changed
        old_json = copy.deepcopy(podcast.to_json())

        # parsed values win; fall back to the current value when missing
        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = (parsed.common_title or
                                        podcast.common_episode_title)
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            # BUGFIX: the None-check must come first. Previously
            # ``new_podcast != podcast`` was evaluated first, so a missing
            # (None) podcast at the new location also marked this podcast
            # outdated, and the URL-insert branch was unreachable.
            if new_podcast is None:
                # nothing exists at the new location yet; record it as an
                # additional (preferred) URL of this podcast
                podcast.urls.insert(0, podcast.new_location)

            elif new_podcast != podcast:
                # the feed moved to a URL owned by a different podcast
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

        episodes = self._update_episodes(podcast, parsed.episodes)

        # latest episode timestamp (only episodes with a release date count)
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)
        if eps:
            podcast.latest_episode_timestamp = eps[-1].released
        podcast.episode_count = len(eps)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()

        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        from datetime import timedelta

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    @repeat_on_conflict(['podcast'])
    def _update_episodes(self, podcast, parsed_episodes):
        """ merges parsed episodes into the podcast's episode documents

        Returns the set of all episodes of the podcast; episodes that the
        feed no longer references are marked as outdated. """

        all_episodes = set(episodes_for_podcast_uncached(podcast))
        remaining = list(all_episodes)
        updated_episodes = []

        for parsed_episode in parsed_episodes:

            url = None

            # use a file URL as the episode's identifying URL
            for f in parsed_episode.files:
                if f.urls:
                    url = f.urls[0]

            if not url:
                continue

            guid = parsed_episode.guid

            # pop matching episodes out of the "existing" list
            matching, remaining = split_list(remaining, lambda e:
                    (e.guid and e.guid == guid) or url in e.urls)

            if not matching:
                new_episode = episode_for_podcast_id_url(podcast.get_id(),
                        url, create=True)
                matching = [new_episode]
                all_episodes.add(new_episode)

            for episode in matching:
                old_json = copy.deepcopy(episode.to_json())

                episode.guid = parsed_episode.guid or episode.guid
                episode.title = parsed_episode.title or episode.title
                episode.description = (parsed_episode.description or
                                       episode.description)
                episode.subtitle = (parsed_episode.subtitle or
                                    episode.subtitle)
                episode.content = (parsed_episode.content or
                                   parsed_episode.description or
                                   episode.content)
                episode.link = parsed_episode.link or episode.link
                episode.released = (
                    datetime.utcfromtimestamp(parsed_episode.released)
                    if parsed_episode.released else episode.released)
                episode.author = parsed_episode.author or episode.author
                episode.duration = (parsed_episode.duration or
                                    episode.duration)
                episode.filesize = parsed_episode.files[0].filesize
                episode.language = (parsed_episode.language or
                                    episode.language)
                episode.mimetypes = list(set(filter(None,
                    [f.mimetype for f in parsed_episode.files])))
                episode.flattr_url = (parsed_episode.flattr or
                                      episode.flattr_url)
                episode.license = parsed_episode.license or episode.license

                urls = list(chain.from_iterable(f.urls for f in
                    parsed_episode.files))
                episode.urls = sorted(set(episode.urls + urls), key=len)

                # only save episodes that actually changed
                if not deep_eq(old_json, episode.to_json()):
                    episode.last_update = datetime.utcnow()
                    updated_episodes.append(episode)

        # BUGFIX: episodes no longer referenced by the feed are exactly
        # those left unmatched in ``remaining``. The previous computation
        # (all_episodes - set(updated_episodes)) also marked episodes that
        # are still in the feed but merely unchanged as outdated.
        outdated_episodes = set(remaining)

        # set episodes to be outdated, where necessary
        for e in filter(lambda e: not e.outdated, outdated_episodes):
            e.outdated = True
            updated_episodes.append(e)

        if updated_episodes:
            logger.info('Updating %d episodes', len(updated_episodes))
            self.db.save_docs(updated_episodes)

        return all_episodes

    def _save_podcast_logo(self, cover_art):
        """ downloads and stores the podcast logo

        Returns the cover-art URL on success, None when there is no URL
        or the download failed. Existing thumbnails are removed when the
        image content changed. """

        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            # presumably ensures the target directory exists — kept for
            # its side effect; TODO confirm against CoverArt.get_dir
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file; binary mode because this is
            # image data, not text
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art (binary mode, see above)
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix,
                                                              filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while updating podcast: %s', str(e))

    @repeat_on_conflict(['podcast'])
    def _mark_outdated(self, podcast, msg=''):
        """ marks the podcast and all of its episodes as outdated """
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        # an empty episode list makes _update_episodes outdate everything
        self._update_episodes(podcast, [])