mygpo/data/feeddownloader.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from mygpo.utils import file_hash, split_list, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_uncached
from mygpo.db.couchdb.podcast import podcast_for_url, podcast_by_id_uncached, \
    reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database

import logging
logger = logging.getLogger(__name__)


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        """ Sets up the updater with the main database """
        self.db = get_main_database()


    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)


    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException) as ex:

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p)
                return

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        self._update_podcast(p, parsed)
        return p


    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True


    def _fetch_feed(self, podcast_url):
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())


    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')


    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        old_json = copy.deepcopy(podcast.to_json())
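
        # parsed values take precedence; an empty parser result never
        # erases existing data (hence the `or` fallbacks below)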
        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license
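
        # the feed reported a new location: either it already belongs to
        # a different podcast, or this podcast should claim the new URL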
        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            if new_podcast and new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

            elif not new_podcast:
                podcast.urls.insert(0, podcast.new_location)

        episodes = self._update_episodes(podcast, parsed.episodes)

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)
        if eps:
            podcast.latest_episode_timestamp = eps[-1].released
        podcast.episode_count = len(eps)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()
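
        # hub subscription and slug assignment run on every update,
        # not only when the podcast document has changed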
        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)


    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)


    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_episodes(self, podcast, parsed_episodes):

        all_episodes = set(episodes_for_podcast_uncached(podcast))
        remaining = list(all_episodes)
        updated_episodes = []
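
        # each parsed episode is matched against the existing ones,
        # by GUID first and by enclosure URL second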
        for parsed_episode in parsed_episodes:

            url = None

            for f in parsed_episode.files:
                if f.urls:
                    url = f.urls[0]

            if not url:
                continue

            guid = parsed_episode.guid

            # pop matching episodes out of the "existing" list
            matching, remaining = split_list(remaining,
                lambda e: (e.guid and e.guid == guid) or url in e.urls)

            if not matching:
                new_episode = episode_for_podcast_id_url(podcast.get_id(),
                        url, create=True)
                matching = [new_episode]
                all_episodes.add(new_episode)

            for episode in matching:
                old_json = copy.deepcopy(episode.to_json())

                episode.guid = parsed_episode.guid or episode.guid
                episode.title = parsed_episode.title or episode.title
                episode.description = parsed_episode.description or episode.description
                episode.subtitle = parsed_episode.subtitle or episode.subtitle
                episode.content = parsed_episode.content or parsed_episode.description or episode.content
                episode.link = parsed_episode.link or episode.link
                episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released
                episode.author = parsed_episode.author or episode.author
                episode.duration = parsed_episode.duration or episode.duration
                episode.filesize = parsed_episode.files[0].filesize
                episode.language = parsed_episode.language or episode.language
                episode.mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
                episode.flattr_url = parsed_episode.flattr or episode.flattr_url
                episode.license = parsed_episode.license or episode.license

                urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
                episode.urls = sorted(set(episode.urls + urls), key=len)

                if not deep_eq(old_json, episode.to_json()):
                    episode.last_update = datetime.utcnow()
                    updated_episodes.append(episode)

        outdated_episodes = all_episodes - set(updated_episodes)

        # set episodes to be outdated, where necessary
        for e in filter(lambda e: not e.outdated, outdated_episodes):
            e.outdated = True
            updated_episodes.append(e)

        if updated_episodes:
            logger.info('Updating %d episodes', len(updated_episodes))
            self.db.save_docs(updated_episodes)

        return all_episodes


    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return
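
        # cover art is stored under a path derived from the SHA1 of its
        # URL; comparing file hashes below detects whether the image changed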
        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while updating podcast: %s', str(e))


    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])
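

# A minimal usage sketch (an illustrative addition, not part of the original
# module): feed URLs would normally come from a maintenance command or an
# update queue rather than the command line.
if __name__ == '__main__':
    import sys

    logging.basicConfig(level=logging.INFO)

    updater = PodcastUpdater()

    # update_queue() is a generator, so iterate over it to trigger the
    # actual fetching and updating; update() yields None for URLs whose
    # podcast was only marked outdated
    for podcast in updater.update_queue(sys.argv[1:]):
        if podcast:
            logger.info('updated %s', podcast)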