move get_main_database() to mygpo.db.couchdb
[mygpo.git] / mygpo / data / feeddownloader.py
blobab9f850bb0cae476b78990e053aa1574d2589bdd
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain
27 from django.conf import settings
29 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
30 PodcastSlug
31 from feedservice.parse import parse_feed, FetchFeedException
32 from feedservice.parse.text import ConvertMarkdown
33 from feedservice.parse.models import ParserException
34 from mygpo.utils import file_hash, split_list
35 from mygpo.web.logo import CoverArt
36 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
37 episodes_for_podcast_uncached
38 from mygpo.db.couchdb.podcast import podcast_for_url
39 from mygpo.directory.tags import update_category
40 from mygpo.decorators import repeat_on_conflict
41 from mygpo.db.couchdb import get_main_database
43 import socket
44 socket.setdefaulttimeout(300)
class NoPodcastCreated(Exception):
    """ Raised when no podcast object could be created for a new URL """
class NoEpisodesException(Exception):
    """ Raised when a parsed document does not contain any episodes """
55 class PodcastUpdater(object):
56 """ Updates a number of podcasts with data from their feeds """
58 def __init__(self):
59 """ Queue is an iterable of podcast objects """
60 self.db = get_main_database()
63 def update_queue(self, queue):
64 """ Fetch data for the URLs supplied as the queue iterable """
66 for n, podcast_url in enumerate(queue):
67 print n, podcast_url
68 try:
69 self.update(podcast_url)
71 except NoPodcastCreated as npc:
72 print 'no podcast created:', npc
74 print
77 def update(self, podcast_url):
78 """ Update the podcast for the supplied URL """
80 try:
81 parsed = self._fetch_feed(podcast_url)
82 self._validate_parsed(parsed)
84 except (ParserException, FetchFeedException, NoEpisodesException) as ex:
86 # if we fail to parse the URL, we don't even create the
87 # podcast object
88 p = podcast_for_url(podcast_url, create=False)
89 if p:
90 # if it exists already, we mark it as outdated
91 self._mark_outdated(p)
92 return
94 else:
95 raise NoPodcastCreated(ex)
97 assert parsed, 'fetch_feed must return something'
98 p = podcast_for_url(podcast_url, create=True)
99 self._update_podcast(p, parsed)
100 return p
103 def verify_podcast_url(self, podcast_url):
104 parsed = self._fetch_feed(podcast_url)
105 self._validate_parsed(parsed)
106 return True
109 def _fetch_feed(self, podcast_url):
110 return parse_feed(podcast_url, text_processor=ConvertMarkdown())
114 def _validate_parsed(self, parsed):
115 """ validates the parsed results and raises an exception if invalid
117 feedparser parses pretty much everything. We reject anything that
118 doesn't look like a feed"""
120 if not parsed or not parsed.episodes:
121 raise NoEpisodesException('no episodes found')
124 @repeat_on_conflict(['podcast'])
125 def _update_podcast(self, podcast, parsed):
126 """ updates a podcast according to new parser results """
128 changed = False
130 # we need that later to decide if we can "bump" a category
131 prev_latest_episode_timestamp = podcast.latest_episode_timestamp
133 changed |= update_a(podcast, 'title', parsed.title or podcast.title)
134 changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
135 changed |= update_a(podcast, 'description', parsed.description or podcast.description)
136 changed |= update_a(podcast, 'link', parsed.link or podcast.link)
137 changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
138 changed |= update_a(podcast, 'author', parsed.author or podcast.author)
139 changed |= update_a(podcast, 'language', parsed.language or podcast.language)
140 changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
141 changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
142 changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
143 changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)
144 changed |= update_a(podcast, 'flattr_url', parsed.flattr)
147 if podcast.new_location:
148 new_podcast = podcast_for_url(podcast.new_location)
149 if new_podcast != podcast:
150 self._mark_outdated(podcast, 'redirected to different podcast')
151 return
153 elif not new_podcast:
154 podcast.urls.insert(0, podcast.new_location)
155 changed = True
158 episodes = self._update_episodes(podcast, parsed.episodes)
160 # latest episode timestamp
161 eps = filter(lambda e: bool(e.released), episodes)
162 eps = sorted(eps, key=lambda e: e.released)
163 if eps:
164 changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)
165 changed |= update_a(podcast, 'episode_count', len(eps))
168 self._update_categories(podcast, prev_latest_episode_timestamp)
170 # try to download the logo and reset logo_url to None on http errors
171 found = self._save_podcast_logo(podcast.logo_url)
172 if not found:
173 changed |= update_a(podcast, 'logo_url', None)
175 if changed:
176 print 'saving podcast'
177 podcast.last_update = datetime.utcnow()
178 podcast.save()
181 assign_slug(podcast, PodcastSlug)
182 assign_missing_episode_slugs(podcast)
185 def _update_categories(self, podcast, prev_timestamp):
186 """ checks some practical requirements and updates a category """
188 from datetime import timedelta
190 max_timestamp = datetime.utcnow() + timedelta(days=1)
192 # no episodes at all
193 if not podcast.latest_episode_timestamp:
194 return
196 # no new episode
197 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
198 return
200 # too far in the future
201 if podcast.latest_episode_timestamp > max_timestamp:
202 return
204 # not enough subscribers
205 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
206 return
208 update_category(podcast)
211 @repeat_on_conflict(['podcast'])
212 def _update_episodes(self, podcast, parsed_episodes):
214 all_episodes = set(episodes_for_podcast_uncached(podcast))
215 remaining = list(all_episodes)
216 updated_episodes = []
218 for parsed_episode in parsed_episodes:
220 url = None
222 for f in parsed_episode.files:
223 if f.urls:
224 url = f.urls[0]
226 if not url:
227 continue
229 guid = parsed_episode.guid
231 # pop matchin episodes out of the "existing" list
232 matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
234 if not matching:
235 new_episode = episode_for_podcast_id_url(podcast.get_id(),
236 url, create=True)
237 matching = [new_episode]
238 all_episodes.add(new_episode)
241 for episode in matching:
242 changed = False
243 changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
244 changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
245 changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
246 changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
247 changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
248 changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released)
249 changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
250 changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
251 changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
252 changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
253 changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))
254 changed |= update_a(episode, 'flattr_url', parsed_episode.flattr)
256 urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
257 changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))
259 if changed:
260 episode.last_update = datetime.utcnow()
261 updated_episodes.append(episode)
264 outdated_episodes = all_episodes - set(updated_episodes)
266 # set episodes to be outdated, where necessary
267 for e in filter(lambda e: not e.outdated, outdated_episodes):
268 e.outdated = True
269 updated_episodes.append(e)
272 if updated_episodes:
273 print 'Updating', len(updated_episodes), 'episodes'
274 self.db.save_docs(updated_episodes)
276 return all_episodes
279 def _save_podcast_logo(self, cover_art):
280 if not cover_art:
281 return
283 try:
284 image_sha1 = hashlib.sha1(cover_art).hexdigest()
285 prefix = CoverArt.get_prefix(image_sha1)
287 filename = CoverArt.get_original(prefix, image_sha1)
288 dirname = CoverArt.get_dir(filename)
290 # get hash of existing file
291 if os.path.exists(filename):
292 with open(filename) as f:
293 old_hash = file_hash(f).digest()
294 else:
295 old_hash = ''
297 print 'LOGO @', cover_art
299 # save new cover art
300 with open(filename, 'w') as fp:
301 fp.write(urllib2.urlopen(cover_art).read())
303 # get hash of new file
304 with open(filename) as f:
305 new_hash = file_hash(f).digest()
307 # remove thumbnails if cover changed
308 if old_hash != new_hash:
309 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
310 print 'Removing %d thumbnails' % len(thumbnails)
311 for f in thumbnails:
312 os.unlink(f)
314 return cover_art
316 except (urllib2.HTTPError, urllib2.URLError, ValueError,
317 httplib.BadStatusLine) as e:
318 print e
321 @repeat_on_conflict(['podcast'])
322 def _mark_outdated(self, podcast, msg=''):
323 print 'mark outdated', msg
324 podcast.outdated = True
325 podcast.last_update = datetime.utcnow()
326 podcast.save()
327 self._update_episodes(podcast, [])
# sentinel distinguishing "attribute missing" from any real value
_none = object()

def update_a(obj, attrib, value):
    """ Assign `value` to `obj.attrib`; return True if it changed """
    previous = getattr(obj, attrib, _none)
    setattr(obj, attrib, value)
    return previous != value
def update_i(obj, item, value):
    """ Assign `value` to `obj[item]`; return True if it changed """
    missing = object()  # sentinel: distinguishes absent key from any value
    previous = obj.get(item, missing)
    obj[item] = value
    return previous != value