move socket timeout to feed-downloader command
[mygpo.git] / mygpo / data / feeddownloader.py
blobad7050d73ad5aa2781ab6bd831fd97ff658f4832
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain
27 from django.conf import settings
29 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
30 PodcastSlug
31 from feedservice.parse import parse_feed, FetchFeedException
32 from feedservice.parse.text import ConvertMarkdown
33 from feedservice.parse.models import ParserException
34 from mygpo.utils import file_hash, split_list
35 from mygpo.web.logo import CoverArt
36 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
37 episodes_for_podcast_uncached
38 from mygpo.db.couchdb.podcast import podcast_for_url
39 from mygpo.directory.tags import update_category
40 from mygpo.decorators import repeat_on_conflict
41 from mygpo.db.couchdb import get_main_database
class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL

    Raised by PodcastUpdater.update() when the feed could not be
    fetched/parsed and no podcast object exists for the URL yet. """
class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes

    Used to reject parser results that do not look like an actual
    podcast feed (see PodcastUpdater._validate_parsed). """
52 class PodcastUpdater(object):
53 """ Updates a number of podcasts with data from their feeds """
55 def __init__(self):
56 """ Queue is an iterable of podcast objects """
57 self.db = get_main_database()
60 def update_queue(self, queue):
61 """ Fetch data for the URLs supplied as the queue iterable """
63 for n, podcast_url in enumerate(queue):
64 print n, podcast_url
65 try:
66 yield self.update(podcast_url)
68 except NoPodcastCreated as npc:
69 print 'no podcast created:', npc
71 print
74 def update(self, podcast_url):
75 """ Update the podcast for the supplied URL """
77 try:
78 parsed = self._fetch_feed(podcast_url)
79 self._validate_parsed(parsed)
81 except (ParserException, FetchFeedException, NoEpisodesException) as ex:
83 # if we fail to parse the URL, we don't even create the
84 # podcast object
85 p = podcast_for_url(podcast_url, create=False)
86 if p:
87 # if it exists already, we mark it as outdated
88 self._mark_outdated(p)
89 return
91 else:
92 raise NoPodcastCreated(ex)
94 assert parsed, 'fetch_feed must return something'
95 p = podcast_for_url(podcast_url, create=True)
96 self._update_podcast(p, parsed)
97 return p
100 def verify_podcast_url(self, podcast_url):
101 parsed = self._fetch_feed(podcast_url)
102 self._validate_parsed(parsed)
103 return True
106 def _fetch_feed(self, podcast_url):
107 return parse_feed(podcast_url, text_processor=ConvertMarkdown())
111 def _validate_parsed(self, parsed):
112 """ validates the parsed results and raises an exception if invalid
114 feedparser parses pretty much everything. We reject anything that
115 doesn't look like a feed"""
117 if not parsed or not parsed.episodes:
118 raise NoEpisodesException('no episodes found')
121 @repeat_on_conflict(['podcast'])
122 def _update_podcast(self, podcast, parsed):
123 """ updates a podcast according to new parser results """
125 changed = False
127 # we need that later to decide if we can "bump" a category
128 prev_latest_episode_timestamp = podcast.latest_episode_timestamp
130 changed |= update_a(podcast, 'title', parsed.title or podcast.title)
131 changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
132 changed |= update_a(podcast, 'description', parsed.description or podcast.description)
133 changed |= update_a(podcast, 'link', parsed.link or podcast.link)
134 changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
135 changed |= update_a(podcast, 'author', parsed.author or podcast.author)
136 changed |= update_a(podcast, 'language', parsed.language or podcast.language)
137 changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
138 changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
139 changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
140 changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)
141 changed |= update_a(podcast, 'flattr_url', parsed.flattr)
144 if podcast.new_location:
145 new_podcast = podcast_for_url(podcast.new_location)
146 if new_podcast != podcast:
147 self._mark_outdated(podcast, 'redirected to different podcast')
148 return
150 elif not new_podcast:
151 podcast.urls.insert(0, podcast.new_location)
152 changed = True
155 episodes = self._update_episodes(podcast, parsed.episodes)
157 # latest episode timestamp
158 eps = filter(lambda e: bool(e.released), episodes)
159 eps = sorted(eps, key=lambda e: e.released)
160 if eps:
161 changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)
162 changed |= update_a(podcast, 'episode_count', len(eps))
165 self._update_categories(podcast, prev_latest_episode_timestamp)
167 # try to download the logo and reset logo_url to None on http errors
168 found = self._save_podcast_logo(podcast.logo_url)
169 if not found:
170 changed |= update_a(podcast, 'logo_url', None)
172 if changed:
173 print 'saving podcast'
174 podcast.last_update = datetime.utcnow()
175 podcast.save()
178 assign_slug(podcast, PodcastSlug)
179 assign_missing_episode_slugs(podcast)
182 def _update_categories(self, podcast, prev_timestamp):
183 """ checks some practical requirements and updates a category """
185 from datetime import timedelta
187 max_timestamp = datetime.utcnow() + timedelta(days=1)
189 # no episodes at all
190 if not podcast.latest_episode_timestamp:
191 return
193 # no new episode
194 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
195 return
197 # too far in the future
198 if podcast.latest_episode_timestamp > max_timestamp:
199 return
201 # not enough subscribers
202 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
203 return
205 update_category(podcast)
208 @repeat_on_conflict(['podcast'])
209 def _update_episodes(self, podcast, parsed_episodes):
211 all_episodes = set(episodes_for_podcast_uncached(podcast))
212 remaining = list(all_episodes)
213 updated_episodes = []
215 for parsed_episode in parsed_episodes:
217 url = None
219 for f in parsed_episode.files:
220 if f.urls:
221 url = f.urls[0]
223 if not url:
224 continue
226 guid = parsed_episode.guid
228 # pop matchin episodes out of the "existing" list
229 matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
231 if not matching:
232 new_episode = episode_for_podcast_id_url(podcast.get_id(),
233 url, create=True)
234 matching = [new_episode]
235 all_episodes.add(new_episode)
238 for episode in matching:
239 changed = False
240 changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
241 changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
242 changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
243 changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
244 changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
245 changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released)
246 changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
247 changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
248 changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
249 changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
250 changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))
251 changed |= update_a(episode, 'flattr_url', parsed_episode.flattr)
253 urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
254 changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))
256 if changed:
257 episode.last_update = datetime.utcnow()
258 updated_episodes.append(episode)
261 outdated_episodes = all_episodes - set(updated_episodes)
263 # set episodes to be outdated, where necessary
264 for e in filter(lambda e: not e.outdated, outdated_episodes):
265 e.outdated = True
266 updated_episodes.append(e)
269 if updated_episodes:
270 print 'Updating', len(updated_episodes), 'episodes'
271 self.db.save_docs(updated_episodes)
273 return all_episodes
276 def _save_podcast_logo(self, cover_art):
277 if not cover_art:
278 return
280 try:
281 image_sha1 = hashlib.sha1(cover_art).hexdigest()
282 prefix = CoverArt.get_prefix(image_sha1)
284 filename = CoverArt.get_original(prefix, image_sha1)
285 dirname = CoverArt.get_dir(filename)
287 # get hash of existing file
288 if os.path.exists(filename):
289 with open(filename) as f:
290 old_hash = file_hash(f).digest()
291 else:
292 old_hash = ''
294 print 'LOGO @', cover_art
296 # save new cover art
297 with open(filename, 'w') as fp:
298 fp.write(urllib2.urlopen(cover_art).read())
300 # get hash of new file
301 with open(filename) as f:
302 new_hash = file_hash(f).digest()
304 # remove thumbnails if cover changed
305 if old_hash != new_hash:
306 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
307 print 'Removing %d thumbnails' % len(thumbnails)
308 for f in thumbnails:
309 os.unlink(f)
311 return cover_art
313 except (urllib2.HTTPError, urllib2.URLError, ValueError,
314 httplib.BadStatusLine) as e:
315 print e
318 @repeat_on_conflict(['podcast'])
319 def _mark_outdated(self, podcast, msg=''):
320 print 'mark outdated', msg
321 podcast.outdated = True
322 podcast.last_update = datetime.utcnow()
323 podcast.save()
324 self._update_episodes(podcast, [])
# Sentinel distinguishing "attribute/key is missing" from any real value,
# including None.
_none = object()


def update_a(obj, attrib, value):
    """ Assigns value to obj.attrib; returns True iff this was a change """
    previous = getattr(obj, attrib, _none)
    setattr(obj, attrib, value)
    return previous != value


def update_i(obj, item, value):
    """ Assigns value to obj[item]; returns True iff this was a change """
    previous = obj.get(item, _none)
    obj[item] = value
    return previous != value