feed-downloader fixes
[mygpo.git] / mygpo / data / feeddownloader.py
blob 13ac802c481134a6df2fd7047617bc7ba4243c15
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain
27 from django.conf import settings
29 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
30 PodcastSlug
31 from feedservice.parse import parse_feed, FetchFeedException
32 from feedservice.parse.text import ConvertMarkdown
33 from feedservice.parse.models import ParserException
34 from mygpo.utils import file_hash, split_list
35 from mygpo.web.logo import CoverArt
36 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
37 episodes_for_podcast_uncached
38 from mygpo.db.couchdb.podcast import podcast_for_url
39 from mygpo.directory.tags import update_category
40 from mygpo.decorators import repeat_on_conflict
41 from mygpo.couch import get_main_database
import socket
# Some feed servers respond extremely slowly or hang; set a global
# default timeout (5 minutes) so feed fetches and logo downloads
# eventually abort instead of blocking the updater forever.
socket.setdefaulttimeout(300)
class NoPodcastCreated(Exception):
    """ Signals that a new URL did not yield a podcast object

    Raised by PodcastUpdater.update() when a feed cannot be fetched or
    parsed and no podcast exists yet for the URL. """
class NoEpisodesException(Exception):
    """ Signals that parsed feed data contains no episodes

    Raised during validation when the parser result does not look like
    a usable podcast feed. """
55 class PodcastUpdater(object):
56 """ Updates a number of podcasts with data from their feeds """
58 def __init__(self):
59 """ Queue is an iterable of podcast objects """
60 self.db = get_main_database()
63 def update_queue(self, queue):
64 """ Fetch data for the URLs supplied as the queue iterable """
66 for n, podcast_url in enumerate(queue):
67 print n, podcast_url
68 try:
69 self.update(podcast_url)
71 except NoPodcastCreated as npc:
72 print 'no podcast created:', npc
74 print
77 def update(self, podcast_url):
78 """ Update the podcast for the supplied URL """
80 try:
81 parsed = self._fetch_feed(podcast_url)
82 self._validate_parsed(parsed)
84 except (ParserException, FetchFeedException, NoEpisodesException) as ex:
86 # if we fail to parse the URL, we don't even create the
87 # podcast object
88 p = podcast_for_url(podcast_url, create=False)
89 if p:
90 # if it exists already, we mark it as outdated
91 self._mark_outdated(p)
92 return
94 else:
95 raise NoPodcastCreated(ex)
97 assert parsed, 'fetch_feed must return something'
98 p = podcast_for_url(podcast_url, create=True)
99 self._update_podcast(p, parsed)
100 return p
103 def verify_podcast_url(self, podcast_url):
104 parsed = self._fetch_feed(podcast_url)
105 self._validate_parsed(parsed)
106 return True
109 def _fetch_feed(self, podcast_url):
110 return parse_feed(podcast_url, text_processor=ConvertMarkdown())
114 def _validate_parsed(self, parsed):
115 """ validates the parsed results and raises an exception if invalid
117 feedparser parses pretty much everything. We reject anything that
118 doesn't look like a feed"""
120 if not parsed or not parsed.episodes:
121 raise NoEpisodesException('no episodes found')
124 @repeat_on_conflict(['podcast'])
125 def _update_podcast(self, podcast, parsed):
126 """ updates a podcast according to new parser results """
128 changed = False
130 # we need that later to decide if we can "bump" a category
131 prev_latest_episode_timestamp = podcast.latest_episode_timestamp
133 changed |= update_a(podcast, 'title', parsed.title or podcast.title)
134 changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
135 changed |= update_a(podcast, 'description', parsed.description or podcast.description)
136 changed |= update_a(podcast, 'link', parsed.link or podcast.link)
137 changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
138 changed |= update_a(podcast, 'author', parsed.author or podcast.author)
139 changed |= update_a(podcast, 'language', parsed.language or podcast.language)
140 changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
141 changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
142 changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
143 changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)
146 if podcast.new_location:
147 new_podcast = podcast_for_url(podcast.new_location)
148 if new_podcast != podcast:
149 self._mark_outdated(podcast, 'redirected to different podcast')
150 return
152 elif not new_podcast:
153 podcast.urls.insert(0, podcast.new_location)
154 changed = True
157 episodes = self._update_episodes(podcast, parsed.episodes)
159 # latest episode timestamp
160 eps = filter(lambda e: bool(e.released), episodes)
161 eps = sorted(eps, key=lambda e: e.released)
162 if eps:
163 changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)
164 changed |= update_a(podcast, 'episode_count', len(eps))
167 self._update_categories(podcast, prev_latest_episode_timestamp)
169 # try to download the logo and reset logo_url to None on http errors
170 found = self._save_podcast_logo(podcast.logo_url)
171 if not found:
172 changed |= update_a(podcast, 'logo_url', None)
174 if changed:
175 print 'saving podcast'
176 podcast.last_update = datetime.utcnow()
177 podcast.save()
180 assign_slug(podcast, PodcastSlug)
181 assign_missing_episode_slugs(podcast)
184 def _update_categories(self, podcast, prev_timestamp):
185 """ checks some practical requirements and updates a category """
187 from datetime import timedelta
189 max_timestamp = datetime.utcnow() + timedelta(days=1)
191 # no episodes at all
192 if not podcast.latest_episode_timestamp:
193 return
195 # no new episode
196 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
197 return
199 # too far in the future
200 if podcast.latest_episode_timestamp > max_timestamp:
201 return
203 # not enough subscribers
204 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
205 return
207 update_category(podcast)
210 @repeat_on_conflict(['podcast'])
211 def _update_episodes(self, podcast, parsed_episodes):
213 all_episodes = set(episodes_for_podcast_uncached(podcast))
214 remaining = list(all_episodes)
215 updated_episodes = []
217 for parsed_episode in parsed_episodes:
219 url = None
221 for f in parsed_episode.files:
222 if f.urls:
223 url = f.urls[0]
225 if not url:
226 continue
228 guid = parsed_episode.guid
230 # pop matchin episodes out of the "existing" list
231 matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
233 if not matching:
234 new_episode = episode_for_podcast_id_url(podcast.get_id(),
235 url, create=True)
236 matching = [new_episode]
237 all_episodes.add(new_episode)
240 for episode in matching:
241 changed = False
242 changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
243 changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
244 changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
245 changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
246 changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
247 changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released)
248 changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
249 changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
250 changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
251 changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
252 changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))
254 urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
255 changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))
257 if changed:
258 episode.last_update = datetime.utcnow()
259 updated_episodes.append(episode)
262 outdated_episodes = all_episodes - set(updated_episodes)
264 # set episodes to be outdated, where necessary
265 for e in filter(lambda e: not e.outdated, outdated_episodes):
266 e.outdated = True
267 updated_episodes.append(e)
270 if updated_episodes:
271 print 'Updating', len(updated_episodes), 'episodes'
272 self.db.save_docs(updated_episodes)
274 return all_episodes
277 def _save_podcast_logo(self, cover_art):
278 if not cover_art:
279 return
281 try:
282 image_sha1 = hashlib.sha1(cover_art).hexdigest()
283 prefix = CoverArt.get_prefix(image_sha1)
285 filename = CoverArt.get_original(prefix, image_sha1)
286 dirname = CoverArt.get_dir(filename)
288 # get hash of existing file
289 if os.path.exists(filename):
290 with open(filename) as f:
291 old_hash = file_hash(f).digest()
292 else:
293 old_hash = ''
295 print 'LOGO @', cover_art
297 # save new cover art
298 with open(filename, 'w') as fp:
299 fp.write(urllib2.urlopen(cover_art).read())
301 # get hash of new file
302 with open(filename) as f:
303 new_hash = file_hash(f).digest()
305 # remove thumbnails if cover changed
306 if old_hash != new_hash:
307 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
308 print 'Removing %d thumbnails' % len(thumbnails)
309 for f in thumbnails:
310 os.unlink(f)
312 return cover_art
314 except (urllib2.HTTPError, urllib2.URLError, ValueError,
315 httplib.BadStatusLine) as e:
316 print e
319 @repeat_on_conflict(['podcast'])
320 def _mark_outdated(self, podcast, msg=''):
321 print 'mark outdated', msg
322 podcast.outdated = True
323 podcast.last_update = datetime.utcnow()
324 podcast.save()
325 self._update_episodes(podcast, [])
# sentinel: distinguishes "attribute/key missing" from any real value
_none = object()


def update_a(obj, attrib, value):
    """ Assigns value to obj.attrib

    Returns True if the new value differs from the previous one
    (a missing attribute counts as different). """
    previous = getattr(obj, attrib, _none)
    setattr(obj, attrib, value)
    return previous != value
def update_i(obj, item, value):
    """ Stores value under obj[item]

    Returns True if the new value differs from the previous one
    (a missing key counts as different). """
    previous = obj.get(item, _none)
    obj[item] = value
    return previous != value