add page to manually add missing podcasts
[mygpo.git] / mygpo / data / feeddownloader.py
blob871a7989542c958cf2a6b7259ecaea214b46aa55
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain
27 from django.conf import settings
29 from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
30 PodcastSlug
31 from feedservice.parse import parse_feed, FetchFeedException
32 from feedservice.parse.text import ConvertMarkdown
33 from feedservice.parse.models import ParserException
34 from mygpo.utils import file_hash, split_list
35 from mygpo.web.logo import CoverArt
36 from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
37 episodes_for_podcast_uncached
38 from mygpo.db.couchdb.podcast import podcast_for_url
39 from mygpo.directory.tags import update_category
40 from mygpo.decorators import repeat_on_conflict
41 from mygpo.couch import get_main_database
43 import socket
44 socket.setdefaulttimeout(30)
class NoPodcastCreated(Exception):
    """ Raised when no podcast object could be created for a new URL """
class NoEpisodesException(Exception):
    """ Raised when a parsed feed does not contain any episodes """
55 class PodcastUpdater(object):
56 """ Updates a number of podcasts with data from their feeds """
58 def __init__(self):
59 """ Queue is an iterable of podcast objects """
60 self.db = get_main_database()
63 def update_queue(self, queue):
64 """ Fetch data for the URLs supplied as the queue iterable """
66 for n, podcast_url in enumerate(queue):
67 print n, podcast_url
68 try:
69 self.update(podcast_url)
71 except NoPodcastCreated as npc:
72 print 'no podcast created:', npc
73 print
76 def update(self, podcast_url):
77 """ Update the podcast for the supplied URL """
79 try:
80 parsed = self._fetch_feed(podcast_url)
81 self._validate_parsed(parsed)
83 except (ParserException, FetchFeedException) as ex:
84 # if we fail to parse the URL, we don't even create the
85 # podcast object
86 p = podcast_for_url(podcast_url, create=False)
87 if p:
88 # if it exists already, we mark it as outdated
89 self._mark_outdated(p)
91 else:
92 raise NoPodcastCreated(ex)
94 assert parsed, 'fetch_feed must return something'
95 p = podcast_for_url(podcast_url, create=True)
96 self._update_podcast(p, parsed)
97 return p
100 def verify_podcast_url(self, podcast_url):
101 parsed = self._fetch_feed(podcast_url)
102 self._validate_parsed(parsed)
103 return True
106 def _fetch_feed(self, podcast_url):
107 return parse_feed(podcast_url, text_processor=ConvertMarkdown())
111 def _validate_parsed(self, parsed):
112 """ validates the parsed results and raises an exception if invalid
114 feedparser parses pretty much everything. We reject anything that
115 doesn't look like a feed"""
117 if not parsed.episodes:
118 raise NoEpisodesException('no episodes found')
121 @repeat_on_conflict(['podcast'])
122 def _update_podcast(self, podcast, parsed):
123 """ updates a podcast according to new parser results """
125 changed = False
127 # we need that later to decide if we can "bump" a category
128 prev_latest_episode_timestamp = podcast.latest_episode_timestamp
130 changed |= update_a(podcast, 'title', parsed.title or podcast.title)
131 changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
132 changed |= update_a(podcast, 'description', parsed.description or podcast.description)
133 changed |= update_a(podcast, 'link', parsed.link or podcast.link)
134 changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
135 changed |= update_a(podcast, 'author', parsed.author or podcast.author)
136 changed |= update_a(podcast, 'language', parsed.language or podcast.language)
137 changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
138 changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
139 changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
140 changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)
143 if podcast.new_location:
144 new_podcast = podcast_for_url(podcast.new_location)
145 if new_podcast != podcast:
146 self._mark_outdated(podcast, 'redirected to different podcast')
147 return
149 elif not new_podcast:
150 podcast.urls.insert(0, podcast.new_location)
151 changed = True
154 episodes = self._update_episodes(podcast, parsed.episodes)
156 # latest episode timestamp
157 eps = filter(lambda e: bool(e.released), episodes)
158 eps = sorted(eps, key=lambda e: e.released)
159 if eps:
160 changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)
161 changed |= update_a(podcast, 'episode_count', len(eps))
164 self._update_categories(podcast, prev_latest_episode_timestamp)
166 # try to download the logo and reset logo_url to None on http errors
167 found = self._save_podcast_logo(podcast.logo_url)
168 if not found:
169 changed |= update_a(podcast, 'logo_url', None)
171 if changed:
172 print 'saving podcast'
173 podcast.last_update = datetime.utcnow()
174 podcast.save()
177 assign_slug(podcast, PodcastSlug)
178 assign_missing_episode_slugs(podcast)
181 def _update_categories(self, podcast, prev_timestamp):
182 """ checks some practical requirements and updates a category """
184 from datetime import timedelta
186 max_timestamp = datetime.utcnow() + timedelta(days=1)
188 # no episodes at all
189 if not podcast.latest_episode_timestamp:
190 return
192 # no new episode
193 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
194 return
196 # too far in the future
197 if podcast.latest_episode_timestamp > max_timestamp:
198 return
200 # not enough subscribers
201 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
202 return
204 update_category(podcast)
207 @repeat_on_conflict(['podcast'])
208 def _update_episodes(self, podcast, parsed_episodes):
210 all_episodes = set(episodes_for_podcast_uncached(podcast))
211 remaining = list(all_episodes)
212 updated_episodes = []
214 for parsed_episode in parsed_episodes:
216 url = None
218 for f in parsed_episode.files:
219 if f.urls:
220 url = f.urls[0]
222 if not url:
223 continue
225 guid = parsed_episode.guid
227 # pop matchin episodes out of the "existing" list
228 matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
230 if not matching:
231 new_episode = episode_for_podcast_id_url(podcast.get_id(),
232 url, create=True)
233 matching = [new_episode]
234 all_episodes.add(new_episode)
237 for episode in matching:
238 changed = False
239 changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
240 changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
241 changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
242 changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
243 changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
244 changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released)
245 changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
246 changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
247 changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
248 changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
249 changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))
251 urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
252 changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))
254 if changed:
255 episode.last_update = datetime.utcnow()
256 updated_episodes.append(episode)
259 outdated_episodes = all_episodes - set(updated_episodes)
261 # set episodes to be outdated, where necessary
262 for e in filter(lambda e: not e.outdated, outdated_episodes):
263 e.outdated = True
264 updated_episodes.append(e)
267 if updated_episodes:
268 print 'Updating', len(updated_episodes), 'episodes'
269 self.db.save_docs(updated_episodes)
271 return all_episodes
274 def _save_podcast_logo(self, cover_art):
275 if not cover_art:
276 return
278 try:
279 image_sha1 = hashlib.sha1(cover_art).hexdigest()
280 prefix = CoverArt.get_prefix(image_sha1)
282 filename = CoverArt.get_original(prefix, image_sha1)
283 dirname = CoverArt.get_dir(filename)
285 # get hash of existing file
286 if os.path.exists(filename):
287 with open(filename) as f:
288 old_hash = file_hash(f).digest()
289 else:
290 old_hash = ''
292 print 'LOGO @', cover_art
294 # save new cover art
295 with open(filename, 'w') as fp:
296 fp.write(urllib2.urlopen(cover_art).read())
298 # get hash of new file
299 with open(filename) as f:
300 new_hash = file_hash(f).digest()
302 # remove thumbnails if cover changed
303 if old_hash != new_hash:
304 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
305 print 'Removing %d thumbnails' % len(thumbnails)
306 for f in thumbnails:
307 os.unlink(f)
309 return cover_art
311 except urllib2.HTTPError as e:
312 print e
314 except urllib2.URLError as e:
315 print e
318 @repeat_on_conflict(['podcast'])
319 def _mark_outdated(self, podcast, msg=''):
320 print 'mark outdated', msg
321 podcast.outdated = True
322 podcast.last_update = datetime.utcnow()
323 podcast.save()
324 self._update_episodes(podcast, [])
# sentinel: lets us distinguish "attribute/key missing" from any real value
_none = object()


def update_a(obj, attrib, value):
    """ Set obj.attrib to value.

    Returns True when this changed the stored value (including the case
    where the attribute did not exist before), False otherwise. """
    previous = getattr(obj, attrib, _none)
    setattr(obj, attrib, value)
    return previous != value
def update_i(obj, item, value):
    """ Set obj[item] to value.

    Returns True when this changed the stored value (including the case
    where the key did not exist before), False otherwise. """
    if item in obj:
        differs = obj[item] != value
    else:
        differs = True
    obj[item] = value
    return differs