Update podcasts based on avg update interval
[mygpo.git] / mygpo / data / feeddownloader.py
blob 81b525726106aa95dd2dd7944938b97c56809b76
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from mygpo.core.models import DEFAULT_UPDATE_INTERVAL, MIN_UPDATE_INTERVAL, \
    MAX_UPDATE_INTERVAL
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from mygpo.utils import file_hash, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_current
from mygpo.db.couchdb.podcast import podcast_for_url, reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database, bulk_save_retry

import logging
logger = logging.getLogger(__name__)


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        """ Sets up the database connection used when saving updates """
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied in the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)
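
    # Usage sketch (hypothetical; in mygpo the queue of feed URLs is
    # normally supplied by a management command). update_queue is a
    # generator, so podcasts are updated lazily as it is consumed:
    #
    #     updater = PodcastUpdater()
    #     for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #         print podcast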

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException) as ex:

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                        str(ex))
                return p

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed """

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        old_json = copy.deepcopy(podcast.to_json())

        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            # if the new URL already belongs to a different podcast,
            # this one is a duplicate and gets marked outdated
            if new_podcast and new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

            # if no podcast exists for the new URL yet, claim it
            elif not new_podcast:
                podcast.urls.insert(0, podcast.new_location)

        logger.info('Retrieved %d episodes in total', len(episodes))

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)

        podcast.update_interval = get_update_interval(eps)

        if eps:
            podcast.latest_episode_timestamp = eps[-1].released
            podcast.episode_count = len(eps)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()

        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # list of (obj, fun) where fun is the function to update obj
        changes = []
        logger.info('Parsed %d episodes', len(parsed_episodes))

        for n, parsed in enumerate(parsed_episodes, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d: missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))
            episode = episode_for_podcast_id_url(pid, url, create=True)

            update_episode = get_episode_update_function(parsed, episode,
                    podcast)
            changes.append((episode, update_episode))

        # determine which episodes have been found
        updated_episodes = [e for (e, f) in changes]
        logger.info('Updating %d episodes with new data', len(updated_episodes))

        # and mark the remaining ones outdated
        current_episodes = set(episodes_for_podcast_current(podcast, limit=100))
        outdated_episodes = current_episodes - set(updated_episodes)
        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        changes.extend((e, mark_outdated) for e in outdated_episodes)

        logger.info('Saving %d changes', len(changes))
        bulk_save_retry(changes, self.db)

        return updated_episodes

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while saving podcast logo: %s', str(e))

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
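
# A parsed episode is expected to carry a list of files, each with a list
# of candidate URLs; the first URL of the first file that has any becomes
# the episode's identifying URL. A sketch with hypothetical stand-in
# classes (the real objects come from feedservice.parse):
#
#     >>> class F(object):
#     ...     def __init__(self, urls): self.urls = urls
#     >>> class E(object):
#     ...     def __init__(self, files): self.files = files
#     >>> get_episode_url(E([F([]), F(['http://example.com/ep1.mp3'])]))
#     'http://example.com/ep1.mp3'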


def get_episode_update_function(parsed_episode, episode, podcast):
    """ returns an update function that can be passed to bulk_save_retry """

    def update_episode(episode):
        """ updates "episode" with the data from "parsed_episode" """

        # copy the json so we can determine if there have been any changes
        old_json = copy.deepcopy(episode.to_json())

        episode.guid = parsed_episode.guid or episode.guid
        episode.title = parsed_episode.title or episode.title
        episode.description = parsed_episode.description or episode.description
        episode.subtitle = parsed_episode.subtitle or episode.subtitle
        episode.content = parsed_episode.content or parsed_episode.description or episode.content
        episode.link = parsed_episode.link or episode.link
        episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released
        episode.author = parsed_episode.author or episode.author
        episode.duration = parsed_episode.duration or episode.duration
        episode.filesize = parsed_episode.files[0].filesize
        episode.language = parsed_episode.language or episode.language or \
            podcast.language
        episode.mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
        episode.flattr_url = parsed_episode.flattr or episode.flattr_url
        episode.license = parsed_episode.license or episode.license

        urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
        episode.urls = sorted(set(episode.urls + urls), key=len)

        # if nothing changed we return None to indicate no required action
        if deep_eq(old_json, episode.to_json()):
            return None

        # set the last_update only if there have been changes above
        episode.last_update = datetime.utcnow()
        return episode

    return update_episode
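
# bulk_save_retry consumes the (obj, update_function) pairs built above:
# conceptually it applies each function to its object, bulk-saves the
# non-None results, and retries conflicting documents with freshly loaded
# copies. A rough sketch of that contract (the actual implementation lives
# in mygpo.db.couchdb and differs in detail):
#
#     def bulk_save_retry(changes, db):
#         docs = filter(None, (fun(obj) for obj, fun in changes))
#         db.save_docs(docs)  # conflicting docs are reloaded and retried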


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    return obj


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if count <= 1:
        logger.info('%d episodes, using default interval of %dh',
                count, DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    latest = episodes[-1]

    timespan_s = (latest.released - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    # count episodes span (count - 1) intervals
    interval = int(timespan_h / (count - 1))
    logger.info('%d episodes in %d days => %dh interval', count,
            timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
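
# Worked example: 11 episodes released over 30 days span 720 hours and 10
# intervals, so the average is int(720 / 10) = 72, i.e. the podcast is
# re-checked roughly every three days. Sketch with a hypothetical Episode
# stand-in, assuming 72h lies between MIN_UPDATE_INTERVAL and
# MAX_UPDATE_INTERVAL (their values are defined in mygpo.core.models):
#
#     >>> from collections import namedtuple
#     >>> Ep = namedtuple('Ep', 'released')
#     >>> start = datetime(2013, 1, 1)
#     >>> eps = [Ep(start + timedelta(days=3 * i)) for i in range(11)]
#     >>> get_update_interval(eps)
#     72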