#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from mygpo.core.models import DEFAULT_UPDATE_INTERVAL, MIN_UPDATE_INTERVAL, \
    MAX_UPDATE_INTERVAL
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from feedservice.parse.vimeo import VimeoError
from mygpo.utils import file_hash, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.pubsub.models import SubscriptionError
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_current, episode_count_for_podcast
from mygpo.db.couchdb.podcast import podcast_for_url, reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database, bulk_save_retry

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)
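
    # Usage sketch (the feed URL below is a placeholder, not a real feed):
    #
    #     updater = PodcastUpdater()
    #     for podcast in updater.update_queue(
    #             ['http://example.com/feed.xml']):
    #         logger.info('updated %s', podcast)
    #
    # URLs for which no podcast could be created are logged and skipped,
    # so the loop only sees successfully updated podcast objects.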

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error, urllib2.HTTPError) as ex:
            # ValueError is raised by feedservice for invalid IPv6 addresses

            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        # temporarily lower the global socket timeout for the fetch;
        # try/finally ensures it is restored even when parse_feed raises
        timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(10)
        try:
            return parse_feed(podcast_url, text_processor=ConvertMarkdown())
        finally:
            socket.setdefaulttimeout(timeout)

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed """

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            if not new_podcast:
                # the new URL is not known as a podcast yet; record it
                podcast.urls.insert(0, podcast.new_location)

            elif new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

        logger.info('Retrieved %d episodes in total', len(episodes))

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)

        podcast.update_interval = get_update_interval(eps)

        if eps:
            podcast.latest_episode_timestamp = eps[-1].released

        podcast.episode_count = episode_count_for_podcast(podcast)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warn('subscribing to hub failed: %s', str(se))

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # list of (obj, fun) where fun is the function to update obj
        changes = []
        episodes_to_update = list(islice(parsed_episodes, 0,
                                         MAX_EPISODES_UPDATE))
        logger.info('Parsed %d episodes, updating %d',
                    len(parsed_episodes), len(episodes_to_update))

        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n,
                        len(episodes_to_update))
            episode = episode_for_podcast_id_url(pid, url, create=True)

            update_episode = get_episode_update_function(parsed, episode,
                                                         podcast)
            changes.append((episode, update_episode))

        # determine which episodes have been found
        updated_episodes = [e for (e, f) in changes]
        logger.info('Updating %d episodes with new data',
                    len(updated_episodes))

        # and mark the remaining ones outdated
        current_episodes = set(episodes_for_podcast_current(podcast,
                                                            limit=500))
        outdated_episodes = current_episodes - set(updated_episodes)
        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        changes.extend((e, mark_outdated) for e in outdated_episodes)

        logger.info('Saving %d changes', len(changes))
        bulk_save_retry(changes, self.db)

        return updated_episodes

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art; use binary mode so the image data is not
            # mangled on platforms that translate line endings
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix,
                                                              filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error, IOError) as e:
            logger.warn('Exception while updating podcast logo: %s', str(e))

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
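
# Behaviour sketch for get_episode_url (with hypothetical parser objects):
# an episode whose first file has urls == ['http://example.com/e1.mp3']
# returns that URL; an episode whose files carry no URLs returns None and
# is skipped by _update_episodes.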


def get_episode_update_function(parsed_episode, episode, podcast):
    """ returns an update function that can be passed to bulk_save_retry """

    def update_episode(episode):
        """ updates "episode" with the data from "parsed_episode" """

        # copy the json so we can determine if there have been any changes
        old_json = copy.deepcopy(episode.to_json())

        episode.guid = parsed_episode.guid or episode.guid
        episode.description = parsed_episode.description or \
            episode.description
        episode.subtitle = parsed_episode.subtitle or episode.subtitle
        episode.content = parsed_episode.content or \
            parsed_episode.description or episode.content
        episode.link = parsed_episode.link or episode.link
        if parsed_episode.released:
            episode.released = datetime.utcfromtimestamp(
                parsed_episode.released)
        episode.author = parsed_episode.author or episode.author
        episode.duration = parsed_episode.duration or episode.duration
        episode.filesize = parsed_episode.files[0].filesize
        episode.language = parsed_episode.language or episode.language or \
            podcast.language
        episode.mimetypes = list(set(filter(None,
            [f.mimetype for f in parsed_episode.files])))
        episode.flattr_url = parsed_episode.flattr or episode.flattr_url
        episode.license = parsed_episode.license or episode.license

        urls = list(chain.from_iterable(f.urls
                                        for f in parsed_episode.files))
        episode.urls = sorted(set(episode.urls + urls), key=len)

        episode.title = parsed_episode.title or episode.title or \
            file_basename_no_extension(episode.url)

        # if nothing changed we return None to indicate no required action
        if deep_eq(old_json, episode.to_json()):
            return None

        # set last_update only if there have been changes above
        episode.last_update = datetime.utcnow()
        return episode

    return update_episode
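
# The returned closure is paired with its episode in the "changes" list
# built by _update_episodes; bulk_save_retry then invokes it (presumably
# retrying on write conflicts, as the name suggests), and a None result
# means there is nothing to save.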


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    return obj


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
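
# Worked example for get_update_interval (illustrative numbers): 20
# episodes whose earliest release was 60 days (= 1440h) ago yield
# int(1440 / 20) = 72, i.e. a 72h update interval, which is then
# clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].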


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name