[Feeds] fix variable name in feeddownloader
[mygpo.git] / mygpo/data/feeddownloader.py
blob 6f24fb1bd572ce86897826bbf581389c20388da8
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import os.path
import urllib2
from urlparse import urljoin
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket
import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise
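
# A rough usage sketch (the URL is made up); update_podcasts is a generator,
# so feeds are only fetched as the caller iterates over the results:
#
#     for podcast in update_podcasts(['http://example.com/feed.xml']):
#         logger.info('updated %s', podcast)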


def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        logger.exception('Error while fetching response from feedservice')
        raise NoPodcastCreated(re)

    except NoEpisodesException as nee:
        logger.warn('No episode found while parsing podcast')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p


def verify_podcast_url(podcast_url):
    parsed = _fetch_feed(podcast_url)
    _validate_parsed(parsed)
    return True


def _fetch_feed(podcast_url):
    params = {'url': podcast_url}
    headers = {
        'Accept': 'application/json',
    }
    # markdown and other parameters?
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)
    return r.json()[0]
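
# The feedservice 'parse' endpoint responds with a JSON list containing one
# parsed-feed dict per requested URL, hence the [0] above. The sketch below
# only shows the fields this module actually reads; it is an assumption
# about the response shape, not the full feedservice schema:
#
#     [{
#         'title': '...',
#         'urls': ['...'],
#         'episodes': [
#             {'guid': '...', 'title': '...', 'released': 1409419200,
#              'files': [{'urls': ['...'], 'filesize': 0, 'mimetype': '...'}]},
#         ],
#     }]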


def _validate_parsed(parsed):
    """ validates the parsed results and raises an exception if invalid

    feedparser parses pretty much everything. We reject anything that
    doesn't look like a feed"""

    if not parsed or not parsed.get('episodes', []):
        raise NoEpisodesException('no episodes found')
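
# For illustration: _validate_parsed({}) and _validate_parsed({'episodes': []})
# both raise NoEpisodesException, while any dict with a non-empty 'episodes'
# list passes.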


def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    podcast.content_types = ','.join(parsed.get('content_types') or
                                     podcast.content_types)
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warn('subscribing to hub failed: %s', str(se))

    if not podcast.slug:
        slug = PodcastSlug(podcast).get_slug()
        if slug:
            podcast.add_slug(slug)

    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)


def _update_categories(podcast, prev_timestamp):
    """ checks some practical requirements and updates a category """

    max_timestamp = datetime.utcnow() + timedelta(days=1)

    # no episodes at all
    if not podcast.latest_episode_timestamp:
        return

    # no new episode
    if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
        return

    # too far in the future
    if podcast.latest_episode_timestamp > max_timestamp:
        return

    # not enough subscribers
    if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
        return

    update_category(podcast)


def _update_episodes(podcast, parsed_episodes):

    pid = podcast.get_id()

    # list of episode objects that were updated from the parsed feed
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(parsed_episodes))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)


@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode) """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
        })\
        .order_by('-has_released', '-released', 'pk')\
        .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1
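
# Worked example: for a podcast with episode_count == 3, the episodes are
# iterated newest-first and assigned order 2, 1, 0, and the function returns
# num_episodes - 1 == 2. Episodes without a release date sort last because
# of the '-has_released' ordering.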


def _save_podcast_logo(cover_art):
    if not cover_art:
        return

    try:
        image_sha1 = hashlib.sha1(cover_art).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file
        if os.path.exists(filename):
            with open(filename) as f:
                old_hash = file_hash(f).digest()
        else:
            old_hash = ''

        logger.info('Logo %s', cover_art)

        # save new cover art
        with open(filename, 'w') as fp:
            fp.write(urllib2.urlopen(cover_art).read())

        # get hash of new file
        with open(filename) as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for f in thumbnails:
                os.unlink(f)

        return cover_art

    except (urllib2.HTTPError, urllib2.URLError, ValueError,
            httplib.BadStatusLine, socket.error, IOError) as e:
        logger.warn('Exception while updating podcast logo: %s', str(e))
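
# Note for callers: _save_podcast_logo returns the cover_art URL on success
# and falls through to an implicit None on a missing URL or any of the
# errors above, which is what lets _update_podcast reset logo_url when the
# download fails.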


def _mark_outdated(podcast, msg=''):
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
    _update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.get('files', []):
        if f.get('urls', []):
            return f['urls'][0]
    return None
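
# For illustration (values made up), in the doctest style used at the bottom
# of this file:
#
#     >>> get_episode_url({'files': [{'urls': ['http://example.com/e1.mp3']}]})
#     'http://example.com/e1.mp3'
#
# A parsed episode whose files carry no URLs yields None.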


def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    episode.filesize = parsed_episode['files'][0]['filesize']
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    episode.mimetypes = ','.join(list(set(
        filter(None, [f['mimetype'] for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
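
# Worked example (made-up numbers): 12 episodes whose earliest release was
# 30 days (720 hours) ago give int(720 / 12) = 60, i.e. a 60h update
# interval, which is then clamped into [MIN_UPDATE_INTERVAL,
# MAX_UPDATE_INTERVAL].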


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name