Remove unused license preamble
[mygpo.git] / mygpo / data / feeddownloader.py
blob0f09a78fc4e730ca94ad662d2cc45ba4e60aebf4
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 import os.path
5 import urllib.request, urllib.error, urllib.parse
6 from urllib.parse import urljoin
7 import http.client
8 import hashlib
9 from datetime import datetime, timedelta
10 from itertools import chain, islice
11 import socket
12 import requests
14 from django.db import transaction
15 from django.conf import settings
17 from mygpo.podcasts.models import Podcast, Episode
18 from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
19 from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
20 MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
21 from mygpo.utils import file_hash, to_maxlength
22 from mygpo.web.logo import CoverArt
23 from mygpo.data.podcast import subscribe_at_hub
24 from mygpo.data.tasks import update_related_podcasts
25 from mygpo.pubsub.models import SubscriptionError
26 from mygpo.directory.tags import update_category
28 import logging
29 logger = logging.getLogger(__name__)
# Cap on the number of episodes processed per podcast update run
# (see _update_episodes, which islices the parsed episode list to this)
MAX_EPISODES_UPDATE = 200
class UpdatePodcastException(Exception):
    """ Raised when updating a podcast fails. """
    pass
class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL

    Raised by update_podcast() when fetching/parsing the feed failed and
    no existing Podcast matches the URL. """
class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes

    Raised by _validate_parsed() when the parser result is empty or has
    no 'episodes' entries. """
def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable

    Yields the updated Podcast object for each URL; URLs for which no
    podcast could be created are logged and skipped. """

    for index, url in enumerate(queue, 1):
        logger.info('Update %d - %s', index, url)
        try:
            # yield inside the try so failures from this URL are handled here
            yield update_podcast(url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except:
            # log with traceback, then let the error propagate
            logger.exception('Error while updating podcast "%s"', url)
            raise
def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL

    Returns the updated Podcast object.  Raises NoPodcastCreated when the
    feed could not be fetched or contained no episodes AND no podcast
    exists yet for the URL. """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        # use the module-level logger (was the root "logging" module),
        # consistent with the rest of this file
        logger.exception('Error while fetching response from feedservice')

        # if we fail to fetch the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(re))
            p.last_update = datetime.utcnow()
            p.save()
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(re)

    except NoEpisodesException as nee:
        # logging.warn is deprecated; use logger.warning instead
        logger.warning('No episode found while parsing podcast')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    p.refresh_from_db()
    # episode_count is maintained here after a full update; incremental
    # maintenance happens in EpisodeManager.get_or_create_for_url
    p.episode_count = Episode.objects.filter(podcast=p).count()
    p.save()
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p
def verify_podcast_url(podcast_url):
    """ Check that podcast_url can be fetched and parsed as a feed

    Returns True on success; propagates fetch/validation exceptions. """
    _validate_parsed(_fetch_feed(podcast_url))
    return True
def _fetch_feed(podcast_url):
    """ Fetch parser results for podcast_url from the feedservice

    Returns the first parsed feed from the JSON response, or None when the
    feedservice did not answer with HTTP 200. Raises ValueError when the
    response body is not valid JSON. """

    query = {
        'url': podcast_url,
        'process_text': 'markdown',
    }
    json_headers = {
        'Accept': 'application/json',
    }
    endpoint = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(endpoint, params=query, headers=json_headers, timeout=10)

    if r.status_code != 200:
        logger.error('Feed-service status code for "%s" was %s', podcast_url,
                     r.status_code)
        return None

    try:
        return r.json()[0]
    except ValueError:
        logger.exception(
            'Feed-service error while parsing response for url "%s": %s',
            podcast_url, r.text,
        )
        raise
143 def _validate_parsed(parsed):
144 """ validates the parsed results and raises an exception if invalid
146 feedparser parses pretty much everything. We reject anything that
147 doesn't look like a feed"""
149 if not parsed or not parsed.get('episodes', []):
150 raise NoEpisodesException('no episodes found')
def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results

    Applies the parsed metadata (falling back to existing values), handles
    feed relocation, recomputes the update interval, refreshes the logo,
    saves the podcast and triggers slug/hub/related-podcast maintenance. """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    # "or []" guards against a missing content_types key, which would make
    # ','.join(None) raise a TypeError
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
        podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            # if the feed redirects to a different, existing podcast we
            # only mark this one outdated and stop
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        # logger.warn is deprecated in favour of logger.warning
        logger.warning('subscribing to hub failed: %s', str(se))

    assign_slug(podcast)
    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)
def assign_slug(podcast):
    """ Assign the first free generated slug to the podcast, if it has none """
    if podcast.slug:
        return

    for slug in PodcastSlugs(podcast):
        try:
            with transaction.atomic():
                podcast.add_slug(slug)
            break

        except Exception:
            # slug could not be stored (e.g. already taken) -- try the next
            # candidate.  "except Exception" instead of a bare "except" so
            # that SystemExit/KeyboardInterrupt are not swallowed.
            continue
def assign_missing_episode_slugs(podcast):
    """ Assign a slug to every episode of the podcast that lacks one """
    common_title = podcast.get_common_episode_title()

    episodes = Episode.objects.filter(podcast=podcast, slugs__isnull=True)

    for episode in episodes:

        for slug in EpisodeSlugs(episode, common_title):
            try:
                with transaction.atomic():
                    episode.set_slug(slug)
                break

            except Exception:
                # slug taken -- try the next candidate; narrowed from a bare
                # "except" so SystemExit/KeyboardInterrupt still propagate
                continue
263 def _update_categories(podcast, prev_timestamp):
264 """ checks some practical requirements and updates a category """
266 max_timestamp = datetime.utcnow() + timedelta(days=1)
268 # no episodes at all
269 if not podcast.latest_episode_timestamp:
270 return
272 # no new episode
273 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
274 return
276 # too far in the future
277 if podcast.latest_episode_timestamp > max_timestamp:
278 return
280 # not enough subscribers
281 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
282 return
284 update_category(podcast)
def _update_episodes(podcast, parsed_episodes):
    """ Create/update episodes from parser results

    Processes at most MAX_EPISODES_UPDATE episodes, marks up to 500 of the
    podcast's remaining non-outdated episodes as outdated, and returns the
    list of updated Episode objects. """

    pid = podcast.get_id()

    # list of episodes that were updated in this run
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        # report progress against the number of episodes actually being
        # updated, not the (possibly larger, capped-away) parsed total
        logger.info('Updating episode %d / %d', n, len(episodes_to_update))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)
@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episode according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode) """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    # episodes with a known release date first, newest to oldest;
    # pk breaks ties deterministically
    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
    }).order_by('-has_released', '-released', 'pk')\
      .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        # lazy %-style logging args, consistent with the rest of the module
        # (the original used str.format, which formats eagerly)
        logger.info('Updating order from %s to %s', episode.order, new_order)
        episode.order = new_order
        episode.save()

    return num_episodes - 1
356 def _save_podcast_logo(cover_art):
357 if not cover_art:
358 return
360 try:
361 image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
362 prefix = CoverArt.get_prefix(image_sha1)
364 filename = CoverArt.get_original(prefix, image_sha1)
365 dirname = CoverArt.get_dir(filename)
367 # get hash of existing file
368 if os.path.exists(filename):
369 with open(filename, 'rb') as f:
370 old_hash = file_hash(f).digest()
371 else:
372 old_hash = ''
374 logger.info('Logo %s', cover_art)
376 # save new cover art
377 with open(filename, 'wb') as fp:
378 fp.write(urllib.request.urlopen(cover_art).read())
380 # get hash of new file
381 with open(filename, 'rb') as f:
382 new_hash = file_hash(f).digest()
384 # remove thumbnails if cover changed
385 if old_hash != new_hash:
386 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
387 logger.info('Removing %d thumbnails', len(thumbnails))
388 for f in thumbnails:
389 os.unlink(f)
391 return cover_art
393 except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
394 http.client.HTTPException, socket.error, IOError) as e:
395 logger.warn('Exception while updating podcast logo: %s', str(e))
def _mark_outdated(podcast, msg=''):
    """ Flag the podcast as outdated, record the update time and save it,
    then run an empty episode update to outdate its episodes as well. """
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
    _update_episodes(podcast, [])
def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    url_lists = (f.get('urls', []) for f in parsed_episode.get('files', []))
    return next((urls[0] for urls in url_lists if urls), None)
def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode"

    Existing values are kept when the parser result lacks a field.
    Saves the episode and registers any newly seen file URLs. """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration

    files = parsed_episode.get('files', [])

    # don't crash (IndexError/KeyError) when no file or no filesize is
    # present; keep the previously stored value instead
    if files and 'filesize' in files[0]:
        episode.filesize = files[0]['filesize']

    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language

    # deduplicated, comma-separated mimetypes of all files; .get() guards
    # against files without a mimetype entry
    mimetypes = filter(None, [f.get('mimetype') for f in files])
    episode.mimetypes = ','.join(list(set(mimetypes)))

    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in files))
    episode.add_missing_urls(parsed_urls)
def mark_outdated(obj):
    """ marks obj outdated if its not already """
    if not obj.outdated:
        obj.outdated = True
        obj.last_update = datetime.utcnow()
        obj.save()
    return None
def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    # episodes are ordered by release date, so [0] is the oldest
    earliest = episodes[0]
    now = datetime.utcnow()

    hours_since_first = (now - earliest.released).total_seconds() / 60 / 60

    interval = int(hours_since_first / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                hours_since_first / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    return min(max(interval, MIN_UPDATE_INTERVAL), MAX_UPDATE_INTERVAL)
def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    name, _extension = os.path.splitext(os.path.basename(filename))
    return name