Fix generating candidate Slugs for podcast update
[mygpo.git] / mygpo / data / feeddownloader.py
blob 34639ea8a35294b8f390c078ee893aec6937dbfe
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#
import os.path
import urllib.request, urllib.error, urllib.parse
from urllib.parse import urljoin
import http.client
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, Episode
from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)


MAX_EPISODES_UPDATE = 200

class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """

def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except Exception:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise

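
# Usage sketch (not part of the module's API): update_podcasts() is a
# generator, so feeds are only fetched as results are consumed. The feed URL
# below is a hypothetical placeholder.
#
#     for podcast in update_podcasts(['http://example.com/feed.xml']):
#         logger.info('updated %s', podcast)
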
def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        logger.exception('Error while fetching response from feedservice')

        # if we fail to fetch the feed, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(re))
            p.last_update = datetime.utcnow()
            p.save()
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(re)

    except NoEpisodesException as nee:
        logger.warning('No episode found while parsing podcast')

        # if the feed parses but contains no episodes, we don't create
        # the podcast object either
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'no episodes found while parsing feed: %s'
                           % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    p.refresh_from_db()
    p.episode_count = Episode.objects.filter(podcast=p).count()
    p.save()
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p

def verify_podcast_url(podcast_url):
    parsed = _fetch_feed(podcast_url)
    _validate_parsed(parsed)
    return True

def _fetch_feed(podcast_url):
    params = {
        'url': podcast_url,
        'process_text': 'markdown',
    }
    headers = {
        'Accept': 'application/json',
    }
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)

    if r.status_code != 200:
        logger.error('Feed-service status code for "%s" was %s', podcast_url,
                     r.status_code)
        return None

    try:
        return r.json()[0]
    except ValueError:
        logger.exception(
            'Feed-service error while parsing response for url "%s": %s',
            podcast_url, r.text,
        )
        raise

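
# The feedservice at settings.FEEDSERVICE_URL is assumed to respond with a
# JSON list of parsed feeds, one entry per requested URL. An abridged,
# hypothetical example of the shape this module relies on:
#
#     [{'title': 'Example Cast',
#       'urls': ['http://example.com/feed.xml'],
#       'episodes': [{'guid': 'ep-1',
#                     'released': 1400000000,
#                     'files': [{'urls': ['http://example.com/ep1.mp3'],
#                                'mimetype': 'audio/mpeg',
#                                'filesize': 1234567}]}]}]
#
# _fetch_feed() returns only the first entry, matching the single URL sent.
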
def _validate_parsed(parsed):
    """ validates the parsed results and raises an exception if invalid

    feedparser parses pretty much everything. We reject anything that
    doesn't look like a feed"""

    if not parsed or not parsed.get('episodes', []):
        raise NoEpisodesException('no episodes found')

def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
        podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warning('subscribing to hub failed: %s', str(se))

    assign_slug(podcast)
    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)

def assign_slug(podcast):
    if podcast.slug:
        return

    for slug in PodcastSlugs(podcast):
        try:
            with transaction.atomic():
                podcast.add_slug(slug)
            break

        except Exception:
            continue

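
# PodcastSlugs is assumed to yield candidate slugs in order of preference,
# e.g. a title-derived slug followed by disambiguated variants (hypothetical
# values: 'my-podcast', 'my-podcast-2', ...). The loop above tries each
# candidate until one inserts cleanly; wrapping add_slug() in atomic() keeps
# a failed (e.g. uniqueness-violating) insert from breaking the surrounding
# transaction, so the next candidate can still be tried.
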
def assign_missing_episode_slugs(podcast):
    common_title = podcast.get_common_episode_title()

    episodes = Episode.objects.filter(podcast=podcast, slugs__isnull=True)

    for episode in episodes:

        for slug in EpisodeSlugs(episode, common_title):
            try:
                with transaction.atomic():
                    episode.set_slug(slug)
                break

            except Exception:
                continue

def _update_categories(podcast, prev_timestamp):
    """ checks some practical requirements and updates a category """

    max_timestamp = datetime.utcnow() + timedelta(days=1)

    # no episodes at all
    if not podcast.latest_episode_timestamp:
        return

    # no new episode
    if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
        return

    # too far in the future
    if podcast.latest_episode_timestamp > max_timestamp:
        return

    # not enough subscribers
    if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
        return

    update_category(podcast)

def _update_episodes(podcast, parsed_episodes):

    pid = podcast.get_id()

    # list of episodes that were updated from the parsed data
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(parsed_episodes))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

    return updated_episodes

@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode) """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
    }).order_by('-has_released', '-released', 'pk')\
      .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1

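
# Worked example (hypothetical data): with podcast.episode_count == 3 and
# episodes A, B, C released in that order, the query yields [C, B, A] and
# the loop assigns C.order = 2, B.order = 1, A.order = 0; the function then
# returns 2 (num_episodes - 1), the order of the most recent episode.
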
def _save_podcast_logo(cover_art):
    if not cover_art:
        return

    try:
        image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                old_hash = file_hash(f).digest()
        else:
            old_hash = b''

        logger.info('Logo %s', cover_art)

        # save new cover art
        with open(filename, 'wb') as fp:
            fp.write(urllib.request.urlopen(cover_art).read())

        # get hash of new file
        with open(filename, 'rb') as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for f in thumbnails:
                os.unlink(f)

        return cover_art

    except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
            http.client.HTTPException, socket.error, IOError) as e:
        logger.warning('Exception while updating podcast logo: %s', str(e))

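
# Note: the storage location is derived from the SHA-1 of the logo *URL*,
# not of the image bytes; the old/new file hashes above only decide whether
# cached thumbnails must be invalidated. Hypothetical example:
#
#     image_sha1 = hashlib.sha1(
#         'http://example.com/logo.png'.encode('utf-8')).hexdigest()
#     # -> a 40-character hex digest, bucketed by CoverArt.get_prefix()
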
def _mark_outdated(podcast, msg=''):
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
    _update_episodes(podcast, [])

def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.get('files', []):
        if f.get('urls', []):
            return f['urls'][0]
    return None

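
# Example (hypothetical parser output): the first URL of the first file
# entry that has any URLs wins.
#
#     get_episode_url({'files': [{'urls': []},
#                                {'urls': ['http://example.com/ep1.mp3']}]})
#     # -> 'http://example.com/ep1.mp3'
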
def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    episode.filesize = parsed_episode['files'][0].get('filesize') or \
        episode.filesize
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    episode.mimetypes = ','.join(list(set(
        filter(None, [f.get('mimetype')
                      for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)

def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()

def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # clamp interval to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL]
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval

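
# Worked example (hypothetical numbers): 100 episodes whose earliest release
# was 600 hours ago give int(600 / 100) == 6, i.e. a 6h update interval,
# which is then clamped into [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].
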
def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name