Fix calculation of episode_count during podcast updates
[mygpo.git] / mygpo / data / feeddownloader.py
blobcbd8dd9b8f2aa18be750599ec70f0f2ef83fcec1
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib.request, urllib.error, urllib.parse
22 from urllib.parse import urljoin
23 import http.client
24 import hashlib
25 from datetime import datetime, timedelta
26 from itertools import chain, islice
27 import socket
28 import requests
30 from django.db import transaction
31 from django.conf import settings
33 from mygpo.podcasts.models import Podcast, URL, Slug, Episode
34 from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
35 from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
36 MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
37 from mygpo.utils import file_hash, to_maxlength
38 from mygpo.web.logo import CoverArt
39 from mygpo.data.podcast import subscribe_at_hub
40 from mygpo.data.tasks import update_related_podcasts
41 from mygpo.pubsub.models import SubscriptionError
42 from mygpo.directory.tags import update_category
44 import logging
45 logger = logging.getLogger(__name__)
# upper bound on the number of episodes processed per feed update
# (see _update_episodes, which islices the parsed episode list)
MAX_EPISODES_UPDATE = 200
class UpdatePodcastException(Exception):
    """ raised when updating a podcast fails """
    pass
class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL

    Raised by update_podcast() when the feed contains no episodes and no
    Podcast exists for the URL yet; the original parse error is passed
    as the exception argument. """
class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes

    Raised by _validate_parsed() for empty or episode-less parse results. """
def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable

    Yields the result of update_podcast() for each URL.  URLs for which
    no podcast could be created are logged and skipped; any other error
    is logged and re-raised. """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except Exception:
            # was a bare ``except:``; narrowed so that e.g.
            # KeyboardInterrupt propagates without being logged here
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise
def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL

    Returns the updated Podcast object, or None when the feedservice
    could not be reached.  Raises NoPodcastCreated when the feed has no
    episodes and no podcast exists for the URL yet. """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException:
        # use the module-level logger, not the root logger
        logger.exception('Error while fetching response from feedservice')
        return

    except NoEpisodesException as nee:
        logger.warning('No episode found while parsing podcast')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))

    # episode_count may have changed while creating/updating episodes;
    # re-read the row and store the authoritative count
    p.refresh_from_db()
    p.episode_count = Episode.objects.filter(podcast=p).count()
    p.save()

    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p
def verify_podcast_url(podcast_url):
    """ Check that *podcast_url* points to a parseable feed.

    Fetch/validation errors propagate to the caller; returns True
    when the feed was fetched and validated successfully. """
    _validate_parsed(_fetch_feed(podcast_url))
    return True
def _fetch_feed(podcast_url):
    """ Fetch parsed feed data for *podcast_url* from the feedservice.

    Returns the first entry of the feedservice's JSON response.  Raises
    a requests exception on transport errors and ValueError when the
    response body is not valid JSON. """
    params = {
        'url': podcast_url,
        'process_text': 'markdown',
    }
    headers = {
        'Accept': 'application/json',
    }
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)
    try:
        return r.json()[0]
    except ValueError:
        # logging uses %-style lazy formatting; the original '{}' was
        # never interpolated, so r.text was silently dropped
        logger.exception('Error while parsing response: %s', r.text)
        raise
138 def _validate_parsed(parsed):
139 """ validates the parsed results and raises an exception if invalid
141 feedparser parses pretty much everything. We reject anything that
142 doesn't look like a feed"""
144 if not parsed or not parsed.get('episodes', []):
145 raise NoEpisodesException('no episodes found')
def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results

    Merges the parsed feed data into *podcast* (missing fields keep
    their current value), handles feed relocation, refreshes the update
    interval and latest-episode timestamp, stores the logo, saves the
    podcast and kicks off slug/related-podcast maintenance. """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    # default to an empty list so a feed without content_types doesn't
    # crash the join; the empty string then keeps the current value
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
        podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here; it is recalculated in
    # update_podcast() after all episodes have been created/updated

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        # Logger.warn is deprecated; use warning()
        logger.warning('subscribing to hub failed: %s', str(se))

    if not podcast.slug:
        slug = PodcastSlug(podcast).get_slug()
        if slug:
            podcast.add_slug(slug)

    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)
231 def _update_categories(podcast, prev_timestamp):
232 """ checks some practical requirements and updates a category """
234 max_timestamp = datetime.utcnow() + timedelta(days=1)
236 # no episodes at all
237 if not podcast.latest_episode_timestamp:
238 return
240 # no new episode
241 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
242 return
244 # too far in the future
245 if podcast.latest_episode_timestamp > max_timestamp:
246 return
248 # not enough subscribers
249 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
250 return
252 update_category(podcast)
def _update_episodes(podcast, parsed_episodes):
    """ Create/update the podcast's episodes from parsed feed data.

    Only the first MAX_EPISODES_UPDATE parsed episodes are processed.
    Existing non-outdated episodes (up to 500) that did not appear in
    the feed are marked outdated.  Returns the list of episodes that
    were created or updated — the caller (update_podcast) consumes it,
    so the return statement is required. """

    # episodes that were created or updated during this run
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(parsed_episodes))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

    return updated_episodes
@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode), or 0 when the podcast has no episodes. """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    # sort episodes with a known release date first, newest first;
    # 'pk' breaks ties deterministically
    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
    }).order_by('-has_released', '-released', 'pk')\
      .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1
324 def _save_podcast_logo(cover_art):
325 if not cover_art:
326 return
328 try:
329 image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
330 prefix = CoverArt.get_prefix(image_sha1)
332 filename = CoverArt.get_original(prefix, image_sha1)
333 dirname = CoverArt.get_dir(filename)
335 # get hash of existing file
336 if os.path.exists(filename):
337 with open(filename) as f:
338 old_hash = file_hash(f).digest()
339 else:
340 old_hash = ''
342 logger.info('Logo %s', cover_art)
344 # save new cover art
345 with open(filename, 'wb') as fp:
346 fp.write(urllib.request.urlopen(cover_art).read())
348 # get hash of new file
349 with open(filename) as f:
350 new_hash = file_hash(f).digest()
352 # remove thumbnails if cover changed
353 if old_hash != new_hash:
354 thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
355 logger.info('Removing %d thumbnails', len(thumbnails))
356 for f in thumbnails:
357 os.unlink(f)
359 return cover_art
361 except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
362 http.client.HTTPException, socket.error, IOError) as e:
363 logger.warn('Exception while updating podcast logo: %s', str(e))
def _mark_outdated(podcast, msg=''):
    """ Flag *podcast* as outdated and clear out its episode list.

    *msg* is only used for logging the reason. """
    logger.info('marking podcast outdated: %s', msg)
    podcast.last_update = datetime.utcnow()
    podcast.outdated = True
    podcast.save()
    # passing no parsed episodes marks every current episode outdated
    _update_episodes(podcast, [])
def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for file_entry in parsed_episode.get('files', []):
        urls = file_entry.get('urls', [])
        if urls:
            return urls[0]
    return None
def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode"

    Fields missing from the parsed data keep their current value.
    Saves the episode and registers any file URLs not yet known. """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    # NOTE(review): assumes at least one file with a 'filesize' key;
    # get_episode_url() only guarantees a file with URLs — confirm
    episode.filesize = parsed_episode['files'][0]['filesize']
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    # collect the distinct, non-empty mimetypes of all files;
    # .get avoids a KeyError for files without a mimetype
    episode.mimetypes = ','.join(list(set(
        filter(None, [f.get('mimetype')
                      for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)
def mark_outdated(obj):
    """ Mark *obj* as outdated unless it already is.

    Saves the object after flagging it; does nothing (returns None)
    when the object is already outdated. """
    if obj.outdated:
        return None

    obj.last_update = datetime.utcnow()
    obj.outdated = True
    obj.save()
def get_update_interval(episodes):
    """ Calculate the average interval (in hours) between episodes.

    Falls back to DEFAULT_UPDATE_INTERVAL when *episodes* is empty; the
    result is clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].
    *episodes* must be ordered by release date, oldest first. """

    num_episodes = len(episodes)
    if not num_episodes:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    # hours elapsed from the earliest release until now
    oldest = episodes[0]
    hours = (datetime.utcnow() - oldest.released).total_seconds() / 60 / 60

    raw_interval = int(hours / num_episodes)
    logger.info('%d episodes in %d days => %dh interval', num_episodes,
                hours / 24, raw_interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    return min(max(raw_interval, MIN_UPDATE_INTERVAL), MAX_UPDATE_INTERVAL)
def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name