#!/usr/bin/python
# -*- coding: utf-8 -*-

import os.path
import urllib.request
import urllib.error
from urllib.parse import urljoin
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, Episode
from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
from mygpo.podcasts.models import (
    DEFAULT_UPDATE_INTERVAL,
    MIN_UPDATE_INTERVAL,
    MAX_UPDATE_INTERVAL,
)
from mygpo.utils import to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category
from mygpo.search import get_index_fields

from . import models

import logging

logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """raised when no podcast obj was created for a new URL"""


class NoEpisodesException(Exception):
    """raised when parsing something that doesn't contain any episodes"""


def update_podcasts(queue):
    """Fetch data for the URLs supplied as the queue iterable"""

    for n, podcast_url in enumerate(queue, 1):
        logger.info("Update %d - %s", n, podcast_url)
        if not podcast_url:
            logger.warning("Podcast URL empty, skipping")
            continue

        try:
            updater = PodcastUpdater(podcast_url)
            yield updater.update_podcast()

        except NoPodcastCreated as npc:
            logger.info("No podcast created: %s", npc)

        except NoEpisodesException:
            logger.info("No episodes found when parsing %s", podcast_url)
            continue

        except GeneratorExit:
            pass

        except Exception:
            logger.exception('Error while updating podcast "%s"', podcast_url)
            raise
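

# Usage sketch (illustrative; in production the periodic update tasks drive
# this): update_podcasts() is a generator, so feeds are only fetched while it
# is being consumed, e.g.
#
#     for podcast in update_podcasts(["http://example.com/feed.xml"]):
#         print(podcast)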


class PodcastUpdater(object):
    """Updates the podcast specified by the podcast_url"""

    def __init__(self, podcast_url):
        self.podcast_url = (
            (podcast_url[:2046] + "..") if len(podcast_url) > 2048 else podcast_url
        )

    def update_podcast(self):
        """Update the podcast"""

        with models.PodcastUpdateResult(podcast_url=self.podcast_url) as res:

            parsed, podcast, created = self.parse_feed()

            if not podcast:
                res.podcast_created = False
                res.error_message = '"{}" could not be parsed'.format(self.podcast_url)
                return

            res.podcast = podcast
            res.podcast_created = created

            res.episodes_added = 0
            episode_updater = MultiEpisodeUpdater(podcast, res)

            if not parsed:
                # if it exists already, we mark it as outdated
                self._mark_outdated(
                    podcast, "error while fetching feed", episode_updater
                )
                return

            episode_updater.update_episodes(parsed.get("episodes", []))

            podcast.refresh_from_db()
            podcast.episode_count = episode_updater.count_episodes()
            podcast.save()

            episode_updater.order_episodes()

            self._update_podcast(podcast, parsed, episode_updater, res)

        return podcast

    def parse_feed(self):
        try:
            parsed = self._fetch_feed()
            self._validate_parsed(parsed)

        except (requests.exceptions.RequestException, NoEpisodesException) as ex:
            logger.warning("Error while fetching/parsing feed", exc_info=True)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=self.podcast_url)
                return (None, p, False)

            except Podcast.DoesNotExist as pdne:
                raise NoPodcastCreated(ex) from pdne

        # parsing went well; get or create the podcast
        podcast, created = Podcast.objects.get_or_create_for_url(self.podcast_url)

        return (parsed, podcast, created)

    def _fetch_feed(self):
        params = {"url": self.podcast_url, "process_text": "markdown"}
        headers = {"Accept": "application/json"}
        url = urljoin(settings.FEEDSERVICE_URL, "parse")
        r = requests.get(url, params=params, headers=headers, timeout=30)

        if r.status_code != 200:
            logger.error(
                'Feed-service status code for "{}" was {}'.format(url, r.status_code)
            )
            return None

        try:
            return r.json()[0]
        except ValueError:
            logger.exception(
                'Feed-service error while parsing response for url "%s": %s',
                self.podcast_url,
                r.text,
            )
            raise

    def _validate_parsed(self, parsed):
        """validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.get("episodes", []):
            raise NoEpisodesException("no episodes found")
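
    # A minimal feed-service response that would pass _validate_parsed()
    # (shape inferred from the fields read in this module, not an
    # authoritative schema):
    #
    #     {
    #         "title": "Example Podcast",
    #         "episodes": [
    #             {"guid": "e1", "files": [{"urls": ["http://example.com/1.mp3"]}]},
    #         ],
    #     }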

    def _update_podcast(self, podcast, parsed, episode_updater, update_result):
        """updates a podcast according to new parser results"""

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # will later be used to see whether the index is outdated
        old_index_fields = get_index_fields(podcast)

        podcast.title = parsed.get("title") or podcast.title
        podcast.description = parsed.get("description") or podcast.description
        podcast.subtitle = parsed.get("subtitle") or podcast.subtitle
        podcast.link = parsed.get("link") or podcast.link
        podcast.logo_url = parsed.get("logo") or podcast.logo_url

        podcast.author = to_maxlength(
            Podcast, "author", parsed.get("author") or podcast.author
        )

        podcast.language = to_maxlength(
            Podcast, "language", parsed.get("language") or podcast.language
        )

        podcast.content_types = (
            ",".join(parsed.get("content_types")) or podcast.content_types
        )

        # podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])

        podcast.common_episode_title = to_maxlength(
            Podcast,
            "common_episode_title",
            parsed.get("common_title") or podcast.common_episode_title,
        )

        podcast.new_location = parsed.get("new_location") or podcast.new_location
        podcast.flattr_url = to_maxlength(
            Podcast, "flattr_url", parsed.get("flattr") or podcast.flattr_url
        )
        podcast.hub = parsed.get("hub") or podcast.hub
        podcast.license = parsed.get("license") or podcast.license
        podcast.max_episode_order = episode_updater.max_episode_order

        podcast.add_missing_urls(parsed.get("urls", []))

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(urls__url=podcast.new_location)

                if new_podcast != podcast:
                    self._mark_outdated(
                        podcast, "redirected to different podcast", episode_updater
                    )
                    return
            except Podcast.DoesNotExist:
                podcast.set_url(podcast.new_location)

        # episodes with a release timestamp, oldest first
        episodes = Episode.objects.filter(
            podcast=podcast, released__isnull=False
        ).order_by("released")

        # the update interval is based on the intervals between episodes
        podcast.update_interval = episode_updater.get_update_interval(episodes)

        # the factor is increased / decreased depending on whether the latest
        # update has returned episodes
        if update_result.episodes_added == 0:  # no episodes, incr factor
            newfactor = podcast.update_interval_factor * 1.2
            podcast.update_interval_factor = min(1000, newfactor)  # never above 1000
        elif update_result.episodes_added > 1:  # new episodes, decr factor
            newfactor = podcast.update_interval_factor / 1.2
            podcast.update_interval_factor = max(1, newfactor)  # never below 1
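
        # Note (assumption about the scheduler, which lives outside this
        # module): the effective pause before the next update is roughly
        # update_interval * update_interval_factor, so podcasts that keep
        # returning no new episodes back off exponentially
        # (1.0 -> 1.2 -> 1.44 -> ..., capped at 1000).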

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        # podcast.episode_count is not updated here on purpose. It is,
        # instead, continuously updated when creating new episodes in
        # EpisodeManager.get_or_create_for_url

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = CoverArt.save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # check if search index should be considered out of date
        new_index_fields = get_index_fields(podcast)
        if list(old_index_fields.items()) != list(new_index_fields.items()):
            podcast.search_index_uptodate = False

        # the podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info("Saving podcast.")
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warning("subscribing to hub failed: %s", str(se))

        self.assign_slug(podcast)
        episode_updater.assign_missing_episode_slugs()
        update_related_podcasts.delay(podcast.pk)

    def assign_slug(self, podcast):
        if podcast.slug:
            return

        for slug in PodcastSlugs(podcast):
            try:
                with transaction.atomic():
                    podcast.add_slug(slug)
                break

            except Exception:
                # e.g. the slug is already taken; try the next candidate
                continue

    def _update_categories(self, podcast, prev_timestamp):
        """checks some practical requirements and updates a category"""

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and (podcast.latest_episode_timestamp <= prev_timestamp):
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _mark_outdated(self, podcast, msg, episode_updater):
        logger.info("marking podcast outdated: %s", msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        episode_updater.update_episodes([])


class MultiEpisodeUpdater(object):
    def __init__(self, podcast, update_result):
        self.podcast = podcast
        self.update_result = update_result
        self.updated_episodes = []
        self.max_episode_order = None

    def update_episodes(self, parsed_episodes):

        episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
        logger.info(
            "Parsed %d (%d) episodes", len(parsed_episodes), len(episodes_to_update)
        )

        logger.info("Updating %d episodes", len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = self.get_episode_url(parsed)
            if not url:
                logger.info("Skipping episode %d for missing URL", n)
                continue

            logger.info("Updating episode %d / %d", n, len(parsed_episodes))

            episode, created = Episode.objects.get_or_create_for_url(self.podcast, url)

            if created:
                self.update_result.episodes_added += 1

            updater = EpisodeUpdater(episode, self.podcast)
            updater.update_episode(parsed)

            self.updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=self.podcast, outdated=False)[
            :500
        ]
        outdated_episodes = set(current_episodes) - set(self.updated_episodes)

        logger.info("Marking %d episodes as outdated", len(outdated_episodes))
        for episode in outdated_episodes:
            updater = EpisodeUpdater(episode, self.podcast)
            updater.mark_outdated()

    @transaction.atomic
    def order_episodes(self):
        """Reorder the podcast's episodes according to release timestamp

        Stores the highest order value (corresponding to the most recent
        episode) in self.max_episode_order"""

        num_episodes = self.podcast.episode_count
        if not num_episodes:
            return 0

        episodes = (
            self.podcast.episode_set.all()
            .extra(select={"has_released": "released IS NOT NULL"})
            .order_by("-has_released", "-released", "pk")
            .only("pk")
        )

        for n, episode in enumerate(episodes.iterator(), 1):
            # assign ``order`` from higher (most recent) to 0 (oldest)
            # None means "unknown"
            new_order = num_episodes - n

            # optimize for new episodes that are newer than all existing
            if episode.order == new_order:
                continue

            logger.info("Updating order from %s to %s", episode.order, new_order)
            episode.order = new_order
            episode.save()

        self.max_episode_order = num_episodes - 1
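
        # Worked example (hypothetical data): with episode_count == 3 and
        # episodes C (newest), B, A (oldest), the loop assigns C -> 2,
        # B -> 1, A -> 0, and max_episode_order becomes 2.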

    def get_episode_url(self, parsed_episode):
        """returns the URL of a parsed episode"""
        for f in parsed_episode.get("files", []):
            if f.get("urls", []):
                return f["urls"][0]
        return None

    def count_episodes(self):
        return Episode.objects.filter(podcast=self.podcast).count()

    def get_update_interval(self, episodes):
        """calculates the avg interval between new episodes"""

        count = episodes.count()
        if not count:
            logger.info(
                "no episodes, using default interval of %dh", DEFAULT_UPDATE_INTERVAL
            )
            return DEFAULT_UPDATE_INTERVAL

        earliest = episodes.first()
        now = datetime.utcnow()

        timespan_s = (now - earliest.released).total_seconds()
        timespan_h = timespan_s / 60 / 60

        interval = int(timespan_h / count)
        logger.info(
            "%d episodes in %d days => %dh interval", count, timespan_h / 24, interval
        )

        # clamp interval to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL]
        interval = max(interval, MIN_UPDATE_INTERVAL)
        interval = min(interval, MAX_UPDATE_INTERVAL)

        return interval
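
    # Worked example for get_update_interval (illustrative numbers): 60
    # released episodes whose earliest release was 360 days ago span
    # 8640 hours, so the average gap is int(8640 / 60) == 144h; the result
    # is then clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].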

    def assign_missing_episode_slugs(self):
        common_title = self.podcast.get_common_episode_title()

        episodes = Episode.objects.filter(podcast=self.podcast, slugs__isnull=True)

        for episode in episodes:

            for slug in EpisodeSlugs(episode, common_title):
                try:
                    with transaction.atomic():
                        episode.set_slug(slug)
                    break

                except Exception:
                    # e.g. the slug is already taken; try the next candidate
                    continue


class EpisodeUpdater(object):
    """Updates an individual episode"""

    def __init__(self, episode, podcast):
        self.episode = episode
        self.podcast = podcast

    def update_episode(self, parsed_episode):
        """updates "episode" with the data from "parsed_episode" """

        # TODO: check if there have been any changes, to
        # avoid unnecessary updates
        self.episode.guid = to_maxlength(
            Episode, "guid", parsed_episode.get("guid") or self.episode.guid
        )

        self.episode.description = (
            parsed_episode.get("description") or self.episode.description
        )

        self.episode.subtitle = parsed_episode.get("subtitle") or self.episode.subtitle

        self.episode.content = (
            parsed_episode.get("content")
            or parsed_episode.get("description")
            or self.episode.content
        )

        self.episode.link = to_maxlength(
            Episode, "link", parsed_episode.get("link") or self.episode.link
        )

        self.episode.released = (
            datetime.utcfromtimestamp(parsed_episode.get("released"))
            if parsed_episode.get("released")
            else self.episode.released
        )

        self.episode.author = to_maxlength(
            Episode, "author", parsed_episode.get("author") or self.episode.author
        )

        self.episode.duration = parsed_episode.get("duration") or self.episode.duration

        self.episode.filesize = parsed_episode["files"][0]["filesize"]

        self.episode.language = (
            parsed_episode.get("language")
            or self.episode.language
            or self.podcast.language
        )

        mimetypes = [f["mimetype"] for f in parsed_episode.get("files", [])]
        self.episode.mimetypes = ",".join(list(set(filter(None, mimetypes))))

        self.episode.flattr_url = to_maxlength(
            Episode,
            "flattr_url",
            parsed_episode.get("flattr") or self.episode.flattr_url,
        )

        self.episode.license = parsed_episode.get("license") or self.episode.license

        self.episode.title = to_maxlength(
            Episode,
            "title",
            parsed_episode.get("title")
            or self.episode.title
            or file_basename_no_extension(self.episode.url),
        )

        self.episode.last_update = datetime.utcnow()
        self.episode.save()

        parsed_urls = list(
            chain.from_iterable(
                f.get("urls", []) for f in parsed_episode.get("files", [])
            )
        )
        self.episode.add_missing_urls(parsed_urls)

    def mark_outdated(self):
        """marks the episode outdated if it's not already"""
        if self.episode.outdated:
            return None

        self.episode.outdated = True
        self.episode.last_update = datetime.utcnow()
        self.episode.save()


def file_basename_no_extension(filename):
    """Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name


def verify_podcast_url(url):
    updater = PodcastUpdater(url)
    parsed = updater._fetch_feed()
    updater._validate_parsed(parsed)
    return True
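

# Note: verify_podcast_url() reuses the private fetch/validate helpers and
# lets their exceptions propagate, so callers should treat any requests
# exception or NoEpisodesException as "not a usable podcast feed". A quick
# manual check (illustrative; requires a configured Django environment):
#
#     python manage.py shell -c \
#         "from mygpo.data.feeddownloader import verify_podcast_url; \
#          print(verify_podcast_url('http://example.com/feed.xml'))"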