# -*- coding: utf-8 -*-

import os.path
import logging
from urllib.parse import urljoin
from datetime import datetime, timedelta
from itertools import chain, islice

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, Episode
from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
from mygpo.podcasts.models import (
    DEFAULT_UPDATE_INTERVAL,
    MIN_UPDATE_INTERVAL,
    MAX_UPDATE_INTERVAL,
)
from mygpo.utils import to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.data import models  # provides models.PodcastUpdateResult used below
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category
from mygpo.search import get_index_fields

logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """raised when no podcast obj was created for a new URL"""


class NoEpisodesException(Exception):
    """raised when parsing something that doesn't contain any episodes"""


def update_podcasts(queue):
    """Fetch data for the URLs supplied as the queue iterable"""

    for n, podcast_url in enumerate(queue, 1):
        logger.info("Update %d - %s", n, podcast_url)
        if not podcast_url:
            logger.warning("Podcast URL empty, skipping")
            continue

        try:
            updater = PodcastUpdater(podcast_url)
            yield updater.update_podcast()

        except NoPodcastCreated as npc:
            logger.info("No podcast created: %s", npc)

        except NoEpisodesException:
            logger.info(f"No episodes found when parsing {podcast_url}")

        except Exception:
            logger.exception('Error while updating podcast "%s"', podcast_url)
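

# update_podcasts is a generator: feeds are only fetched as the results are
# consumed. A minimal usage sketch (hypothetical URL):
#
#     for podcast in update_podcasts(["http://example.com/feed.xml"]):
#         print(podcast)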


class PodcastUpdater(object):
    """Updates the podcast specified by the podcast_url"""

    def __init__(self, podcast_url):
        # truncate overlong URLs to fit the 2048-char database field
        self.podcast_url = (
            (podcast_url[:2046] + "..") if len(podcast_url) > 2048 else podcast_url
        )

    def update_podcast(self):
        """Update the podcast"""

        with models.PodcastUpdateResult(podcast_url=self.podcast_url) as res:

            parsed, podcast, created = self.parse_feed()

            if not podcast:
                res.podcast_created = False
                res.error_message = '"{}" could not be parsed'.format(self.podcast_url)
                return

            res.podcast = podcast
            res.podcast_created = created

            res.episodes_added = 0
            episode_updater = MultiEpisodeUpdater(podcast, res)

            if not parsed:
                # if the podcast exists already, we mark it as outdated
                self._mark_outdated(
                    podcast, "error while fetching feed", episode_updater
                )
                return

            episode_updater.update_episodes(parsed.get("episodes", []))

            podcast.refresh_from_db()
            podcast.episode_count = episode_updater.count_episodes()

            episode_updater.order_episodes()

            self._update_podcast(podcast, parsed, episode_updater, res)

            return podcast

    def parse_feed(self):
        try:
            parsed = self._fetch_feed()
            self._validate_parsed(parsed)

        except (requests.exceptions.RequestException, NoEpisodesException) as ex:
            logger.warning("Error while fetching/parsing feed", exc_info=True)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=self.podcast_url)
                return (None, p, False)

            except Podcast.DoesNotExist as pdne:
                raise NoPodcastCreated(ex) from pdne

        # Parsing went well, get podcast
        podcast, created = Podcast.objects.get_or_create_for_url(self.podcast_url)

        return (parsed, podcast, created)

    def _fetch_feed(self):
        params = {"url": self.podcast_url, "process_text": "markdown"}
        headers = {"Accept": "application/json"}
        url = urljoin(settings.FEEDSERVICE_URL, "parse")
        r = requests.get(url, params=params, headers=headers, timeout=30)

        if r.status_code != 200:
            logger.error(
                'Feed-service status code for "{}" was {}'.format(url, r.status_code)
            )
            return None

        try:
            return r.json()
        except ValueError as ve:
            # r.json() raises a ValueError subclass when the body is not valid JSON
            logger.error(
                'Feed-service error while parsing response for url "%s": %s',
                self.podcast_url,
                ve,
                exc_info=True,
            )
            return None
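
    # For reference, a trimmed sketch of the feed-service JSON consumed below.
    # The field names are taken from the accesses in this module; the values
    # are illustrative only:
    #
    #     {
    #         "title": "Example Podcast",
    #         "link": "http://example.com/",
    #         "episodes": [
    #             {
    #                 "guid": "...",
    #                 "title": "Episode 1",
    #                 "released": 1500000000,  # unix timestamp
    #                 "files": [
    #                     {
    #                         "urls": ["http://example.com/ep1.mp3"],
    #                         "filesize": 1234,
    #                         "mimetype": "audio/mpeg",
    #                     }
    #                 ],
    #             }
    #         ],
    #     }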

    def _validate_parsed(self, parsed):
        """validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.get("episodes", []):
            raise NoEpisodesException("no episodes found")

    def _update_podcast(self, podcast, parsed, episode_updater, update_result):
        """updates a podcast according to new parser results"""

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # will later be used to see whether the index is outdated
        old_index_fields = get_index_fields(podcast)

        podcast.title = parsed.get("title") or podcast.title
        podcast.description = parsed.get("description") or podcast.description
        podcast.subtitle = parsed.get("subtitle") or podcast.subtitle
        podcast.link = parsed.get("link") or podcast.link
        podcast.logo_url = parsed.get("logo") or podcast.logo_url

        podcast.author = to_maxlength(
            Podcast, "author", parsed.get("author") or podcast.author
        )

        podcast.language = to_maxlength(
            Podcast, "language", parsed.get("language") or podcast.language
        )

        podcast.content_types = (
            ",".join(parsed.get("content_types", [])) or podcast.content_types
        )

        # podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])

        podcast.common_episode_title = to_maxlength(
            Podcast,
            "common_episode_title",
            parsed.get("common_title") or podcast.common_episode_title,
        )

        podcast.new_location = parsed.get("new_location") or podcast.new_location
        podcast.flattr_url = to_maxlength(
            Podcast, "flattr_url", parsed.get("flattr") or podcast.flattr_url
        )
        podcast.hub = parsed.get("hub") or podcast.hub
        podcast.license = parsed.get("license") or podcast.license
        podcast.max_episode_order = episode_updater.max_episode_order

        podcast.add_missing_urls(parsed.get("urls", []))

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
                if new_podcast != podcast:
                    self._mark_outdated(
                        podcast, "redirected to different podcast", episode_updater
                    )
                    return

            except Podcast.DoesNotExist:
                podcast.set_url(podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(
            podcast=podcast, released__isnull=False
        ).order_by("released")

        # Determine update interval

        # Update interval is based on intervals between episodes
        podcast.update_interval = episode_updater.get_update_interval(episodes)

        # factor is increased / decreased depending on whether the latest
        # update has returned episodes
        if update_result.episodes_added == 0:  # no episodes, incr factor
            newfactor = podcast.update_interval_factor * 1.2
            podcast.update_interval_factor = min(1000, newfactor)  # never above 1000
        elif update_result.episodes_added > 1:  # new episodes, decr factor
            newfactor = podcast.update_interval_factor / 1.2
            podcast.update_interval_factor = max(1, newfactor)  # never below 1
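
        # Illustrative numbers (not from the source): with a 24h
        # update_interval and factor 1.0, three consecutive updates without
        # new episodes raise the factor to 1.2 ** 3 ~= 1.73, i.e. an
        # effective interval of ~41h; an update that adds several episodes
        # divides the factor by 1.2 again.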

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        # podcast.episode_count is not updated here on purpose. It is, instead,
        # continuously updated when creating new episodes in
        # EpisodeManager.get_or_create_for_url

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = CoverArt.save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # check if search index should be considered out of date
        new_index_fields = get_index_fields(podcast)
        if list(old_index_fields.items()) != list(new_index_fields.items()):
            podcast.search_index_uptodate = False

        # The podcast is always saved (not just when there are changes) because
        # we need to record the last update
        logger.info("Saving podcast.")
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warning("subscribing to hub failed: %s", str(se))

        self.assign_slug(podcast)
        episode_updater.assign_missing_episode_slugs()
        update_related_podcasts.delay(podcast.pk)

    def assign_slug(self, podcast):
        if podcast.slug:
            return

        for slug in PodcastSlugs(podcast):
            try:
                with transaction.atomic():
                    podcast.add_slug(slug)
                break

            except Exception:
                # slug was already taken; try the next candidate
                continue

    def _update_categories(self, podcast, prev_timestamp):
        """checks some practical requirements and updates a category"""

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode since the last update
        if prev_timestamp and (podcast.latest_episode_timestamp <= prev_timestamp):
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _mark_outdated(self, podcast, msg, episode_updater):
        logger.info("marking podcast outdated: %s", msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        episode_updater.update_episodes([])


class MultiEpisodeUpdater(object):
    def __init__(self, podcast, update_result):
        self.podcast = podcast
        self.update_result = update_result
        self.updated_episodes = []
        self.max_episode_order = None

    def update_episodes(self, parsed_episodes):

        pid = self.podcast.get_id()

        episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
        logger.info(
            "Parsed %d (%d) episodes", len(parsed_episodes), len(episodes_to_update)
        )

        logger.info("Updating %d episodes", len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = self.get_episode_url(parsed)
            if not url:
                logger.info("Skipping episode %d for missing URL", n)
                continue

            logger.info("Updating episode %d / %d", n, len(parsed_episodes))

            episode, created = Episode.objects.get_or_create_for_url(self.podcast, url)
            if created:
                self.update_result.episodes_added += 1

            updater = EpisodeUpdater(episode, self.podcast)
            updater.update_episode(parsed)

            self.updated_episodes.append(episode)

        # and mark the remaining ones outdated; cap how many are loaded
        current_episodes = Episode.objects.filter(podcast=self.podcast, outdated=False)[
            :500
        ]
        outdated_episodes = set(current_episodes) - set(self.updated_episodes)

        logger.info("Marking %d episodes as outdated", len(outdated_episodes))
        for episode in outdated_episodes:
            updater = EpisodeUpdater(episode, self.podcast)
            updater.mark_outdated()

    def order_episodes(self):
        """Reorder the podcast's episodes according to release timestamp

        Returns the highest order value (corresponding to the most recent
        episode)"""

        num_episodes = self.podcast.episode_count
        if not num_episodes:
            return

        episodes = (
            self.podcast.episode_set.all()
            .extra(select={"has_released": "released IS NOT NULL"})
            .order_by("-has_released", "-released", "pk")
        )

        for n, episode in enumerate(episodes.iterator(), 1):
            # assign ``order`` from higher (most recent) to 0 (oldest)
            # None means "unknown"
            new_order = num_episodes - n

            # optimize for new episodes that are newer than all existing
            if episode.order == new_order:
                continue

            logger.info("Updating order from {} to {}".format(episode.order, new_order))
            episode.order = new_order
            episode.save()

        self.max_episode_order = num_episodes - 1
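
    # Example with assumed numbers: for episode_count == 3, the most recently
    # released episode gets order 2, the next order 1, and the oldest order 0;
    # episodes without a release date sort behind all released ones via
    # has_released.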

    def get_episode_url(self, parsed_episode):
        """returns the URL of a parsed episode"""
        for f in parsed_episode.get("files", []):
            if f.get("urls", []):
                return f["urls"][0]
        return None

    def count_episodes(self):
        return Episode.objects.filter(podcast=self.podcast).count()

    def get_update_interval(self, episodes):
        """calculates the avg interval between new episodes"""

        count = episodes.count()
        if not count:
            logger.info(
                "no episodes, using default interval of %dh", DEFAULT_UPDATE_INTERVAL
            )
            return DEFAULT_UPDATE_INTERVAL

        earliest = episodes.first()
        now = datetime.utcnow()

        timespan_s = (now - earliest.released).total_seconds()
        timespan_h = timespan_s / 60 / 60

        interval = int(timespan_h / count)
        logger.info(
            "%d episodes in %d days => %dh interval", count, timespan_h / 24, interval
        )

        # place interval between {MIN,MAX}_UPDATE_INTERVAL
        interval = max(interval, MIN_UPDATE_INTERVAL)
        interval = min(interval, MAX_UPDATE_INTERVAL)

        return interval
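
    # Worked example with assumed numbers: 60 episodes whose earliest release
    # was 180 days ago give timespan_h = 180 * 24 = 4320, so the raw interval
    # is int(4320 / 60) = 72 hours, which is then clamped to
    # [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].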

    def assign_missing_episode_slugs(self):
        common_title = self.podcast.get_common_episode_title()

        episodes = Episode.objects.filter(podcast=self.podcast, slugs__isnull=True)

        for episode in episodes:

            for slug in EpisodeSlugs(episode, common_title):
                try:
                    with transaction.atomic():
                        episode.set_slug(slug)
                    break

                except Exception:
                    # slug was already taken; try the next candidate
                    continue


class EpisodeUpdater(object):
    """Updates an individual episode"""

    def __init__(self, episode, podcast):
        self.episode = episode
        self.podcast = podcast

    def update_episode(self, parsed_episode):
        """updates "episode" with the data from "parsed_episode" """

        # TODO: check if there have been any changes, to
        # avoid unnecessary updates
        self.episode.guid = to_maxlength(
            Episode, "guid", parsed_episode.get("guid") or self.episode.guid
        )

        self.episode.description = (
            parsed_episode.get("description") or self.episode.description
        )

        self.episode.subtitle = parsed_episode.get("subtitle") or self.episode.subtitle

        self.episode.content = (
            parsed_episode.get("content")
            or parsed_episode.get("description")
            or self.episode.content
        )

        self.episode.link = to_maxlength(
            Episode, "link", parsed_episode.get("link") or self.episode.link
        )

        self.episode.released = (
            datetime.utcfromtimestamp(parsed_episode.get("released"))
            if parsed_episode.get("released")
            else self.episode.released
        )

        self.episode.author = to_maxlength(
            Episode, "author", parsed_episode.get("author") or self.episode.author
        )

        self.episode.duration = parsed_episode.get("duration") or self.episode.duration

        self.episode.filesize = parsed_episode["files"][0]["filesize"]

        self.episode.language = (
            parsed_episode.get("language")
            or self.episode.language
            or self.podcast.language
        )

        mimetypes = [f["mimetype"] for f in parsed_episode.get("files", [])]
        self.episode.mimetypes = ",".join(list(set(filter(None, mimetypes))))

        self.episode.flattr_url = to_maxlength(
            Episode,
            "flattr_url",
            parsed_episode.get("flattr") or self.episode.flattr_url,
        )

        self.episode.license = parsed_episode.get("license") or self.episode.license

        self.episode.title = to_maxlength(
            Episode,
            "title",
            parsed_episode.get("title")
            or self.episode.title
            or file_basename_no_extension(self.episode.url),
        )

        self.episode.last_update = datetime.utcnow()
        self.episode.save()

        parsed_urls = list(
            chain.from_iterable(
                f.get("urls", []) for f in parsed_episode.get("files", [])
            )
        )
        self.episode.add_missing_urls(parsed_urls)

    def mark_outdated(self):
        """marks the episode outdated if it's not already"""
        if self.episode.outdated:
            return

        self.episode.outdated = True
        self.episode.last_update = datetime.utcnow()
        self.episode.save()


def file_basename_no_extension(filename):
    """Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name


def verify_podcast_url(url):
    updater = PodcastUpdater(url)
    parsed = updater._fetch_feed()
    updater._validate_parsed(parsed)
    return True