# -*- coding: utf-8 -*-

import hashlib
import http.client
import logging
import os.path
import socket
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timedelta
from itertools import chain, islice
from urllib.parse import urljoin

import requests

from django.conf import settings
from django.db import transaction

from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.directory.tags import update_category
from mygpo.podcasts.models import Podcast, Episode
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.pubsub.models import SubscriptionError
from mygpo.search import get_index_fields
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
# module-level logger for feed updates
logger = logging.getLogger(__name__)

# upper bound on the number of episodes processed per podcast update
MAX_EPISODES_UPDATE = 200
class UpdatePodcastException(Exception):
    """ raised when updating a podcast fails """
class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """
class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """
def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable

    Yields the result of updating each podcast. Empty URLs are skipped;
    errors for individual podcasts are logged and do not abort the run. """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        if not podcast_url:
            # logger.warn() is deprecated since Python 3.3
            logger.warning('Podcast URL empty, skipping')
            continue

        try:
            updater = PodcastUpdater(podcast_url)
            yield updater.update_podcast()

        except NoPodcastCreated as npc:
            # expected condition for unparseable new URLs; not an error
            logger.info('No podcast created: %s', npc)

        except Exception:
            # was a bare ``except:``, which would also swallow
            # SystemExit/KeyboardInterrupt (and GeneratorExit)
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
class PodcastUpdater(object):
    """ Updates the podcast specified by the podcast_url """

    def __init__(self, podcast_url):
        self.podcast_url = podcast_url

    def update_podcast(self):
        """ Update the podcast """

        # PodcastUpdateResult records the outcome of this update run
        # NOTE(review): ``models`` must be imported at module level —
        # confirm the import survives in the full file
        with models.PodcastUpdateResult(podcast_url=self.podcast_url) as res:

            parsed, podcast, created = self.parse_feed()

            if not podcast:
                res.podcast_created = False
                res.error_message = '"{}" could not be parsed'.format(
                    self.podcast_url)

                return

            res.podcast = podcast
            res.podcast_created = created

            res.episodes_added = 0
            episode_updater = MultiEpisodeUpdater(podcast, res)

            if not parsed:
                # if it exists already, we mark it as outdated
                self._mark_outdated(podcast,
                                    'error while fetching feed',
                                    episode_updater)
                return

            episode_updater.update_episodes(parsed.get('episodes', []))

            # reload, as the episode updates may have touched the row
            podcast.refresh_from_db()
            podcast.episode_count = episode_updater.count_episodes()
            podcast.save()

            episode_updater.order_episodes()

            self._update_podcast(podcast, parsed, episode_updater)

            return podcast

    def parse_feed(self):
        """ Fetch and parse the feed

        Returns a (parsed, podcast, created) tuple; ``parsed`` is None if
        the feed could not be fetched but the podcast already exists.
        Raises NoPodcastCreated when parsing fails for an unknown URL. """
        try:
            parsed = self._fetch_feed()
            self._validate_parsed(parsed)

        except (requests.exceptions.RequestException,
                NoEpisodesException) as ex:
            logging.exception('Error while fetching/parsing feed')

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=self.podcast_url)
                return (None, p, False)

            except Podcast.DoesNotExist as pdne:
                raise NoPodcastCreated(ex) from pdne

        # Parsing went well, get podcast
        podcast, created = Podcast.objects.get_or_create_for_url(
            self.podcast_url)

        return (parsed, podcast, created)

    def _fetch_feed(self):
        """ Retrieve the parsed feed from the feed service

        Returns the parsed-feed dict, or None on a non-200 response.
        Re-raises ValueError if the response body is not valid JSON. """
        params = {
            'url': self.podcast_url,
            'process_text': 'markdown',
        }
        headers = {
            'Accept': 'application/json',
        }
        url = urljoin(settings.FEEDSERVICE_URL, 'parse')
        r = requests.get(url, params=params, headers=headers, timeout=30)

        if r.status_code != 200:
            # BUG FIX: was the undefined name ``podcast_url`` (NameError)
            logger.error('Feed-service status code for "{}" was {}'.format(
                self.podcast_url, r.status_code))
            return None

        try:
            return r.json()[0]
        except ValueError:
            # BUG FIX: was the undefined name ``podcast_url`` here as well
            logger.exception(
                'Feed-service error while parsing response for url "%s": %s',
                self.podcast_url, r.text,
            )
            raise

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.get('episodes', []):
            raise NoEpisodesException('no episodes found')

    def _update_podcast(self, podcast, parsed, episode_updater):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # will later be used to see whether the index is outdated
        old_index_fields = get_index_fields(podcast)

        # fall back to the stored value whenever the feed omits a field
        podcast.title = parsed.get('title') or podcast.title
        podcast.description = parsed.get('description') or podcast.description
        podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
        podcast.link = parsed.get('link') or podcast.link
        podcast.logo_url = parsed.get('logo') or podcast.logo_url

        podcast.author = to_maxlength(
            Podcast, 'author',
            parsed.get('author') or podcast.author)

        podcast.language = to_maxlength(
            Podcast, 'language',
            parsed.get('language') or podcast.language)

        podcast.content_types = (','.join(parsed.get('content_types')) or
                                 podcast.content_types)

        # podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])

        podcast.common_episode_title = to_maxlength(
            Podcast,
            'common_episode_title',
            parsed.get('common_title') or podcast.common_episode_title)

        podcast.new_location = (parsed.get('new_location') or
                                podcast.new_location)
        podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                          parsed.get('flattr') or
                                          podcast.flattr_url)
        podcast.hub = parsed.get('hub') or podcast.hub
        podcast.license = parsed.get('license') or podcast.license
        podcast.max_episode_order = episode_updater.max_episode_order

        podcast.add_missing_urls(parsed.get('urls', []))

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(
                    urls__url=podcast.new_location
                )

                if new_podcast != podcast:
                    self._mark_outdated(podcast,
                                        'redirected to different podcast',
                                        episode_updater)
                    return

            except Podcast.DoesNotExist:
                podcast.set_url(podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(podcast=podcast,
                                          released__isnull=False)\
                                  .order_by('released')

        # Determine update interval

        # Update interval is based on intervals between episodes
        podcast.update_interval = episode_updater.get_update_interval(episodes)

        # factor is increased / decreased depending on whether the latest
        # update has returned episodes
        if episode_updater.episodes_added == 0:  # no episodes, incr factor
            podcast.update_interval_factor *= 1.2
        elif episode_updater.episodes_added > 1:  # new episodes, decr factor
            newfactor = podcast.update_interval_factor / 1.2
            podcast.update_interval_factor = max(1, newfactor)  # never below 1

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        # podcast.episode_count is not update here on purpose. It is, instead,
        # continuously updated when creating new episodes in
        # EpisodeManager.get_or_create_for_url

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # check if search index should be considered out of date
        new_index_fields = get_index_fields(podcast)
        if list(old_index_fields.items()) != list(new_index_fields.items()):
            podcast.search_index_uptodate = False

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            # hub subscription is best-effort; warn() is deprecated
            logger.warning('subscribing to hub failed: %s', str(se))

        self.assign_slug(podcast)
        episode_updater.assign_missing_episode_slugs()
        update_related_podcasts.delay(podcast.pk)

    def assign_slug(self, podcast):
        """ Assign the first slug candidate that can be stored """
        if podcast.slug:
            return

        for slug in PodcastSlugs(podcast):
            try:
                with transaction.atomic():
                    podcast.add_slug(slug)

                break

            # best-effort: try the next candidate on any failure
            # (was a bare ``except:``)
            except Exception:
                continue

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
           (podcast.latest_episode_timestamp <= prev_timestamp):
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _save_podcast_logo(self, cover_art):
        """ Download and store the podcast logo

        Returns ``cover_art`` on success, None on failure (so the caller
        can reset the stored logo URL). """
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            # NOTE(review): ``dirname`` is unused here; kept because
            # CoverArt.get_dir may create the directory — confirm
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'wb') as fp:
                fp.write(urllib.request.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
                http.client.HTTPException, socket.error, IOError) as e:
            # logo download is best-effort; warn() is deprecated
            logger.warning('Exception while updating podcast logo: %s', str(e))

    def _mark_outdated(self, podcast, msg, episode_updater):
        """ Mark the podcast (and all its episodes) outdated """
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        # an empty update marks all existing episodes outdated
        episode_updater.update_episodes([])
class MultiEpisodeUpdater(object):
    """ Updates all episodes of a podcast

    Processes at most MAX_EPISODES_UPDATE parsed episodes per run and
    marks episodes that are no longer present in the feed as outdated. """

    def __init__(self, podcast, update_result):
        self.podcast = podcast
        self.update_result = update_result
        # episodes seen in the current feed (kept to detect removed ones)
        self.updated_episodes = []
        # set by order_episodes(); None means "not (re)ordered yet"
        self.max_episode_order = None

    def update_episodes(self, parsed_episodes):
        """ Create/update episodes from the parsed feed entries

        ``parsed_episodes`` must be a sequence (``len()`` is used on it).
        Entries without a file URL are skipped. """

        # removed unused local ``pid = self.podcast.get_id()``

        episodes_to_update = list(islice(parsed_episodes, 0,
                                         MAX_EPISODES_UPDATE))
        logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                    len(episodes_to_update))

        logger.info('Updating %d episodes', len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = self.get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))

            episode, created = Episode.objects.get_or_create_for_url(
                self.podcast, url)

            if created:
                self.update_result.episodes_added += 1

            updater = EpisodeUpdater(episode, self.podcast)
            updater.update_episode(parsed)

            self.updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=self.podcast,
                                                  outdated=False)[:500]
        outdated_episodes = set(current_episodes) - set(self.updated_episodes)

        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        for episode in outdated_episodes:
            updater = EpisodeUpdater(episode, self.podcast)
            updater.mark_outdated()

    def order_episodes(self):
        """ Reorder the podcast's episode according to release timestamp

        Returns the highest order value (corresponding to the most recent
        episode) """

        num_episodes = self.podcast.episode_count
        if not num_episodes:
            return

        episodes = self.podcast.episode_set.all().extra(select={
            'has_released': 'released IS NOT NULL',
        }).order_by('-has_released', '-released', 'pk')

        for n, episode in enumerate(episodes.iterator(), 1):
            # assign ``order`` from higher (most recent) to 0 (oldest)
            # None means "unknown"
            new_order = num_episodes - n

            # optimize for new episodes that are newer than all existing
            if episode.order == new_order:
                continue

            logger.info('Updating order from {} to {}'.format(episode.order,
                                                              new_order))
            episode.order = new_order
            episode.save()

        self.max_episode_order = num_episodes - 1

    def get_episode_url(self, parsed_episode):
        """ returns the URL of a parsed episode """
        for f in parsed_episode.get('files', []):
            if f.get('urls', []):
                return f['urls'][0]
        return None

    def count_episodes(self):
        """ Number of episodes currently stored for the podcast """
        return Episode.objects.filter(podcast=self.podcast).count()

    def get_update_interval(self, episodes):
        """ calculates the avg interval between new episodes """

        count = episodes.count()
        if not count:
            logger.info('no episodes, using default interval of %dh',
                        DEFAULT_UPDATE_INTERVAL)
            return DEFAULT_UPDATE_INTERVAL

        earliest = episodes.first()
        now = datetime.utcnow()

        timespan_s = (now - earliest.released).total_seconds()
        timespan_h = timespan_s / 60 / 60

        interval = int(timespan_h / count)
        logger.info('%d episodes in %d days => %dh interval', count,
                    timespan_h / 24, interval)

        # place interval between {MIN,MAX}_UPDATE_INTERVAL
        interval = max(interval, MIN_UPDATE_INTERVAL)
        interval = min(interval, MAX_UPDATE_INTERVAL)

        return interval

    def assign_missing_episode_slugs(self):
        """ Assign a slug to every episode that does not have one yet """
        common_title = self.podcast.get_common_episode_title()

        episodes = Episode.objects.filter(podcast=self.podcast,
                                          slugs__isnull=True)

        for episode in episodes:

            for slug in EpisodeSlugs(episode, common_title):
                try:
                    with transaction.atomic():
                        episode.set_slug(slug)

                    break

                # best-effort: try the next candidate on any failure
                # (was a bare ``except:``)
                except Exception:
                    continue
class EpisodeUpdater(object):
    """ Updates an individual episode """

    def __init__(self, episode, podcast):
        self.episode = episode
        self.podcast = podcast

    def update_episode(self, parsed_episode):
        """ updates "episode" with the data from "parsed_episode" """

        # TODO: check if there have been any changes, to
        # avoid unnecessary updates
        self.episode.guid = to_maxlength(
            Episode, 'guid',
            parsed_episode.get('guid') or self.episode.guid)

        self.episode.description = (parsed_episode.get('description') or
                                    self.episode.description)

        self.episode.subtitle = (parsed_episode.get('subtitle') or
                                 self.episode.subtitle)

        self.episode.content = (parsed_episode.get('content') or
                                parsed_episode.get('description') or
                                self.episode.content)

        self.episode.link = to_maxlength(
            Episode, 'link',
            parsed_episode.get('link') or self.episode.link)

        self.episode.released = (datetime.utcfromtimestamp(
            parsed_episode.get('released')) if parsed_episode.get('released')
            else self.episode.released)

        self.episode.author = to_maxlength(
            Episode, 'author',
            parsed_episode.get('author') or self.episode.author)

        self.episode.duration = (parsed_episode.get('duration') or
                                 self.episode.duration)

        # BUG FIX: was ``parsed_episode['files'][0]['filesize']`` which
        # raised IndexError/KeyError for entries without file data
        files = parsed_episode.get('files', [])
        if files:
            self.episode.filesize = (files[0].get('filesize') or
                                     self.episode.filesize)

        self.episode.language = (parsed_episode.get('language') or
                                 self.episode.language or
                                 self.podcast.language)

        mimetypes = [f['mimetype'] for f in parsed_episode.get('files', [])]
        self.episode.mimetypes = ','.join(list(set(filter(None, mimetypes))))

        self.episode.flattr_url = to_maxlength(
            Episode, 'flattr_url',
            parsed_episode.get('flattr') or self.episode.flattr_url)

        self.episode.license = (parsed_episode.get('license') or
                                self.episode.license)

        self.episode.title = to_maxlength(
            Episode, 'title',
            parsed_episode.get('title') or self.episode.title or
            file_basename_no_extension(self.episode.url))

        self.episode.last_update = datetime.utcnow()
        self.episode.save()

        parsed_urls = list(chain.from_iterable(
            f.get('urls', []) for f in parsed_episode.get('files', [])))
        self.episode.add_missing_urls(parsed_urls)

    def mark_outdated(self):
        """ marks the episode outdated if its not already """
        if self.episode.outdated:
            return None

        self.episode.outdated = True
        self.episode.last_update = datetime.utcnow()
        self.episode.save()
def file_basename_no_extension(filename):
    """ Return the basename of *filename* with its extension stripped

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    return os.path.splitext(os.path.basename(filename))[0]
def verify_podcast_url(self):
    # NOTE(review): this def takes ``self`` yet calls a module-level
    # ``_fetch_feed`` that does not exist (fetching lives on
    # PodcastUpdater._fetch_feed). It looks like a leftover from a
    # pre-class refactoring — confirm whether it is still called and
    # whether it should be a PodcastUpdater method using
    # ``self._fetch_feed()``.
    parsed = _fetch_feed(self.podcast_url)
    self._validate_parsed(parsed)