2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
import hashlib
import http.client
import logging
import os.path
import socket
import urllib.request, urllib.error, urllib.parse
from urllib.parse import urljoin
from datetime import datetime, timedelta
from itertools import chain, islice

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, Episode
from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category
# Module-level logger for feed-update diagnostics.
logger = logging.getLogger(__name__)

# Upper bound on how many parsed episodes are processed per podcast update,
# so one huge feed cannot monopolize an update run.
MAX_EPISODES_UPDATE = 200
class UpdatePodcastException(Exception):
    """ generic error while updating a podcast """
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """
def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable

    Yields the updated podcast object for each URL. Failures for
    individual URLs are logged and skipped so that one broken feed
    does not abort the whole batch. """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            # expected outcome for unparseable URLs -- informational only
            logger.info('No podcast created: %s', npc)

        except Exception:
            # catch-all (narrowed from a bare ``except:``) so a single
            # failing feed does not stop the remaining updates
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL

    Fetches the feed via the feed service, creates/updates the Podcast
    and its episodes, and returns the podcast object. Raises
    NoPodcastCreated when the URL cannot be parsed and no podcast
    exists for it yet. """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        logging.exception('Error while fetching response from feedservice')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(re))
            p.last_update = datetime.utcnow()
            p.save()
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(re)

    except NoEpisodesException as nee:
        logging.warning('No episode found while parsing podcast')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    # keep the stored count in sync with the episodes just created
    p.episode_count = Episode.objects.filter(podcast=p).count()
    p.save()
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p
def verify_podcast_url(podcast_url):
    """ Verify that the URL points to a parseable podcast feed

    Fetches and validates the feed; propagates the fetch/validation
    exceptions on failure, returns True on success. """
    parsed = _fetch_feed(podcast_url)
    _validate_parsed(parsed)
    return True
def _fetch_feed(podcast_url):
    """ Fetch parsed feed data for the URL from the feed service

    Returns the parsed feed dict, or None if the feed service did not
    answer with status 200. Raises on invalid JSON responses. """
    params = {
        'url': podcast_url,
        'process_text': 'markdown',
    }
    headers = {
        'Accept': 'application/json',
    }
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)

    if r.status_code != 200:
        logger.error('Feed-service status code for "%s" was %s', podcast_url,
                     r.status_code)
        return None

    try:
        # the feed service returns a list of parsed feeds; we requested one
        return r.json()[0]
    except ValueError as e:
        logger.exception(
            'Feed-service error while parsing response for url "%s": %s',
            podcast_url, e)
        raise
159 def _validate_parsed(parsed
):
160 """ validates the parsed results and raises an exception if invalid
162 feedparser parses pretty much everything. We reject anything that
163 doesn't look like a feed"""
165 if not parsed
or not parsed
.get('episodes', []):
166 raise NoEpisodesException('no episodes found')
def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results

    Copies metadata from the parsed feed onto the podcast (keeping the
    previous value when the feed omits a field), handles redirects,
    recomputes the update interval, refreshes category/logo/slug data
    and always saves the podcast to record the update time. """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    podcast.content_types = ','.join(parsed.get('content_types')) or \
        podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                # feed moved to a URL owned by another podcast object
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warning('subscribing to hub failed: %s', str(se))

    assign_slug(podcast)
    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)
def assign_slug(podcast):
    """ Assign the first available slug candidate to the podcast

    No-op if the podcast already has a slug. Slug uniqueness is
    enforced by the database, so each candidate is tried in its own
    transaction and the first one that sticks wins. """
    if podcast.slug:
        return

    for slug in PodcastSlugs(podcast):
        try:
            with transaction.atomic():
                podcast.add_slug(slug)
            break

        except Exception:
            # slug already taken -- try the next candidate
            continue
def assign_missing_episode_slugs(podcast):
    """ Assign slugs to all of the podcast's episodes that lack one

    Candidates are derived from the podcast's common episode title;
    collisions are retried with the next candidate per episode. """
    common_title = podcast.get_common_episode_title()

    episodes = Episode.objects.filter(podcast=podcast, slugs__isnull=True)

    for episode in episodes:

        for slug in EpisodeSlugs(episode, common_title):
            try:
                with transaction.atomic():
                    episode.set_slug(slug)
                break

            except Exception:
                # slug already taken -- try the next candidate
                continue
def _update_categories(podcast, prev_timestamp):
    """ checks some practical requirements and updates a category

    Only bumps the podcast's category when there is a new episode whose
    timestamp is plausible (not more than a day in the future) and the
    podcast has enough subscribers. """

    max_timestamp = datetime.utcnow() + timedelta(days=1)

    # no episodes at all
    if not podcast.latest_episode_timestamp:
        return

    # no new episode since the last update
    if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
        return

    # too far in the future
    if podcast.latest_episode_timestamp > max_timestamp:
        return

    # not enough subscribers
    if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
        return

    update_category(podcast)
def _update_episodes(podcast, parsed_episodes):
    """ Create/update episodes from parsed feed data

    Processes at most MAX_EPISODES_UPDATE episodes, skips entries
    without a usable URL, marks no-longer-present episodes as outdated,
    and returns the list of updated episode objects. """

    pid = podcast.get_id()

    # list of (obj, fun) where fun is the function to update obj
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(parsed_episodes))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

    return updated_episodes
def _order_episodes(podcast):
    """ Reorder the podcast's episode according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode). """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
        })\
        .order_by('-has_released', '-released', 'pk')\
        .only('pk', 'released')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1
def _save_podcast_logo(cover_art):
    """ Download and store the podcast logo

    Returns the logo URL on success, or None when there is no logo or
    downloading fails (callers use this to reset ``logo_url``). When
    the image content changed, existing thumbnails are removed. """
    if not cover_art:
        return

    try:
        image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                old_hash = file_hash(f).digest()
        else:
            old_hash = ''

        logger.info('Logo %s', cover_art)

        # save the new logo to disk
        with open(filename, 'wb') as fp:
            fp.write(urllib.request.urlopen(cover_art).read())

        # get hash of new file
        with open(filename, 'rb') as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for f in thumbnails:
                os.unlink(f)

        return cover_art

    except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
            http.client.HTTPException, socket.error, IOError) as e:
        logger.warning('Exception while updating podcast logo: %s', str(e))
def _mark_outdated(podcast, msg=''):
    """ Mark the podcast as outdated and outdate all of its episodes

    ``msg`` is only used for logging the reason. """
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
    # updating with an empty episode list outdates all current episodes
    _update_episodes(podcast, [])
def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode

    Picks the first URL of the first file that has any URLs; returns
    None when the episode has no usable URL. """
    for f in parsed_episode.get('files', []):
        if f.get('urls', []):
            return f['urls'][0]
    return None
def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode"

    Fields missing from the parsed data keep their previous value;
    length-limited fields are truncated via to_maxlength. The episode
    is saved before its alternative URLs are merged in. """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    episode.filesize = parsed_episode['files'][0]['filesize']
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    episode.mimetypes = ','.join(list(set(
        filter(None, [f['mimetype'] for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)
def mark_outdated(obj):
    """ marks obj outdated if its not already

    Returns None early when the object is already outdated; otherwise
    sets the flag, stamps last_update and saves. """

    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()
def get_update_interval(episodes):
    """ calculates the avg interval between new episodes

    ``episodes`` must be ordered by release date (oldest first). The
    result in hours is clamped to [MIN_UPDATE_INTERVAL,
    MAX_UPDATE_INTERVAL]; with no episodes the default is used. """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name