2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
import hashlib
import httplib
import logging
import os
import socket
import urllib2

from datetime import datetime
from itertools import chain, islice

from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.utils import file_hash
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from feedservice.parse.vimeo import VimeoError
45 logger
= logging
.getLogger(__name__
)
47 MAX_EPISODES_UPDATE
=200
49 class NoPodcastCreated(Exception):
50 """ raised when no podcast obj was created for a new URL """
53 class NoEpisodesException(Exception):
54 """ raised when parsing something that doesn't contain any episodes """
class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable

        Yields the updated Podcast object for each URL; URLs for which no
        podcast could be created are logged and skipped. """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL

        Returns the updated Podcast object.
        Raises NoPodcastCreated if the feed could not be fetched/parsed and
        no podcast exists for the URL yet. """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error,
                urllib2.HTTPError) as ex:
            # TODO: catch ValueError (for invalid IPv6 in feedservice)

            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=podcast_url)
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            except Podcast.DoesNotExist:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = Podcast.objects.get_or_create_for_url(podcast_url)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        """ Fetch and validate the feed; raises on any problem """
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        """ Fetch and parse the feed with a 10s socket timeout

        BUG FIX: the timeout restore used to sit *after* the return
        statement and was therefore unreachable, permanently clobbering
        the process-wide default socket timeout; try/finally guarantees
        the previous default is restored. """
        t = socket.getdefaulttimeout()
        socket.setdefaulttimeout(10)
        try:
            return parse_feed(podcast_url, text_processor=ConvertMarkdown())
        finally:
            socket.setdefaulttimeout(t)

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # for each field, keep the previous value when the feed provides none
        podcast.title = parsed.title or podcast.title
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        podcast.add_missing_urls(parsed.urls)

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(
                    urls__url=podcast.new_location)
                # redirect to a *different* podcast: this one is obsolete
                if new_podcast != podcast:
                    self._mark_outdated(podcast,
                                        'redirected to different podcast')
                    return

            except Podcast.DoesNotExist:
                # location is not known yet: adopt it as primary URL
                podcast.urls.insert(0, podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(podcast=podcast,
                                          released__isnull=False)\
                                  .order_by('released')

        podcast.update_interval = get_update_interval(episodes)

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        podcast.episode_count = Episode.objects.filter(
            podcast=podcast).count()

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            # best-effort: a failed hub subscription must not abort the update
            logger.warn('subscribing to hub failed: %s', str(se))

        if not podcast.slug:
            slug = PodcastSlug(podcast).get_slug()
            if slug:
                podcast.add_slug(slug)

        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        from datetime import timedelta

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode since the last update
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):
        """ Update the podcast's episodes from the parsed feed entries

        Creates/updates an Episode per parsed entry (capped at
        MAX_EPISODES_UPDATE), marks episodes that are no longer in the feed
        as outdated, and returns the list of updated Episode objects. """

        updated_episodes = []
        episodes_to_update = list(islice(parsed_episodes, 0,
                                         MAX_EPISODES_UPDATE))
        logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                    len(episodes_to_update))

        logger.info('Updating %d episodes', len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if url is None:
                # without a media URL we cannot identify the episode
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))
            episode = Episode.objects.get_or_create_for_url(podcast, url)

            update_episode(parsed, episode, podcast)
            updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=podcast,
                                                  outdated=False)[:500]
        outdated_episodes = set(current_episodes) - set(updated_episodes)

        logger.info('Marking %d episodes as outdated',
                    len(outdated_episodes))
        for episode in outdated_episodes:
            mark_outdated(episode)

        return updated_episodes

    def _save_podcast_logo(self, cover_art):
        """ Download the podcast logo and invalidate stale thumbnails

        Returns the cover_art URL on success, None on any (logged) failure
        so the caller can reset the podcast's logo_url. """
        if not cover_art:
            return None

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save the new logo
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix,
                                                              filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for thumbnail in thumbnails:
                    os.unlink(thumbnail)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error, IOError) as e:
            # best-effort: a broken logo must not abort the feed update
            logger.warn('Exception while updating podcast logo: %s', str(e))

    def _mark_outdated(self, podcast, msg=''):
        """ Mark the podcast as outdated and outdate all its episodes """
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        # an empty episode list outdates all existing episodes
        self._update_episodes(podcast, [])
def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode

    Scans the episode's files and returns the first URL found, or None if
    no file carries a URL (the caller skips such episodes). """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode"

    For most fields the parsed value wins and the previous value is kept as
    fallback; the episode is saved afterwards and any URLs from the parsed
    files are merged in. """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = parsed_episode.guid or episode.guid
    episode.description = parsed_episode.description or episode.description
    episode.subtitle = parsed_episode.subtitle or episode.subtitle
    episode.content = parsed_episode.content or \
        parsed_episode.description or episode.content
    episode.link = parsed_episode.link or episode.link
    # parsed timestamps are unix epoch seconds
    episode.released = datetime.utcfromtimestamp(parsed_episode.released) \
        if parsed_episode.released else episode.released
    episode.author = parsed_episode.author or episode.author
    episode.duration = parsed_episode.duration or episode.duration
    episode.filesize = parsed_episode.files[0].filesize
    # fall back to the podcast's language when neither feed nor episode
    # specify one
    episode.language = parsed_episode.language or episode.language or \
        podcast.language
    episode.mimetypes = list(set(filter(None,
        [f.mimetype for f in parsed_episode.files])))
    episode.flattr_url = parsed_episode.flattr or episode.flattr_url
    episode.license = parsed_episode.license or episode.license

    # last resort: derive a title from the media file's basename
    episode.title = parsed_episode.title or episode.title or \
        file_basename_no_extension(episode.url)

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.urls for f in parsed_episode.files))
    episode.add_missing_urls(parsed_urls)
def mark_outdated(obj):
    """ marks obj outdated if its not already

    No-op when the object is already outdated, so it is not re-saved. """

    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()
def get_update_interval(episodes):
    """ calculates the avg interval between new episodes

    episodes must be ordered by release date (oldest first); the interval
    is the timespan from the oldest episode until now, divided by the
    number of episodes, clamped to [MIN, MAX]_UPDATE_INTERVAL (hours). """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name