# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import copy
import hashlib
import httplib
import logging
import os.path
import urllib2
from datetime import datetime, timedelta
from itertools import chain

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from mygpo.utils import file_hash, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_current
from mygpo.db.couchdb.podcast import podcast_for_url, reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database, bulk_save_retry

logger = logging.getLogger(__name__)


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        # the main database is used for bulk-saving episode changes
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)

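    # A minimal usage sketch (the feed URL is hypothetical):
    #
    #   updater = PodcastUpdater()
    #   for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #       ...
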
    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException) as ex:

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)

            # if it exists already, we mark it as outdated
            if p:
                self._mark_outdated(p, 'error while fetching feed: %s' %
                        str(ex))
                return

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())

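    # parse_feed and ConvertMarkdown come from the external feedservice
    # package; the text processor presumably converts the feed's HTML
    # descriptions to Markdown before they are stored.
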
    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed """

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # copy the json so we can determine if there have been any changes
        old_json = copy.deepcopy(podcast.to_json())

        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        # if the feed reports a new location, either claim the new URL for
        # this podcast, or mark this podcast as outdated when the new URL
        # already belongs to a different podcast
        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            if not new_podcast:
                podcast.urls.insert(0, podcast.new_location)

            elif new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

        logger.info('Retrieved %d episodes in total', len(episodes))

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)

        # guard against feeds where no episode carries a release date
        if eps:
            podcast.latest_episode_timestamp = eps[-1].released

        podcast.episode_count = len(eps)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # save the podcast only if it actually changed
        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()

        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode since the last update
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # list of (obj, fun) where fun is the function to update obj
        changes = []

        logger.info('Parsed %d episodes', len(parsed_episodes))

        for n, parsed in enumerate(parsed_episodes, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))
            episode = episode_for_podcast_id_url(pid, url, create=True)

            update_episode = get_episode_update_function(parsed, episode)
            changes.append((episode, update_episode))

        # determine which episodes have been found
        updated_episodes = [e for (e, f) in changes]
        logger.info('Updating %d episodes with new data', len(updated_episodes))

        # and mark the remaining ones outdated
        current_episodes = set(episodes_for_podcast_current(podcast, limit=100))
        outdated_episodes = current_episodes - set(updated_episodes)
        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        changes.extend((e, mark_outdated) for e in outdated_episodes)

        logger.info('Saving %d changes', len(changes))
        bulk_save_retry(changes, self.db)

        return updated_episodes

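    # bulk_save_retry consumes the (obj, fun) pairs queued above; an update
    # function returns the modified object, or None when nothing changed and
    # no write is needed (see update_episode and mark_outdated below, and the
    # sketch at the end of this module).
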
    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save the new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed (assumption: thumbnails are
            # plain files that can be removed with os.unlink)
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for thumbnail in thumbnails:
                    os.unlink(thumbnail)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while updating podcast: %s', str(e))

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None


def get_episode_update_function(parsed_episode, episode):
    """ returns an update function that can be passed to bulk_save_retry """

    def update_episode(episode):
        """ updates "episode" with the data from "parsed_episode" """

        # copy the json so we can determine if there have been any changes
        old_json = copy.deepcopy(episode.to_json())

        episode.guid = parsed_episode.guid or episode.guid
        episode.title = parsed_episode.title or episode.title
        episode.description = parsed_episode.description or episode.description
        episode.subtitle = parsed_episode.subtitle or episode.subtitle
        episode.content = parsed_episode.content or \
            parsed_episode.description or episode.content
        episode.link = parsed_episode.link or episode.link
        episode.released = datetime.utcfromtimestamp(parsed_episode.released) \
            if parsed_episode.released else episode.released
        episode.author = parsed_episode.author or episode.author
        episode.duration = parsed_episode.duration or episode.duration
        episode.filesize = parsed_episode.files[0].filesize
        episode.language = parsed_episode.language or episode.language
        episode.mimetypes = list(set(filter(None,
            [f.mimetype for f in parsed_episode.files])))
        episode.flattr_url = parsed_episode.flattr or episode.flattr_url
        episode.license = parsed_episode.license or episode.license

        urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
        episode.urls = sorted(set(episode.urls + urls), key=len)

        # if nothing changed we return None to indicate no required action
        if deep_eq(old_json, episode.to_json()):
            return None

        # set last_update only if something actually changed above
        episode.last_update = datetime.utcnow()
        return episode

    return update_episode


def mark_outdated(obj):
    """ marks obj outdated if it's not already """

    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    return obj


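# A sketch of the contract between the update functions above and
# bulk_save_retry (illustrative only; the real implementation lives in
# mygpo.db.couchdb and additionally retries on save conflicts):
#
#   for obj, fun in changes:
#       updated = fun(obj)
#       if updated is not None:
#           db.save_doc(updated.to_json())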