# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#
from datetime import datetime
from itertools import chain
import os
import hashlib
import urllib2
import httplib

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
         PodcastSlug
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from mygpo.utils import file_hash, split_list
from mygpo.web.logo import CoverArt
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
         episodes_for_podcast_uncached
from mygpo.db.couchdb.podcast import podcast_for_url
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database
class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """
class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """
52 class PodcastUpdater(object):
53 """ Updates a number of podcasts with data from their feeds """
56 """ Queue is an iterable of podcast objects """
57 self
.db
= get_main_database()
60 def update_queue(self
, queue
):
61 """ Fetch data for the URLs supplied as the queue iterable """
63 for n
, podcast_url
in enumerate(queue
):
66 yield self
.update(podcast_url
)
68 except NoPodcastCreated
as npc
:
69 print 'no podcast created:', npc
74 def update(self
, podcast_url
):
75 """ Update the podcast for the supplied URL """
78 parsed
= self
._fetch
_feed
(podcast_url
)
79 self
._validate
_parsed
(parsed
)
81 except (ParserException
, FetchFeedException
, NoEpisodesException
) as ex
:
83 # if we fail to parse the URL, we don't even create the
85 p
= podcast_for_url(podcast_url
, create
=False)
87 # if it exists already, we mark it as outdated
88 self
._mark
_outdated
(p
)
92 raise NoPodcastCreated(ex
)
94 assert parsed
, 'fetch_feed must return something'
95 p
= podcast_for_url(podcast_url
, create
=True)
96 self
._update
_podcast
(p
, parsed
)
100 def verify_podcast_url(self
, podcast_url
):
101 parsed
= self
._fetch
_feed
(podcast_url
)
102 self
._validate
_parsed
(parsed
)
106 def _fetch_feed(self
, podcast_url
):
107 return parse_feed(podcast_url
, text_processor
=ConvertMarkdown())
111 def _validate_parsed(self
, parsed
):
112 """ validates the parsed results and raises an exception if invalid
114 feedparser parses pretty much everything. We reject anything that
115 doesn't look like a feed"""
117 if not parsed
or not parsed
.episodes
:
118 raise NoEpisodesException('no episodes found')
121 @repeat_on_conflict(['podcast'])
122 def _update_podcast(self
, podcast
, parsed
):
123 """ updates a podcast according to new parser results """
127 # we need that later to decide if we can "bump" a category
128 prev_latest_episode_timestamp
= podcast
.latest_episode_timestamp
130 changed |
= update_a(podcast
, 'title', parsed
.title
or podcast
.title
)
131 changed |
= update_a(podcast
, 'urls', list(set(podcast
.urls
+ parsed
.urls
)))
132 changed |
= update_a(podcast
, 'description', parsed
.description
or podcast
.description
)
133 changed |
= update_a(podcast
, 'link', parsed
.link
or podcast
.link
)
134 changed |
= update_a(podcast
, 'logo_url', parsed
.logo
or podcast
.logo_url
)
135 changed |
= update_a(podcast
, 'author', parsed
.author
or podcast
.author
)
136 changed |
= update_a(podcast
, 'language', parsed
.language
or podcast
.language
)
137 changed |
= update_a(podcast
, 'content_types', parsed
.content_types
or podcast
.content_types
)
138 changed |
= update_i(podcast
.tags
, 'feed', parsed
.tags
or podcast
.tags
.get('feed', []))
139 changed |
= update_a(podcast
, 'common_episode_title', parsed
.common_title
or podcast
.common_episode_title
)
140 changed |
= update_a(podcast
, 'new_location', parsed
.new_location
or podcast
.new_location
)
141 changed |
= update_a(podcast
, 'flattr_url', parsed
.flattr
)
144 if podcast
.new_location
:
145 new_podcast
= podcast_for_url(podcast
.new_location
)
146 if new_podcast
!= podcast
:
147 self
._mark
_outdated
(podcast
, 'redirected to different podcast')
150 elif not new_podcast
:
151 podcast
.urls
.insert(0, podcast
.new_location
)
155 episodes
= self
._update
_episodes
(podcast
, parsed
.episodes
)
157 # latest episode timestamp
158 eps
= filter(lambda e
: bool(e
.released
), episodes
)
159 eps
= sorted(eps
, key
=lambda e
: e
.released
)
161 changed |
= update_a(podcast
, 'latest_episode_timestamp', eps
[-1].released
)
162 changed |
= update_a(podcast
, 'episode_count', len(eps
))
165 self
._update
_categories
(podcast
, prev_latest_episode_timestamp
)
167 # try to download the logo and reset logo_url to None on http errors
168 found
= self
._save
_podcast
_logo
(podcast
.logo_url
)
170 changed |
= update_a(podcast
, 'logo_url', None)
173 print 'saving podcast'
174 podcast
.last_update
= datetime
.utcnow()
178 assign_slug(podcast
, PodcastSlug
)
179 assign_missing_episode_slugs(podcast
)
182 def _update_categories(self
, podcast
, prev_timestamp
):
183 """ checks some practical requirements and updates a category """
185 from datetime
import timedelta
187 max_timestamp
= datetime
.utcnow() + timedelta(days
=1)
190 if not podcast
.latest_episode_timestamp
:
194 if prev_timestamp
and podcast
.latest_episode_timestamp
<= prev_timestamp
:
197 # too far in the future
198 if podcast
.latest_episode_timestamp
> max_timestamp
:
201 # not enough subscribers
202 if podcast
.subscriber_count() < settings
.MIN_SUBSCRIBERS_CATEGORY
:
205 update_category(podcast
)
208 @repeat_on_conflict(['podcast'])
209 def _update_episodes(self
, podcast
, parsed_episodes
):
211 all_episodes
= set(episodes_for_podcast_uncached(podcast
))
212 remaining
= list(all_episodes
)
213 updated_episodes
= []
215 for parsed_episode
in parsed_episodes
:
219 for f
in parsed_episode
.files
:
226 guid
= parsed_episode
.guid
228 # pop matchin episodes out of the "existing" list
229 matching
, remaining
= split_list(remaining
, lambda e
: (e
.guid
and e
.guid
== guid
) or url
in e
.urls
)
232 new_episode
= episode_for_podcast_id_url(podcast
.get_id(),
234 matching
= [new_episode
]
235 all_episodes
.add(new_episode
)
238 for episode
in matching
:
240 changed |
= update_a(episode
, 'guid', parsed_episode
.guid
or episode
.guid
)
241 changed |
= update_a(episode
, 'title', parsed_episode
.title
or episode
.title
)
242 changed |
= update_a(episode
, 'description', parsed_episode
.description
or episode
.description
)
243 changed |
= update_a(episode
, 'content', parsed_episode
.content
or parsed_episode
.description
or episode
.content
)
244 changed |
= update_a(episode
, 'link', parsed_episode
.link
or episode
.link
)
245 changed |
= update_a(episode
, 'released', datetime
.utcfromtimestamp(parsed_episode
.released
) if parsed_episode
.released
else episode
.released
)
246 changed |
= update_a(episode
, 'author', parsed_episode
.author
or episode
.author
)
247 changed |
= update_a(episode
, 'duration', parsed_episode
.duration
or episode
.duration
)
248 changed |
= update_a(episode
, 'filesize', parsed_episode
.files
[0].filesize
)
249 changed |
= update_a(episode
, 'language', parsed_episode
.language
or episode
.language
)
250 changed |
= update_a(episode
, 'mimetypes', list(set(filter(None, [f
.mimetype
for f
in parsed_episode
.files
]))))
251 changed |
= update_a(episode
, 'flattr_url', parsed_episode
.flattr
)
253 urls
= list(chain
.from_iterable(f
.urls
for f
in parsed_episode
.files
))
254 changed |
= update_a(episode
, 'urls', sorted(set(episode
.urls
+ urls
), key
=len))
257 episode
.last_update
= datetime
.utcnow()
258 updated_episodes
.append(episode
)
261 outdated_episodes
= all_episodes
- set(updated_episodes
)
263 # set episodes to be outdated, where necessary
264 for e
in filter(lambda e
: not e
.outdated
, outdated_episodes
):
266 updated_episodes
.append(e
)
270 print 'Updating', len(updated_episodes
), 'episodes'
271 self
.db
.save_docs(updated_episodes
)
276 def _save_podcast_logo(self
, cover_art
):
281 image_sha1
= hashlib
.sha1(cover_art
).hexdigest()
282 prefix
= CoverArt
.get_prefix(image_sha1
)
284 filename
= CoverArt
.get_original(prefix
, image_sha1
)
285 dirname
= CoverArt
.get_dir(filename
)
287 # get hash of existing file
288 if os
.path
.exists(filename
):
289 with
open(filename
) as f
:
290 old_hash
= file_hash(f
).digest()
294 print 'LOGO @', cover_art
297 with
open(filename
, 'w') as fp
:
298 fp
.write(urllib2
.urlopen(cover_art
).read())
300 # get hash of new file
301 with
open(filename
) as f
:
302 new_hash
= file_hash(f
).digest()
304 # remove thumbnails if cover changed
305 if old_hash
!= new_hash
:
306 thumbnails
= CoverArt
.get_existing_thumbnails(prefix
, filename
)
307 print 'Removing %d thumbnails' % len(thumbnails
)
313 except (urllib2
.HTTPError
, urllib2
.URLError
, ValueError,
314 httplib
.BadStatusLine
) as e
:
318 @repeat_on_conflict(['podcast'])
319 def _mark_outdated(self
, podcast
, msg
=''):
320 print 'mark outdated', msg
321 podcast
.outdated
= True
322 podcast
.last_update
= datetime
.utcnow()
324 self
._update
_episodes
(podcast
, [])
def update_a(obj, attrib, value):
    """ Set attribute `attrib` of `obj` to `value`

    Returns True if the attribute's value actually changed (or did not
    exist before), False otherwise.  The visible original computed
    `changed` but never returned it, which would make every
    `changed |= update_a(...)` call-site OR in None. """
    # fresh sentinel: never equal to any real attribute value, so a
    # missing attribute always counts as a change
    sentinel = object()
    changed = getattr(obj, attrib, sentinel) != value
    setattr(obj, attrib, value)
    return changed
336 def update_i(obj
, item
, value
):
337 changed
= obj
.get(item
, _none
) != value