# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
import hashlib
import os
import urllib2

from contextlib import closing
from datetime import datetime
from itertools import chain

from feedservice.parse import parse_feed
from feedservice.parse.text import ConvertMarkdown

from mygpo.core.models import Podcast, PodcastGroup
from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from mygpo.utils import file_hash, split_list
from mygpo.web.logo import CoverArt
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_uncached
from mygpo.db.couchdb.podcast import podcast_for_url
from mygpo.db.couchdb.directory import category_for_tag
from mygpo.directory.tags import update_category

from mygpo.couch import get_main_database
class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self, queue):
        """ Queue is an iterable of podcast objects """
        # update() iterates self.queue, so the iterable has to be kept
        self.queue = queue
        self.db = get_main_database()


    def update(self):
        """ Run the updates

        Walks the queue; a PodcastGroup is expanded into its member
        podcasts, anything else is updated directly. """
        # NOTE(review): the ``def`` line of this method was missing from the
        # reviewed text; the name ``update`` is assumed -- confirm with callers.

        for n, podcast in enumerate(self.queue):

            if isinstance(podcast, PodcastGroup):
                for m in range(len(podcast.podcasts)):
                    # The group is fetched again for every member instead of
                    # iterating podcast.podcasts directly.
                    # NOTE(review): presumably because update_podcast() may
                    # save the group document -- confirm.
                    pg = PodcastGroup.get(podcast._id)
                    p = pg.podcasts[m]
                    print('{:5d} {:s}'.format(n, p.url))
                    self.update_podcast(p)

            else:
                print('{:5d} {:s}'.format(n, podcast.url))
                self.update_podcast(podcast)


    def update_podcast(self, podcast):
        """ Fetches the feed of one podcast and applies it to the stored data

        The podcast is marked outdated when its feed answers with HTTP 404
        or 400, or when nothing could be parsed; other HTTP errors are
        re-raised. Otherwise metadata and episodes are updated, the category
        is refreshed, slugs are assigned and the logo is stored. """

        try:
            parsed = parse_feed(podcast.url, text_processor=ConvertMarkdown())

        except urllib2.HTTPError as e:
            if e.code in (404, 400):
                self.mark_outdated(podcast)
                return

            # NOTE(review): re-raising other HTTP errors is reconstructed;
            # without it ``parsed`` would be unbound below.
            raise

        # NOTE(review): guard reconstructed around the second
        # mark_outdated() call that is visible in the reviewed text.
        if not parsed:
            self.mark_outdated(podcast)
            return

        podcast = podcast_for_url(parsed.urls[0])
        changed = False

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # for every field the parsed value wins, the stored one is the fallback
        changed |= update_a(podcast, 'title', parsed.title or podcast.title)
        changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
        changed |= update_a(podcast, 'description', parsed.description or podcast.description)
        changed |= update_a(podcast, 'link', parsed.link or podcast.link)
        changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
        changed |= update_a(podcast, 'author', parsed.author or podcast.author)
        changed |= update_a(podcast, 'language', parsed.language or podcast.language)
        changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
        changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
        changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
        changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)

        if podcast.new_location:
            new_podcast = Podcast.for_url(podcast.new_location)

            # NOTE(review): both conditions below are reconstructed; only the
            # lookup, the mark_outdated() call and the insert() are visible.
            if not new_podcast:
                # nothing is stored for the new address yet: make it the
                # first URL of this podcast
                podcast.urls.insert(0, podcast.new_location)
                changed = True

            elif new_podcast != podcast:
                # the feed moved to a different, already stored podcast
                self.mark_outdated(podcast)
                return

        episodes = self.update_episodes(podcast, parsed.episodes)

        # latest episode timestamp; guarded because the feed may not contain
        # a single episode with a release date
        release_dates = [e.released for e in episodes if e.released]
        if release_dates:
            changed |= update_a(podcast, 'latest_episode_timestamp', max(release_dates))

        self.update_categories(podcast, prev_latest_episode_timestamp)

        # NOTE(review): the ``if changed`` guard and the save() call are
        # reconstructed; ``changed`` is not consumed anywhere else.
        if changed:
            print(' saving podcast')
            podcast.last_update = datetime.utcnow()
            podcast.save()

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

        self.save_podcast_logo(podcast.logo_url)


    def update_categories(self, podcast, prev_timestamp, min_subscribers=5):
        """ Calls update_category() for the podcast if it got a new episode

        Nothing happens when the podcast has no latest-episode timestamp,
        when that timestamp is not newer than prev_timestamp, when it lies
        more than one day in the future, or when the podcast has fewer than
        min_subscribers subscribers. """
        from datetime import timedelta

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < min_subscribers:
            return

        update_category(podcast)


    def update_episodes(self, podcast, parsed_episodes):
        """ Applies the parsed episodes to the stored episodes of a podcast

        Stored episodes are matched by GUID or URL; unmatched parsed
        episodes get a new episode object. Stored episodes that do not
        appear among the updated ones are flagged as outdated. Returns the
        set of all episodes of the podcast. """

        all_episodes = set(episodes_for_podcast_uncached(podcast))
        remaining = list(all_episodes)
        updated_episodes = []

        for parsed_episode in parsed_episodes:

            # NOTE(review): the selection of ``url`` was missing from the
            # reviewed text; the first URL of the first file that has any is
            # used here -- confirm.
            url = None
            for f in parsed_episode.files:
                if f.urls:
                    url = f.urls[0]
                    break

            # an episode without a media URL cannot be matched or created
            if not url:
                continue

            guid = parsed_episode.guid

            # pop matching episodes out of the "existing" list
            matching, remaining = split_list(remaining,
                lambda e: (e.guid and e.guid == guid) or url in e.urls)

            if not matching:
                # NOTE(review): the argument list was cut off after
                # ``podcast.get_id(),``; ``url, create=True`` is assumed.
                new_episode = episode_for_podcast_id_url(podcast.get_id(),
                    url, create=True)
                matching = [new_episode]
                all_episodes.add(new_episode)

            # values that only depend on the parsed episode are computed once
            released = datetime.utcfromtimestamp(parsed_episode.released) \
                if parsed_episode.released else None
            mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
            urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))

            for episode in matching:
                changed = False
                changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
                changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
                changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
                changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
                changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
                changed |= update_a(episode, 'released', released)
                changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
                changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
                changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
                changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
                changed |= update_a(episode, 'mimetypes', mimetypes)

                # shortest URL first
                changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))

                # NOTE(review): guard reconstructed -- ``changed`` has no
                # other consumer in this loop.
                if changed:
                    episode.last_update = datetime.utcnow()
                    updated_episodes.append(episode)

                #episode.content_types = None #TODO

        outdated_episodes = all_episodes - set(updated_episodes)

        # set episodes to be outdated, where necessary
        for e in filter(lambda e: not e.outdated, outdated_episodes):
            e.outdated = True
            updated_episodes.append(e)

        if updated_episodes:
            print(' Updating %d episodes' % len(updated_episodes))
            self.db.save_docs(updated_episodes)

        # update_podcast() filters this result by release date
        return all_episodes


    def save_podcast_logo(self, cover_art):
        """ Downloads the logo at URL cover_art and stores it via CoverArt

        Existing thumbnails are removed when the stored image changed.
        Download errors are printed and otherwise ignored; nothing is done
        for an empty URL. """
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            # NOTE(review): the result is unused; the call is kept in case
            # get_dir() prepares the target directory -- confirm.
            CoverArt.get_dir(filename)

            # get hash of existing file; None if there is no file yet
            old_hash = None
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()

            print(' LOGO @ %s' % cover_art)

            # download first, so that a failing request does not truncate
            # the cover that is already stored
            with closing(urllib2.urlopen(cover_art)) as response:
                image_data = response.read()

            # image data is binary
            with open(filename, 'wb') as fp:
                fp.write(image_data)

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                print(' Removing %d thumbnails' % len(thumbnails))
                # NOTE(review): the removal itself was missing from the
                # reviewed text; assumes get_existing_thumbnails() returns
                # file paths -- confirm.
                for thumbnail in thumbnails:
                    os.unlink(thumbnail)

        except urllib2.URLError as e:
            # HTTPError is a subclass of URLError, so this covers both
            # NOTE(review): the handler bodies were missing from the reviewed
            # text; reporting the error and carrying on is assumed.
            print(e)


    def mark_outdated(self, podcast):
        """ Flags the podcast as outdated and outdates all of its episodes """
        print(' mark outdated')
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        # NOTE(review): save() reconstructed; nothing visible in the reviewed
        # text persists the two attributes set above.
        podcast.save()
        # an empty episode list flags every stored episode as outdated
        self.update_episodes(podcast, [])
def update_a(obj, attrib, value):
    """ Sets attribute attrib of obj to value

    Returns True if the attribute was missing or differed from value
    before the assignment, False otherwise. Callers accumulate the result
    with ``changed |= update_a(...)``. """
    # a fresh object never equals a caller-supplied value, so a missing
    # attribute always counts as a change
    missing = object()
    changed = getattr(obj, attrib, missing) != value
    setattr(obj, attrib, value)
    return changed
def update_i(obj, item, value):
    """ Sets item of the mapping obj to value

    Returns True if the item was missing or differed from value before
    the assignment, False otherwise. Callers accumulate the result with
    ``changed |= update_i(...)``. """
    # a fresh object never equals a caller-supplied value, so a missing
    # item always counts as a change
    missing = object()
    changed = obj.get(item, missing) != value
    # NOTE(review): the assignment and the return were not part of the
    # reviewed text; reconstructed by analogy with update_a() -- confirm.
    obj[item] = value
    return changed