2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
# HTTP User-Agent header sent with every outgoing feed request
# (consumed by the module-level `fetcher` below).
USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'
25 from datetime
import datetime
, timedelta
30 from functools
import partial
31 from itertools
import chain
33 from mygpo
.decorators
import repeat_on_conflict
34 from mygpo
.data
import feedcore
35 from mygpo
.utils
import parse_time
, file_hash
36 from mygpo
.api
.sanitizing
import sanitize_url
, rewrite_podcasts
37 from mygpo
.data
import youtube
38 from mygpo
.data
.mimetype
import get_mimetype
, check_mimetype
, get_podcast_types
39 from mygpo
.core
.models
import Episode
, Podcast
40 from mygpo
.core
.slugs
import assign_missing_episode_slugs
, assign_slug
, \
42 from mygpo
.web
.logo
import CoverArt
# Shared feed fetcher used by update_podcasts(); identifies this crawler
# to remote servers via USER_AGENT.
fetcher = feedcore.Fetcher(USER_AGENT)
def mark_outdated(podcast):
    """Flag *podcast* and its episodes as outdated.

    NOTE(review): the loop body and any podcast-level flagging/saving
    (orig lines 48-50) are missing from this excerpt — presumably each
    episode's ``outdated`` flag is set and saved; confirm against the
    full source.
    """
    for e in podcast.get_episodes():
def get_episode_url(entry):
    """Get the download / episode URL of a feedparser entry"""
    # 1) regular RSS/Atom enclosures with an acceptable mimetype
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                return enclosure['href'], mimetype

    # 2) Media-RSS media:content entries
    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        # NOTE(review): a guard line (orig 62, presumably ``if 'url' in
        # media:``) is missing from this excerpt.
        mimetype = get_mimetype(media.get('type', ''), media['url'])
        if check_mimetype(mimetype):
            return media['url'], mimetype

    # 3) plain links that point at YouTube videos
    links = getattr(entry, 'links', [])
    # NOTE(review): the loop header (orig 68, presumably ``for link in
    # links:``) and its ``continue`` (orig 70-71) are missing from this
    # excerpt.
    if not hasattr(link, 'href'):

        if youtube.is_video_link(link['href']):
            return link['href'], 'application/x-youtube'

    # XXX: Implement link detection as in gPodder
    # NOTE(review): the fallback return (orig 76-77, presumably
    # ``return None, None``) is missing from this excerpt.
def get_episode_summary(entry):
    """Return the first usable summary-like field of a feedparser entry.

    Tries the keys 'summary', 'subtitle' and 'link' in that order.
    NOTE(review): the lines that test ``value`` and return it, plus the
    final fallback return (orig 82-86), are missing from this excerpt.
    """
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
def get_duration(entry):
    """Parse the itunes_duration field of a feedparser entry."""
    # NOTE(review): ``str`` shadows the builtin — rename (e.g.
    # ``duration_str``) when this function is next touched.
    str = entry.get('itunes_duration', '')
    # NOTE(review): the ``try:`` line (orig 89-90) and the except-branch
    # body (orig 93-94) are missing from this excerpt; parse failures
    # presumably yield a null duration — confirm against the full source.
        return parse_time(str)
    except (ValueError, TypeError):
def get_filesize(entry, url):
    """Return the enclosure 'length' of the enclosure matching *url*.

    NOTE(review): a ``try:`` line (orig 100) and the except / fallback
    return lines (orig 102-108) are missing from this excerpt —
    presumably a non-numeric length is ignored.
    """
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure and enclosure['href'] == url:
            if 'length' in enclosure:
                    return int(enclosure['length'])
def get_feed_tags(feed):
    """Collect the distinct tags of a feedparser feed dict.

    Gathers the comma-separated values of each tag's 'term' and each
    tag's 'label'. NOTE(review): the ``tags`` initialisation (orig
    110-111) and the conditionals guarding 'term' / 'label' access
    (orig 113, 115-116) are missing from this excerpt.
    """
    for tag in feed.get('tags', []):
        # split comma-separated terms, dropping empty fragments
        tags.extend([t for t in tag['term'].split(',') if t])
        tags.append(tag['label'])
    # set() removes duplicates; result order is unspecified
    return list(set(tags))
def get_episode_metadata(entry, url, mimetype, podcast_language):
    """Build the attribute dict for one feed entry.

    NOTE(review): the dict literal's opening (``d = {``, orig 123-124),
    its closing brace, and the try/except guarding the 'released'
    timestamp (orig 133-138) are missing from this excerpt.
    """
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'filesize': get_filesize(entry, url),
        # fall back to the podcast's language when the entry has none
        'language': entry.get('language', podcast_language),
        'mimetypes': [mimetype],
    # only set when the entry carries a parseable updated_parsed struct
    d['released'] = datetime(*(entry.updated_parsed)[:6])
    # set outdated true if we didn't find a title (so that the
    # feed-downloader doesn't try again infinitely)
    d['outdated'] = not d['title']
def get_podcast_metadata(podcast, feed):
    """Build the updated attribute dict for *podcast* from a fetched *feed*.

    Each value falls back to the podcast's current attribute when the
    feed does not provide it. NOTE(review): the ``dict(``-style opening
    (orig 149-150) and the closing paren / return are missing from this
    excerpt; the keyword lines below are its entries.
    """
    episodes = list(podcast.get_episodes())
        title = feed.feed.get('title', podcast.url),
        link = feed.feed.get('link', podcast.url),
        description = feed.feed.get('subtitle', podcast.description),
        author = feed.feed.get('author', feed.feed.get('itunes_author', podcast.author)),
        language = feed.feed.get('language', podcast.language),
        logo_url = get_podcast_logo(podcast, feed),
        content_types = get_podcast_types(episodes),
        latest_episode_timestamp = get_latest_episode_timestamp(episodes),
def get_latest_episode_timestamp(episodes):
    """Return the newest 'released' timestamp among *episodes*.

    Timestamps further than two days in the future are treated as bogus
    and clamped to the current time. NOTE(review): the empty-input guard
    (orig 165-168) and the normal-case return of ``max_timestamp``
    (orig 176-178) are missing from this excerpt.
    """
    # drop episodes without a released date
    timestamps = filter(None, [e.released for e in episodes])
    max_timestamp = max(timestamps)
    # allow publication dates at most 2 days in the future
    max_future = datetime.utcnow() + timedelta(days=2)
    if max_timestamp > max_future:
        return datetime.utcnow()
def update_podcasts(fetch_queue):
    """Fetch and update every podcast in *fetch_queue*.

    Drives `fetcher` through feedcore's exception-based result protocol:
    fetch outcomes arrive as exceptions (UpdatedFeed, NewLocation, ...).
    NOTE(review): many lines (the ``try:`` headers, ``feed =
    updated.data``, save calls, and several guards — orig 184-185, 190,
    195, 199-200, 202, 205, 208, 210, 212-213, 218, 221-224, 226-227,
    231-232, 236-238, 241-247, 249-250) are missing from this excerpt;
    structure below is reconstructed from the visible fragments.
    NOTE(review): ``socket`` and ``sys`` are used here but absent from
    the visible import block — confirm they are imported in the full
    source.
    """
    for n, podcast in enumerate(fetch_queue):
        print '(%d) %s' % (n, podcast.url)
            # temporarily lower the global socket timeout for the fetch
            timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(60)
            fetcher.fetch(podcast.url)
            socket.setdefaulttimeout(timeout)

        # unrecoverable fetch failures: mark the podcast outdated
        except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin,
                feedcore.AuthenticationRequired, socket.error, IOError):
            print 'marking outdated'
            mark_outdated(podcast)

        # the feed moved: either follow the redirect ourselves or defer
        # to an already-existing podcast at the new URL
        except feedcore.NewLocation, location:
            print 'redirecting to', location.data
            new_url = sanitize_url(location.data)
                p = Podcast.for_url(new_url)
                    # no podcast at the new URL yet: adopt it and re-queue
                    podcast.urls.insert(0, new_url)
                    fetch_queue = chain([podcast], fetch_queue)
                    print 'podcast with new URL found, outdating old one'
                    podcast.new_location = new_url
                    mark_outdated(podcast)

        # successful fetch: sync episodes, metadata and tags
        except feedcore.UpdatedFeed, updated:
                existing_episodes = list(podcast.get_episodes())
                update_ep = partial(update_episode, podcast=podcast)
                feed_episodes = filter(None, map(update_ep, feed.entries))
                outdated_episodes = set(existing_episodes) - set(feed_episodes)

                # set episodes to be outdated, where necessary
                for e in filter(lambda e: not e.outdated, outdated_episodes):

                podcast_md = get_podcast_metadata(podcast, feed)

                # only touch attributes that actually changed
                for key, value in podcast_md.items():
                    if getattr(podcast, key) != value:
                        setattr(podcast, key, value)

                tags = get_feed_tags(feed.feed)
                if podcast.tags.get('feed', None) != tags:
                    podcast.tags['feed'] = tags

                    print 'updating podcast'
                    podcast.last_update = datetime.utcnow()
                    print 'podcast not updated'

            print >>sys.stderr, 'Exception:', e

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)
def get_podcast_logo(podcast, feed):
    """Determine, download and cache the podcast's cover art; return its URL.

    NOTE(review): many lines (orig 261-264, 266-270, 273, 276, 281-283,
    285-286, 289, 293, 298-305, 309, 311-315 — guards, directory
    creation, the download ``try:``, thumbnail removal loop and the
    final return) are missing from this excerpt; structure below is
    reconstructed from the visible fragments.
    NOTE(review): ``hashlib``, ``os``, ``urllib2`` and ``sys`` are used
    here but absent from the visible import block — confirm they are
    imported in the full source.
    """
    cover_art = podcast.logo_url
    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)

    # YouTube channels get their cover from the video page instead
    yturl = youtube.get_real_cover(podcast.link)

        # cache location is derived from the SHA1 of the cover URL
        image_sha1 = hashlib.sha1(cover_art).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file
        if os.path.exists(filename):
            with open(filename) as f:
                old_hash = file_hash(f).digest()

        print 'LOGO @', cover_art

        # download the logo into the cache file
        with open(filename, 'w') as fp:
            fp.write(urllib2.urlopen(cover_art).read())

        # get hash of new file
        with open(filename) as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            print 'Removing %d thumbnails' % len(thumbnails)

        print >> sys.stderr, \
            unicode('cannot save image for podcast %s: %s'
                % (podcast.get_id(), str(e)), errors='ignore')

        print >> sys.stderr, 'cannot save podcast logo'
def update_episode(entry, podcast):
    """Create/update the Episode for one feed *entry* of *podcast*.

    NOTE(review): several lines (orig 318-319, 321-322, 324, 326-327 —
    the guards that trigger the 'Ignoring entry' prints and their
    returns; orig 329, 331-333 — the continuation arguments of the two
    calls below; orig 337-340 — the change-tracking/save logic and
    return) are missing from this excerpt.
    """
    url, mimetype = get_episode_url(entry)
        # entry without a usable download URL
        print 'Ignoring entry'

    url = sanitize_url(url, 'episode')
        # URL rejected by sanitization
        print 'Ignoring entry'

    episode = Episode.for_podcast_id_url(podcast.get_id(),
    md = get_episode_metadata(entry, url, mimetype,

    # only touch attributes that actually changed
    for key, value in md.items():
        if getattr(episode, key) != value:
            setattr(episode, key, value)

            print 'Updating Episode: %s' % episode.title.encode('utf-8', 'ignore')