# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
# User-Agent string; passed to feedcore.Fetcher when the module-level
# fetcher is created.
USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'
30 from mygpo
.decorators
import repeat_on_conflict
31 from mygpo
import migrate
32 from mygpo
.data
import feedcore
33 from mygpo
.api
import models
34 from mygpo
.utils
import parse_time
35 from mygpo
.api
.sanitizing
import sanitize_url
, rewrite_podcasts
36 from mygpo
.data
import youtube
37 from mygpo
.data
.mimetype
import get_mimetype
, check_mimetype
, get_podcast_types
38 from mygpo
import migrate
# Default timeout (in seconds) for socket objects created from here on.
socket.setdefaulttimeout(10)

# Module-level fetcher, constructed with the crawler's User-Agent string.
fetcher = feedcore.Fetcher(USER_AGENT)
# Iterates over the Episode objects whose podcast is the given one.
# NOTE(review): the loop body (source lines 46-47) is missing from this
# listing -- presumably each episode is flagged as outdated and saved;
# confirm against the complete file.
44 def mark_outdated(podcast
):
45 for e
in models
.Episode
.objects
.filter(podcast
=podcast
):
def get_episode_url(entry):
    """Get the download / episode URL of a feedparser entry.

    Candidates are tried in this order: the entry's enclosures, its
    media_content items, and finally its links (YouTube video links only).

    Returns a (url, mimetype) tuple for the first acceptable candidate, or
    (None, None) if there is none, so that callers can always unpack two
    values.
    """
    for enclosure in getattr(entry, 'enclosures', []):
        if 'href' not in enclosure:
            continue

        mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
        if check_mimetype(mimetype):
            return enclosure['href'], mimetype

    for media in getattr(entry, 'media_content', []):
        # skip items without a URL instead of failing on media['url']
        if 'url' not in media:
            continue

        mimetype = get_mimetype(media.get('type', ''), media['url'])
        if check_mimetype(mimetype):
            return media['url'], mimetype

    for link in getattr(entry, 'links', []):
        # membership test matches the subscript access used right below
        # (and the check used for enclosures above)
        if 'href' not in link:
            continue

        if youtube.is_video_link(link['href']):
            return link['href'], 'application/x-youtube'

        # XXX: Implement link detection as in gPodder

    return None, None
def get_episode_summary(entry):
    """Return a short description for a feedparser entry.

    The keys 'summary', 'subtitle' and 'link' are consulted in that order
    and the first non-empty value wins; '' is returned if all are missing
    or empty.
    """
    # NOTE(review): the listing this was restored from only shows the loop
    # header and the lookup; the selection and the '' fallback are a
    # reconstruction -- confirm against the complete file.
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return ''
def get_duration(entry):
    """Return the entry's 'itunes_duration' as converted by parse_time().

    A missing value is passed to parse_time() as ''.
    """
    # local renamed from "str", which shadowed the builtin of that name
    duration = entry.get('itunes_duration', '')

    # NOTE(review): the lines surrounding the parse_time() call are missing
    # from the listing this was restored from; the ValueError -> 0 fallback
    # is a reconstruction -- confirm against the complete file.
    try:
        return parse_time(duration)
    except ValueError:
        return 0
def get_filesize(entry, url):
    """Return the size announced for the enclosure that points to url.

    The first enclosure whose 'href' equals url decides the result: its
    'length' converted to int, or None if the length is missing or cannot
    be converted. None is also returned if no enclosure matches.
    """
    for enclosure in getattr(entry, 'enclosures', []):
        if 'href' not in enclosure or enclosure['href'] != url:
            continue

        try:
            return int(enclosure['length'])
        except (KeyError, TypeError, ValueError):
            # KeyError: no length given; TypeError: length is None;
            # ValueError: length is not a number
            return None

    return None
def get_feed_tags(feed):
    """Return the distinct tags of a feed as a sorted list.

    Every item of feed['tags'] contributes the comma-separated parts of its
    'term' (surrounding whitespace stripped, empty parts dropped) and its
    'label'. Items whose 'term' or 'label' is missing or empty are
    tolerated.
    """
    tags = set()

    for tag in feed.get('tags', []):
        term = tag.get('term')
        if term:
            parts = (part.strip() for part in term.split(','))
            tags.update(part for part in parts if part)

        label = tag.get('label')
        if label:
            tags.add(label)

    # sorted so that the result does not depend on set iteration order
    return sorted(tags)
# Saves tags for a podcast (going by the error message below); the function
# is wrapped in repeat_on_conflict().
# NOTE(review): several statements are missing from this listing (source
# lines 122, 124-127 and 130-131), among them whatever stores the tags and
# the try/except that binds "e". Comments describe only what is visible.
120 @repeat_on_conflict()
121 def update_feed_tags(podcast
, tags
):
# obtain the podcast object through the migrate module
123 np
= migrate
.get_or_migrate_podcast(podcast
)
# imported locally; ResourceConflict is the type tested for just below
128 from couchdbkit
import ResourceConflict
# NOTE(review): the statement guarded by this check is not visible --
# presumably the conflict is re-raised for repeat_on_conflict(); confirm
129 if isinstance(e
, ResourceConflict
):
# failure report written to stderr, naming the podcast id and the error
132 print >> sys
.stderr
, 'error saving tags for podcast %s: %s' % (np
.get_id(), e
)
# Collects metadata for one feed entry. The visible statements provide the
# keys title, description, link, author, duration, filesize, language and
# mimetype, and set d['timestamp'] afterwards.
# NOTE(review): the lines that open and close the dict literal, two of its
# entries (source lines 141 and 146), the statements guarding the timestamp
# assignments and the return statement are missing from this listing.
135 def get_episode_metadata(entry
, url
, mimetype
):
# title falls back to the entry's link, then to ''
138 'title': entry
.get('title', entry
.get('link', '')),
139 'description': get_episode_summary(entry
),
140 'link': entry
.get('link', ''),
# author falls back to itunes_author, then to ''
142 'author': entry
.get('author', entry
.get('itunes_author', '')),
143 'duration': get_duration(entry
),
144 'filesize': get_filesize(entry
, url
),
145 'language': entry
.get('language', ''),
147 'mimetype': mimetype
,
# the first six items of entry.updated_parsed are passed to datetime.datetime()
150 d
['timestamp'] = datetime
.datetime(*(entry
.updated_parsed
)[:6])
# NOTE(review): presumably the fallback when the datetime above cannot be
# built -- the guarding statement is not visible; confirm
152 d
['timestamp'] = None
# Walks over the podcasts in fetch_queue, fetches each podcast's feed and
# updates the podcast, its logo, its tags and its episodes from the result.
# NOTE(review): many statements are missing from this listing (try/else
# lines, the counter "n", the binding of "feed", save() calls, several
# conditions). The comments below describe only what is visible.
157 def update_podcasts(fetch_queue
):
159 count
= len(fetch_queue
)
161 for podcast
in fetch_queue
:
# progress line: position, total number of podcasts, feed URL
163 print '(%d/%d) %s' % (n
, count
, podcast
.url
)
# fetch the feed; the feedcore exceptions caught below select what happens next
166 fetcher
.fetch(podcast
.url
)
# offline / invalid feed / wifi login / authentication required:
# hand the podcast to mark_outdated()
168 except (feedcore
.Offline
, feedcore
.InvalidFeed
, feedcore
.WifiLogin
, feedcore
.AuthenticationRequired
):
169 mark_outdated(podcast
)
# new location reported: sanitize the URL carried by the exception
171 except feedcore
.NewLocation
, location
:
173 new_url
= sanitize_url(location
.data
)
# no podcast is stored under the new URL: point this podcast at it
176 if not models
.Podcast
.objects
.filter(url
=new_url
).exists():
177 podcast
.url
= new_url
# NOTE(review): presumably the else branch (not visible) -- the podcast
# already stored under new_url is loaded and passed to rewrite_podcasts()
179 p
= models
.Podcast
.objects
.get(url
=new_url
)
180 rewrite_podcasts(podcast
, p
)
# updated feed: copy feed-level fields onto the podcast. title and link fall
# back to the podcast URL; description, author and language keep their
# current value when the feed does not provide one.
184 except feedcore
.UpdatedFeed
, updated
:
186 podcast
.title
= feed
.feed
.get('title', podcast
.url
)
187 podcast
.link
= feed
.feed
.get('link', podcast
.url
)
188 podcast
.description
= feed
.feed
.get('subtitle', podcast
.description
)
189 podcast
.author
= feed
.feed
.get('author', feed
.feed
.get('itunes_author', podcast
.author
))
190 podcast
.language
= feed
.feed
.get('language', podcast
.language
)
# cover art: start from the stored logo URL, then inspect the feed's image
# under the attribute names 'href' and 'url'
192 cover_art
= podcast
.logo_url
193 image
= feed
.feed
.get('image', None)
194 if image
is not None:
195 for key
in ('href', 'url'):
196 cover_art
= getattr(image
, key
, None)
# cover lookup through the youtube module, based on the podcast link
200 yturl
= youtube
.get_real_cover(podcast
.link
)
# download the image into htdocs/media/logo, in a file named after the
# SHA-1 hex digest of the cover URL
206 image_sha1
= hashlib
.sha1()
207 image_sha1
.update(cover_art
)
208 image_sha1
= image_sha1
.hexdigest()
209 filename
= os
.path
.join(os
.path
.dirname(os
.path
.abspath(__file__
)), '..', '..', 'htdocs', 'media', 'logo', image_sha1
)
210 fp
= open(filename
, 'w')
211 fp
.write(urllib2
.urlopen(cover_art
).read())
213 print 'LOGO @', cover_art
214 podcast
.logo_url
= cover_art
# NOTE(review): the condition under which the logo URL is cleared is not visible
216 podcast
.logo_url
= None
218 print >> sys
.stderr
, 'cannot save image %s for podcast %d: %s' % (cover_art
.encode('utf-8'), podcast
.id, repr(e
).encode('utf-8'))
# store the tags extracted from the feed
220 update_feed_tags(podcast
, get_feed_tags(feed
.feed
))
# list of the episodes currently stored for the podcast; episodes matched
# to a feed entry are removed from it further down
222 existing_episodes
= list(models
.Episode
.objects
.filter(podcast
=podcast
))
224 for entry
in feed
.entries
:
226 url
, mimetype
= get_episode_url(entry
)
228 print 'Ignoring entry'
# sanitize the episode URL, collect the metadata, then get or create the episode
231 url
= sanitize_url(url
, 'episode')
232 md
= get_episode_metadata(entry
, url
, mimetype
)
233 e
, created
= models
.Episode
.objects
.get_or_create(
238 print 'New episode: ', e
.title
.encode('utf-8', 'ignore')
240 print 'Updating', e
.title
.encode('utf-8', 'ignore')
# copy a metadata value onto the episode
242 setattr(e
, key
, md
[key
])
244 # we need to distinguish it from non-updated episodes
251 if e
in existing_episodes
:
252 existing_episodes
.remove(e
)
255 print 'Cannot get episode:', e
257 # all episodes that could not be found in the feed
258 for e
in existing_episodes
:
263 podcast
.content_types
= get_podcast_types(podcast
)
266 print >>sys
.stderr
, 'Exception:', e
# record the time of this update
268 podcast
.last_update
= datetime
.datetime
.now()