2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
# User-Agent string handed to feedcore.Fetcher when the module-level
# fetcher is created.
USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'
30 from mygpo
.data
import feedcore
31 from mygpo
.api
import models
32 from mygpo
.data
.models
import PodcastTag
33 from mygpo
.utils
import parse_time
34 from mygpo
.api
.sanitizing
import sanitize_url
, rewrite_podcasts
35 from mygpo
.data
import youtube
36 from mygpo
.data
.mimetype
import get_mimetype
, check_mimetype
, get_podcast_types
# Process-wide default timeout (in seconds) for newly created sockets, so
# that a feed server which never answers cannot stall the crawler forever.
socket.setdefaulttimeout(10)

# Shared fetcher instance used by update_podcasts(); constructed with the
# crawler's USER_AGENT.
fetcher = feedcore.Fetcher(USER_AGENT)
def mark_outdated(podcast):
    """Flag every episode of *podcast* as outdated and persist the change.

    NOTE(review): only the ``def`` line and the ``for`` header of this
    function were visible in the reviewed listing.  The loop body below is
    reconstructed from the function name -- confirm the ``outdated`` field
    name against the Episode model before merging.
    """
    for episode in models.Episode.objects.filter(podcast=podcast):
        episode.outdated = True
        episode.save()
def get_episode_url(entry):
    """Get the download / episode URL of a feedparser entry

    Candidates are examined in this order:

    1. ``entry.enclosures`` -- items carrying an ``href``
    2. ``entry.media_content`` -- items carrying a ``url``
    3. ``entry.links`` -- items whose ``href`` is recognised by
       ``youtube.is_video_link``

    For 1. and 2. the mimetype is derived with ``get_mimetype`` from the
    declared ``type`` and the URL, and the candidate is only used if
    ``check_mimetype`` accepts it.

    Returns a ``(url, mimetype)`` tuple; ``(None, None)`` if no candidate
    qualifies.

    NOTE(review): the ``'url' in media`` guard and the final
    ``(None, None)`` return were not visible in the reviewed listing and
    are reconstructed from the parallel enclosure branch and from how the
    result is unpacked by the caller -- confirm.
    """
    for enclosure in getattr(entry, 'enclosures', []):
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''),
                                    enclosure['href'])
            if check_mimetype(mimetype):
                return enclosure['href'], mimetype

    for media in getattr(entry, 'media_content', []):
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                return media['url'], mimetype

    for link in getattr(entry, 'links', []):
        # Key test (instead of hasattr) so the check agrees with the
        # link['href'] access below and with the enclosure branch above.
        if 'href' not in link:
            continue

        if youtube.is_video_link(link['href']):
            return link['href'], 'application/x-youtube'

        # XXX: Implement link detection as in gPodder

    return None, None
def get_episode_summary(entry):
    """Return a short description for a feed entry.

    The entry's ``summary``, ``subtitle`` and ``link`` values are tried in
    that order and the first non-empty one is returned.

    NOTE(review): the fallback for an entry with none of these values was
    not visible in the reviewed listing; an empty string is assumed --
    confirm.
    """
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return ''
def get_duration(entry):
    """Return the entry's ``itunes_duration`` as converted by ``parse_time``.

    A missing duration is passed to ``parse_time`` as an empty string.

    NOTE(review): the error handling around ``parse_time`` was not visible
    in the reviewed listing.  This assumes ``parse_time`` signals bad input
    with ``ValueError`` and that 0 is the intended fallback -- confirm.
    """
    # Named raw_duration rather than ``str`` to avoid shadowing the builtin.
    raw_duration = entry.get('itunes_duration', '')

    try:
        return parse_time(raw_duration)
    except ValueError:
        return 0
def get_filesize(entry, url):
    """Return the declared size of the enclosure of *entry* that has *url*.

    The first enclosure whose ``href`` equals *url* decides the result: its
    ``length`` converted to ``int``, or ``None`` if it has no ``length`` or
    the value is not a valid integer.  ``None`` is also returned when no
    enclosure matches.

    NOTE(review): the fallbacks were not visible in the reviewed listing;
    ``None`` is assumed for every "unknown size" case -- confirm.
    """
    for enclosure in getattr(entry, 'enclosures', []):
        if 'href' in enclosure and enclosure['href'] == url:
            try:
                return int(enclosure['length'])
            except (KeyError, TypeError, ValueError):
                # no length given, length is None, or not a number
                return None

    return None
def get_feed_tags(feed):
    """Collect the tags announced by a parsed feed.

    For every item in ``feed['tags']`` the ``term`` is split on commas
    (empty pieces are dropped) and the ``label``, if any, is added as one
    tag.  Items lacking ``term`` or ``label`` (or carrying ``None``) are
    tolerated.

    NOTE(review): the return statement was not visible in the reviewed
    listing; a de-duplicated set is assumed -- confirm against callers.
    """
    tags = set()

    for tag in feed.get('tags', []):
        term = tag.get('term')
        if term:
            tags.update(t for t in term.split(',') if t)

        label = tag.get('label')
        if label:
            tags.add(label)

    return tags
def update_feed_tags(podcast, tags):
    """Synchronise the feed-provided PodcastTag rows of *podcast* with *tags*.

    Rows of this source whose tag is no longer in *tags* are deleted, and a
    row is created for every tag that does not have one yet.

    NOTE(review): the assignment of ``src`` was not visible in the reviewed
    listing; the value 'feed' is an assumption -- confirm it against the
    source choices of PodcastTag before merging.
    """
    src = 'feed'

    #delete all tags not found in the feed anymore
    PodcastTag.objects.filter(podcast=podcast, source=src) \
                      .exclude(tag__in=tags).delete()

    #create new found tags
    for tag in tags:
        # get_or_create() already looks the row up before inserting, so a
        # preceding exists() query is unnecessary.
        PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag)
def get_episode_metadata(entry, url, mimetype):
    """Build the dictionary of episode attributes for a feed entry.

    *url* and *mimetype* are the values previously determined for the
    entry; the remaining fields are read from *entry*.  ``timestamp`` is a
    ``datetime.datetime`` built from the first six items of
    ``entry.updated_parsed``, or ``None`` if that value is missing or
    unusable.

    NOTE(review): three dictionary entries were not visible in the
    reviewed listing.  'url', the initial 'timestamp' and 'outdated' are
    reconstructed assumptions -- confirm them against the Episode model
    and the callers before merging.
    """
    d = {
        'url': url,
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'timestamp': None,
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'filesize': get_filesize(entry, url),
        'language': entry.get('language', ''),
        'outdated': False,
        'mimetype': mimetype,
    }

    try:
        d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6])
    except (AttributeError, TypeError, ValueError, OverflowError):
        # no date in the entry, date is None, or not a valid calendar date
        d['timestamp'] = None

    return d
152 def update_podcasts(fetch_queue
):
154 count
= len(fetch_queue
)
156 for podcast
in fetch_queue
:
158 print '(%d/%d) %s' % (n
, count
, podcast
.url
)
161 fetcher
.fetch(podcast
.url
)
163 except (feedcore
.Offline
, feedcore
.InvalidFeed
, feedcore
.WifiLogin
, feedcore
.AuthenticationRequired
):
164 mark_outdated(podcast
)
166 except feedcore
.NewLocation
, location
:
168 new_url
= sanitize_url(location
.data
)
171 if not models
.Podcast
.objects
.filter(url
=new_url
).exists():
172 podcast
.url
= new_url
174 p
= models
.Podcast
.objects
.get(url
=new_url
)
175 rewrite_podcasts(podcast
, p
)
179 except feedcore
.UpdatedFeed
, updated
:
181 podcast
.title
= feed
.feed
.get('title', podcast
.url
)
182 podcast
.link
= feed
.feed
.get('link', podcast
.url
)
183 podcast
.description
= feed
.feed
.get('subtitle', podcast
.description
)
184 podcast
.author
= feed
.feed
.get('author', feed
.feed
.get('itunes_author', podcast
.author
))
185 podcast
.language
= feed
.feed
.get('language', podcast
.language
)
187 cover_art
= podcast
.logo_url
188 image
= feed
.feed
.get('image', None)
189 if image
is not None:
190 for key
in ('href', 'url'):
191 cover_art
= getattr(image
, key
, None)
195 if cover_art
is not None:
197 image_sha1
= hashlib
.sha1()
198 image_sha1
.update(cover_art
)
199 image_sha1
= image_sha1
.hexdigest()
200 filename
= os
.path
.join('..', 'htdocs', 'media', 'logo', image_sha1
)
201 fp
= open(filename
, 'w')
202 fp
.write(urllib2
.urlopen(cover_art
).read())
204 print >>sys
.stderr
, 'LOGO @', cover_art
205 podcast
.logo_url
= cover_art
207 podcast
.logo_url
= None
208 print >>sys
.stderr
, 'cannot save image: %s' % e
210 update_feed_tags(podcast
, get_feed_tags(feed
.feed
))
212 existing_episodes
= list(models
.Episode
.objects
.filter(podcast
=podcast
))
214 for entry
in feed
.entries
:
216 url
, mimetype
= get_episode_url(entry
)
218 print 'Ignoring entry'
221 url
= sanitize_url(url
, podcast
=False, episode
=True)
222 md
= get_episode_metadata(entry
, url
, mimetype
)
223 e
, created
= models
.Episode
.objects
.get_or_create(
228 print 'New episode: ', e
.title
.encode('utf-8', 'ignore')
230 print 'Updating', e
.title
.encode('utf-8', 'ignore')
232 setattr(e
, key
, md
[key
])
234 # we need to distinguish it from non-updated episodes
241 if e
in existing_episodes
:
242 existing_episodes
.remove(e
)
245 print 'Cannot get episode:', e
247 # all episodes that could not be found in the feed
248 for e
in existing_episodes
:
253 podcast
.content_types
= get_podcast_types(podcast
)
256 print >>sys
.stderr
, 'Exception:', e
258 podcast
.last_update
= datetime
.datetime
.now()