# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
# User-Agent string handed to the feed fetcher created below.
USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'

# Set the settings module before the mygpo imports below.
# NOTE(review): presumably mygpo's Django models read this at import time.
os.environ['DJANGO_SETTINGS_MODULE'] = 'mygpo.settings'

# Put the parent directory of this script first on the import path so the
# "mygpo" package imported below resolves relative to this file.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from mygpo import feedcore
from mygpo.api import models

# Default timeout (in seconds) for every socket created after this point.
socket.setdefaulttimeout(10)
fetcher = feedcore.Fetcher(USER_AGENT)

# Podcasts whose last_update is older than this are due for a refresh.
UPDATE_LIMIT = datetime.datetime.now() - datetime.timedelta(days=15)

# URLs given on the command line take precedence. Previously the list built
# from sys.argv was immediately overwritten by the filter below, so the
# arguments were looked up in the database and then ignored.
if len(sys.argv) > 1:
    fetch_queue = [models.Podcast.objects.get(url=url) for url in sys.argv[1:]]
else:
    #fetch_queue = models.Podcast.objects.all()
    fetch_queue = models.Podcast.objects.filter(last_update__lt=UPDATE_LIMIT)
def check_mime(mimetype):
    """Check if a mimetype is a "wanted" media type.

    A type is wanted when the part before the first '/' is ``audio``,
    ``video`` or ``image``. The comparison ignores case and surrounding
    whitespace, because MIME type names are case-insensitive.

    Returns False (instead of raising ValueError on the unpack) for values
    that carry no '/' at all, such as the empty string or None.
    """
    if not mimetype or '/' not in mimetype:
        return False

    category, _ignore = mimetype.strip().lower().split('/', 1)
    return category in ('audio', 'video', 'image')
def get_episode_url(entry):
    """Get the download / episode URL of a feedparser entry.

    The entry's ``enclosures`` are checked first, then its ``media_content``
    items. The first item that carries a URL and whose ``type`` passes
    check_mime() wins. Returns None when no such item exists.
    """
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure and check_mime(enclosure.get('type', '')):
            return enclosure['href']

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        # Fixed: this used to call m.get(...), but no name "m" exists here,
        # so any media item with a 'url' key raised NameError.
        if 'url' in media and check_mime(media.get('type', '')):
            return media['url']

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        # XXX: Implement link detection as in gPodder

    return None
76 def get_episode_summary(entry
):
77 for key
in ('summary', 'subtitle', 'link'):
78 value
= entry
.get(key
, None)
84 def get_episode_metadata(entry
, url
):
87 'title': entry
.get('title', entry
.get('link', '')),
88 'description': get_episode_summary(entry
),
89 'link': entry
.get('link', ''),
93 d
['timestamp'] = datetime
.datetime(*(entry
.updated_parsed
)[:6])
99 for podcast
in fetch_queue
:
103 fetcher
.fetch(podcast
.url
)
104 except feedcore
.Offline
:
106 except feedcore
.InvalidFeed
:
108 except feedcore
.WifiLogin
:
110 except feedcore
.AuthenticationRequired
:
112 except feedcore
.NewLocation
, location
:
113 podcast
.url
= location
.data
114 except feedcore
.UpdatedFeed
, updated
:
116 podcast
.title
= feed
.feed
.get('title', podcast
.url
)
117 podcast
.link
= feed
.feed
.get('link', podcast
.url
)
118 podcast
.description
= feed
.feed
.get('subtitle', podcast
.description
)
121 image
= feed
.feed
.get('image', None)
122 if image
is not None:
123 for key
in ('href', 'url'):
124 cover_art
= getattr(image
, key
, None)
128 if cover_art
is not None:
129 image_sha1
= hashlib
.sha1()
130 image_sha1
.update(cover_art
)
131 image_sha1
= image_sha1
.hexdigest()
132 filename
= os
.path
.join('htdocs', 'media', 'logo', image_sha1
)
133 if not os
.path
.exists(filename
):
135 fp
= open(filename
, 'w')
136 fp
.write(urllib2
.urlopen(cover_art
).read())
138 print >>sys
.stderr
, 'LOGO @', cover_art
139 podcast
.logo_url
= cover_art
141 print >>sys
.stderr
, 'cannot save image'
143 for entry
in feed
.entries
:
145 url
= get_episode_url(entry
)
147 print 'Ignoring entry'
149 e
, created
= models
.Episode
.objects
.get_or_create(
152 defaults
=get_episode_metadata(entry
, url
))
154 print 'New episode: ', e
.title
.encode('utf-8', 'ignore')
157 print 'Cannot get episode:', e
159 print >>sys
.stderr
, 'Exception:', e
161 podcast
.last_update
= datetime
.datetime
.now()