mygpo/data/feeddownloader.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # This file is part of my.gpodder.org.
   5 #
   6 # my.gpodder.org is free software: you can redistribute it and/or modify it
   7 # under the terms of the GNU Affero General Public License as published by
   8 # the Free Software Foundation, either version 3 of the License, or (at your
   9 # option) any later version.
  10 #
  11 # my.gpodder.org is distributed in the hope that it will be useful, but
  12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
  14 # License for more details.
  15 #
  16 # You should have received a copy of the GNU Affero General Public License
  17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
  18 #
  19
# User-Agent string passed to feedcore.Fetcher when the module-level fetcher
# is created below.
USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'
  21
  22
  23 import os
  24 import sys
  25 import datetime
  26 import hashlib
  27 import urllib2
  28 import socket
  29
  30 from mygpo.data import feedcore
  31 from mygpo.api import models
  32 from mygpo.data.models import PodcastTag
  33 from mygpo.utils import parse_time
  34 from mygpo.api.sanitizing import sanitize_url, rewrite_podcasts
  35 from mygpo.data import youtube
  36 from mygpo.data.mimetype import get_mimetype, check_mimetype, get_podcast_types
  37
# Set a default timeout of 10 seconds for sockets created after this point.
socket.setdefaulttimeout(10)
# Module-wide feed fetcher, constructed with the crawler's User-Agent string.
fetcher = feedcore.Fetcher(USER_AGENT)
  40
  41
  42 def mark_outdated(podcast):
  43     for e in models.Episode.objects.filter(podcast=podcast):
  44         e.outdated = True
  45         e.save()
  46
  47 def get_episode_url(entry):
  48     """Get the download / episode URL of a feedparser entry"""
  49     enclosures = getattr(entry, 'enclosures', [])
  50     for enclosure in enclosures:
  51         if 'href' in enclosure:
  52             mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
  53             if check_mimetype(mimetype):
  54                 return enclosure['href'], mimetype
  55
  56     media_content = getattr(entry, 'media_content', [])
  57     for media in media_content:
  58         if 'url' in media:
  59             mimetype = get_mimetype(media.get('type', ''), media['url'])
  60             if check_mimetype(mimetype):
  61                 return media['url'], mimetype
  62
  63     links = getattr(entry, 'links', [])
  64     for link in links:
  65         if not hasattr(link, 'href'):
  66             continue
  67
  68         if youtube.is_video_link(link['href']):
  69             return link['href'], 'application/x-youtube'
  70
  71         # XXX: Implement link detection as in gPodder
  72
  73     return None, None
  74
  75 def get_episode_summary(entry):
  76     for key in ('summary', 'subtitle', 'link'):
  77         value = entry.get(key, None)
  78         if value:
  79             return value
  80
  81     return ''
  82
  83 def get_duration(entry):
  84     str = entry.get('itunes_duration', '')
  85
  86     try:
  87         return parse_time(str)
  88     except ValueError:
  89         return 0
  90
  91 def get_filesize(entry, url):
  92     enclosures = getattr(entry, 'enclosures', [])
  93     for enclosure in enclosures:
  94         if 'href' in enclosure and enclosure['href'] == url:
  95             if 'length' in enclosure:
  96                 try:
  97                     return int(enclosure['length'])
  98                 except ValueError:
  99                     return None
 100
 101             return None
 102     return None
 103
 104
 105 def get_feed_tags(feed):
 106     tags = []
 107
 108     for tag in feed.get('tags', []):
 109         if tag['term']:
 110             tags.extend([t for t in tag['term'].split(',') if t])
 111
 112         if tag['label']:
 113             tags.append(tag['label'])
 114
 115     return set(tags)
 116
 117
 118 def update_feed_tags(podcast, tags):
 119     src = 'feed'
 120
 121     #delete all tags not found in the feed anymore
 122     PodcastTag.objects.filter(podcast=podcast, source=src).exclude(tag__in=tags).delete()
 123
 124     #create new found tags
 125     for tag in tags:
 126         if not PodcastTag.objects.filter(podcast=podcast, source=src, tag=tag).exists():
 127             PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag)
 128
 129
 130 def get_episode_metadata(entry, url, mimetype):
 131     d = {
 132             'url': url,
 133             'title': entry.get('title', entry.get('link', '')),
 134             'description': get_episode_summary(entry),
 135             'link': entry.get('link', ''),
 136             'timestamp': None,
 137             'author': entry.get('author', entry.get('itunes_author', '')),
 138             'duration': get_duration(entry),
 139             'filesize': get_filesize(entry, url),
 140             'language': entry.get('language', ''),
 141             'outdated': False,
 142             'mimetype': mimetype,
 143     }
 144     try:
 145         d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6])
 146     except:
 147         d['timestamp'] = None
 148
 149     return d
 150
 151
 152 def update_podcasts(fetch_queue):
 153     n=0
 154     count = len(fetch_queue)
 155
 156     for podcast in fetch_queue:
 157         n+=1
 158         print '(%d/%d) %s' % (n, count, podcast.url)
 159
 160         try:
 161             fetcher.fetch(podcast.url)
 162
 163         except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin, feedcore.AuthenticationRequired):
 164             mark_outdated(podcast)
 165
 166         except feedcore.NewLocation, location:
 167             print location.data
 168             new_url = sanitize_url(location.data)
 169             if new_url:
 170                 print new_url
 171                 if not models.Podcast.objects.filter(url=new_url).exists():
 172                     podcast.url = new_url
 173                 else:
 174                     p = models.Podcast.objects.get(url=new_url)
 175                     rewrite_podcasts(podcast, p)
 176                     podcast.delete()
 177                     continue
 178
 179         except feedcore.UpdatedFeed, updated:
 180             feed = updated.data
 181             podcast.title = feed.feed.get('title', podcast.url)
 182             podcast.link = feed.feed.get('link', podcast.url)
 183             podcast.description = feed.feed.get('subtitle', podcast.description)
 184             podcast.author = feed.feed.get('author', feed.feed.get('itunes_author', podcast.author))
 185             podcast.language = feed.feed.get('language', podcast.language)
 186
 187             cover_art = podcast.logo_url
 188             image = feed.feed.get('image', None)
 189             if image is not None:
 190                 for key in ('href', 'url'):
 191                     cover_art = getattr(image, key, None)
 192                     if cover_art:
 193                         break
 194
 195             if cover_art is not None:
 196                 try:
 197                     image_sha1 = hashlib.sha1()
 198                     image_sha1.update(cover_art)
 199                     image_sha1 = image_sha1.hexdigest()
 200                     filename = os.path.join('..', 'htdocs', 'media', 'logo', image_sha1)
 201                     fp = open(filename, 'w')
 202                     fp.write(urllib2.urlopen(cover_art).read())
 203                     fp.close()
 204                     print >>sys.stderr, 'LOGO @', cover_art
 205                     podcast.logo_url = cover_art
 206                 except Exception, e:
 207                     podcast.logo_url = None
 208                     print >>sys.stderr, 'cannot save image: %s' % e
 209
 210             update_feed_tags(podcast, get_feed_tags(feed.feed))
 211
 212             existing_episodes = list(models.Episode.objects.filter(podcast=podcast))
 213
 214             for entry in feed.entries:
 215                 try:
 216                     url, mimetype = get_episode_url(entry)
 217                     if url is None:
 218                         print 'Ignoring entry'
 219                         continue
 220
 221                     url = sanitize_url(url, podcast=False, episode=True)
 222                     md = get_episode_metadata(entry, url, mimetype)
 223                     e, created = models.Episode.objects.get_or_create(
 224                         podcast=podcast,
 225                         url=url,
 226                         defaults=md)
 227                     if created:
 228                         print 'New episode: ', e.title.encode('utf-8', 'ignore')
 229                     else:
 230                         print 'Updating', e.title.encode('utf-8', 'ignore')
 231                         for key in md:
 232                             setattr(e, key, md[key])
 233
 234                     # we need to distinguish it from non-updated episodes
 235                     if not e.title:
 236                         e.outdated = True
 237                     else:
 238                         e.outdated = False
 239                     e.save()
 240
 241                     if e in existing_episodes:
 242                         existing_episodes.remove(e)
 243
 244                 except Exception, e:
 245                     print 'Cannot get episode:', e
 246
 247             # all episodes that could not be found in the feed
 248             for e in existing_episodes:
 249                 if not e.outdated:
 250                     e.outdated = True
 251                     e.save()
 252
 253             podcast.content_types = get_podcast_types(podcast)
 254
 255         except Exception, e:
 256             print >>sys.stderr, 'Exception:', e
 257
 258         podcast.last_update = datetime.datetime.now()
 259         try:
 260             podcast.save()
 261         except Exception, e:
 262             print e
 263
 264