#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'

import os
import sys
from datetime import datetime, timedelta
import hashlib
import urllib2
import socket
from glob import glob
from functools import partial
from itertools import chain

from mygpo.decorators import repeat_on_conflict
from mygpo.data import feedcore
from mygpo.utils import parse_time, file_hash
from mygpo.api.sanitizing import sanitize_url, rewrite_podcasts
from mygpo.data import youtube
from mygpo.data.mimetype import get_mimetype, check_mimetype, get_podcast_types
from mygpo.core.models import Episode, Podcast
from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from mygpo.web.logo import CoverArt


fetcher = feedcore.Fetcher(USER_AGENT)


def mark_outdated(podcast):
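    """Marks all episodes of the podcast as outdated"""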
    for e in podcast.get_episodes():
        e.outdated = True
        e.save()


def get_episode_url(entry):
    """Get the download / episode URL of a feedparser entry"""
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                return enclosure['href'], mimetype

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                return media['url'], mimetype

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            return link['href'], 'application/x-youtube'

    # XXX: Implement link detection as in gPodder

    return None, None


def get_episode_summary(entry):
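    """Returns the first set value of summary, subtitle or link"""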
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return ''


def get_duration(entry):
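    """Parses the entry's duration from its itunes:duration field

    Returns 0 if the field is missing or unparsable."""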
    duration_str = entry.get('itunes_duration', '')

    try:
        return parse_time(duration_str)
    except (ValueError, TypeError):
        return 0


def get_filesize(entry, url):
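    """Returns the declared file size of the enclosure matching url

    Returns None if no matching enclosure exists or its length is not
    a valid integer."""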
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure and enclosure['href'] == url:
            if 'length' in enclosure:
                try:
                    return int(enclosure['length'])
                except ValueError:
                    return None

            return None
    return None


def get_feed_tags(feed):
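    """Returns the (de-duplicated) tags of a parsed feed"""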
    tags = []

    for tag in feed.get('tags', []):
        if tag['term']:
            tags.extend([t for t in tag['term'].split(',') if t])

        if tag['label']:
            tags.append(tag['label'])

    return list(set(tags))


def get_episode_metadata(entry, url, mimetype, podcast_language):
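    """Builds the metadata dict for an episode from its feedparser entry"""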
    d = {
        'url': url,
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'filesize': get_filesize(entry, url),
        'language': entry.get('language', podcast_language),
        'mimetypes': [mimetype],
    }

    try:
        d['released'] = datetime(*(entry.updated_parsed)[:6])
    except (AttributeError, TypeError, ValueError):
        d['released'] = None

    # set outdated to True if we didn't find a title (so that the
    # feed-downloader doesn't retry this entry indefinitely)
    d['outdated'] = not d['title']

    return d


def get_podcast_metadata(podcast, feed):
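    """Builds the metadata dict for a podcast from its parsed feed"""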
    episodes = list(podcast.get_episodes())

    return dict(
        title = feed.feed.get('title', podcast.url),
        link = feed.feed.get('link', podcast.url),
        description = feed.feed.get('subtitle', podcast.description),
        author = feed.feed.get('author',
            feed.feed.get('itunes_author', podcast.author)),
        language = feed.feed.get('language', podcast.language),
        logo_url = get_podcast_logo(podcast, feed),
        content_types = get_podcast_types(episodes),
        latest_episode_timestamp = get_latest_episode_timestamp(episodes),
    )


def get_latest_episode_timestamp(episodes):
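    """Returns the release timestamp of the most recent episode

    Timestamps that lie unreasonably far in the future are clamped to
    the current time."""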
    timestamps = filter(None, [e.released for e in episodes])

    if not timestamps:
        return None

    max_timestamp = max(timestamps)

    # release dates more than two days in the future are probably
    # broken; fall back to the current time for those
    max_future = datetime.utcnow() + timedelta(days=2)

    if max_timestamp > max_future:
        return datetime.utcnow()

    return max_timestamp


def update_podcasts(fetch_queue):
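    """Fetches and updates all podcasts in fetch_queue

    Feeds that moved are re-queued under their new URL (unless another
    podcast already uses it); unreachable or invalid feeds have their
    episodes marked as outdated."""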
    fetch_queue = iter(fetch_queue)
    n = -1

    while True:
        # fetch explicitly from the (re-bindable) queue so that podcasts
        # re-queued via chain() below are picked up again; a plain for
        # loop would keep iterating the original iterator
        try:
            podcast = fetch_queue.next()
        except StopIteration:
            break

        n += 1
        print '(%d) %s' % (n, podcast.url)

        try:
            timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(60)
            try:
                fetcher.fetch(podcast.url)
            finally:
                # restore the previous timeout even if fetch() raises
                socket.setdefaulttimeout(timeout)

        except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin,
                feedcore.AuthenticationRequired, socket.error, IOError):
            print 'marking outdated'
            mark_outdated(podcast)

        except feedcore.NewLocation, location:
            print 'redirecting to', location.data
            new_url = sanitize_url(location.data)
            if new_url:
                p = Podcast.for_url(new_url)
                if not p:
                    podcast.urls.insert(0, new_url)
                    fetch_queue = chain([podcast], fetch_queue)
                else:
                    print 'podcast with new URL found, outdating old one'
                    podcast.new_location = new_url
                    podcast.save()
                    mark_outdated(podcast)

        except feedcore.UpdatedFeed, updated:
            feed = updated.data

            existing_episodes = list(podcast.get_episodes())
            update_ep = partial(update_episode, podcast=podcast)
            feed_episodes = filter(None, map(update_ep, feed.entries))
            outdated_episodes = set(existing_episodes) - set(feed_episodes)

            # set episodes to be outdated, where necessary
            for e in filter(lambda e: not e.outdated, outdated_episodes):
                e.outdated = True
                e.save()

            podcast_md = get_podcast_metadata(podcast, feed)

            changed = False
            for key, value in podcast_md.items():
                if getattr(podcast, key) != value:
                    setattr(podcast, key, value)
                    changed = True

            tags = get_feed_tags(feed.feed)
            if podcast.tags.get('feed', None) != tags:
                podcast.tags['feed'] = tags
                changed = True

            if changed:
                print 'updating podcast'
                podcast.last_update = datetime.utcnow()
                podcast.save()
            else:
                print 'podcast not updated'

        except Exception, e:
            print podcast.url
            print >> sys.stderr, 'Exception:', e

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)


def get_podcast_logo(podcast, feed):
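    """Determines, downloads and caches the podcast logo

    Returns the logo URL, or None if no logo could be saved."""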
    cover_art = podcast.logo_url
    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)
            if cover_art:
                break

    if podcast.link:
        yturl = youtube.get_real_cover(podcast.link)
        if yturl:
            cover_art = yturl

    if cover_art:
        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            print 'LOGO @', cover_art

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                print 'Removing %d thumbnails' % len(thumbnails)
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except Exception, e:
            if str(e).strip():
                try:
                    print >> sys.stderr, \
                        unicode('cannot save image for podcast %s: %s'
                                % (podcast.get_id(), str(e)), errors='ignore')
                except:
                    print >> sys.stderr, 'cannot save podcast logo'

    return None


def update_episode(entry, podcast):
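    """Creates or updates the episode described by a feedparser entry

    Returns the episode, or None if the entry carries no usable URL."""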
    url, mimetype = get_episode_url(entry)

    if url is None:
        print 'Ignoring entry'
        return

    url = sanitize_url(url, 'episode')
    if not url:
        print 'Ignoring entry'
        return

    episode = Episode.for_podcast_id_url(podcast.get_id(),
            url, create=True)
    md = get_episode_metadata(entry, url, mimetype,
            podcast.language)

    changed = False
    for key, value in md.items():
        if getattr(episode, key) != value:
            setattr(episode, key, value)
            changed = True

    if changed:
        episode.save()
        print 'Updating Episode: %s' % episode.title.encode('utf-8', 'ignore')

    return episode
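

# Minimal usage sketch (an assumption, not part of the original module):
# feed one or more podcast URLs from the command line through
# update_podcasts(). Podcast.for_url() and sanitize_url() are the same
# helpers used above; whether the crawler is actually invoked this way
# is not confirmed by this file.
if __name__ == '__main__':
    urls = filter(None, map(sanitize_url, sys.argv[1:]))
    queue = filter(None, (Podcast.for_url(url) for url in urls))
    update_podcasts(queue)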