Work on feed discovery dialog.
[straw/fork.git] / straw / feedfinder.py
blob 98f771624ef14b79b52fe774c9ad1b0275b3a0ed
1 """feedfinder: Find the Web feed for a Web page
2 http://www.aaronsw.com/2002/feedfinder/
4 Usage:
5 feed(uri) - returns feed found for a URI
6 feeds(uri) - returns all feeds found for a URI
8 >>> import feedfinder
9 >>> feedfinder.feed('scripting.com')
10 'http://scripting.com/rss.xml'
11 >>>
12 >>> feedfinder.feeds('scripting.com')
13 ['http://delong.typepad.com/sdj/atom.xml',
14 'http://delong.typepad.com/sdj/index.rdf',
15 'http://delong.typepad.com/sdj/rss.xml']
16 >>>
18 Can also use from the command line. Feeds are returned one per line:
20 $ python feedfinder.py diveintomark.org
21 http://diveintomark.org/xml/atom.xml
23 How it works:
24 0. At every step, feeds are minimally verified to make sure they are really feeds.
25 1. If the URI points to a feed, it is simply returned; otherwise
26 the page is downloaded and the real fun begins.
27 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
28 3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
29 ".atom"
30 4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
31 5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
32 ".atom"
33 6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
34 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
35 8. As a last ditch effort, we search Syndic8 for feeds matching the URI
36 """
__version__ = "1.371"
__date__ = "2006-04-24"
__maintainer__ = "Aaron Swartz (me@aaronsw.com)"
__author__ = "Mark Pilgrim (http://diveintomark.org)"
__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
__license__ = "Python"
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond, Brian Lalor for bug reporting and patches"""
_debug = 1

import sgmllib, urllib, urlparse, re, sys, robotparser
import threading
class TimeoutError(Exception): pass

def timelimit(timeout):
    """borrowed from web.py"""
    def _1(function):
        def _2(*args, **kw):
            class Dispatch(threading.Thread):
                def __init__(self):
                    threading.Thread.__init__(self)
                    self.result = None
                    self.error = None

                    self.setDaemon(True)
                    self.start()

                def run(self):
                    try:
                        self.result = function(*args, **kw)
                    except:
                        self.error = sys.exc_info()

            c = Dispatch()
            c.join(timeout)
            if c.isAlive():
                raise TimeoutError, 'took too long'
            if c.error:
                raise c.error[0], c.error[1]
            return c.result
        return _2
    return _1
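# Illustrative example of applying the decorator above (not used elsewhere in
# this file): the wrapped call runs in a daemon thread, and TimeoutError is
# raised if it has not returned within `timeout` seconds.
#
#     @timelimit(5)
#     def slow_fetch(url):
#         return urllib.urlopen(url).read()
#
#     try:
#         data = slow_fetch('http://example.org/')
#     except TimeoutError:
#         data = ''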
# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# Python 2.3 now comes with this module by default, otherwise you can download it
try:
    import xmlrpclib # http://www.pythonware.com/products/xmlrpc/
except ImportError:
    xmlrpclib = None

if not dict:
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc
def _debuglog(message):
    if _debug: print message
class URLGatekeeper:
    """a class to track robots.txt rules across multiple servers"""
    def __init__(self):
        self.rpcache = {} # a dictionary of RobotFileParser objects, by domain
        self.urlopener = urllib.FancyURLopener()
        self.urlopener.version = "feedfinder/" + __version__ + " " + self.urlopener.version + " +http://www.aaronsw.com/2002/feedfinder/"
        _debuglog(self.urlopener.version)
        self.urlopener.addheaders = [('User-agent', self.urlopener.version)]
        robotparser.URLopener.version = self.urlopener.version
        robotparser.URLopener.addheaders = self.urlopener.addheaders

    def _getrp(self, url):
        protocol, domain = urlparse.urlparse(url)[:2]
        if self.rpcache.has_key(domain):
            return self.rpcache[domain]
        baseurl = '%s://%s' % (protocol, domain)
        robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
        _debuglog('fetching %s' % robotsurl)
        rp = robotparser.RobotFileParser(robotsurl)
        try:
            rp.read()
        except:
            pass
        self.rpcache[domain] = rp
        return rp

    def can_fetch(self, url):
        rp = self._getrp(url)
        allow = rp.can_fetch(self.urlopener.version, url)
        _debuglog("gatekeeper of %s says %s" % (url, allow))
        return allow

    @timelimit(50)
    def get(self, url, check=True):
        if check and not self.can_fetch(url): return ''
        try:
            content = ""

            if url.startswith("file://"):
                f = open(url[7:], "r")
                content = f.read()
                f.close()
            else:
                import httplib2, os
                import Config
                CACHE_DIR = os.path.join(Config.straw_home(), 'cache')
                h = httplib2.Http(CACHE_DIR)
                resp, content = h.request(url, "GET")

            return content
            #return self.urlopener.open(url).read()
        except Exception, e:
            _debuglog(e)
            return ''

_gatekeeper = URLGatekeeper()
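# Illustrative use of the shared gatekeeper above (hypothetical URL; note that
# get() already consults can_fetch() itself unless check=False is passed):
#
#     if _gatekeeper.can_fetch('http://example.org/index.rdf'):
#         content = _gatekeeper.get('http://example.org/index.rdf')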
class BaseParser(sgmllib.SGMLParser):
    def __init__(self, baseuri):
        sgmllib.SGMLParser.__init__(self)
        self.links = []
        self.baseuri = baseuri

    def normalize_attrs(self, attrs):
        def cleanattr(v):
            v = sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v)
            v = v.strip()
            v = v.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"').replace('&amp;', '&')
            return v
        attrs = [(k.lower(), cleanattr(v)) for k, v in attrs]
        attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs]
        return attrs

    def do_base(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.baseuri = attrsD['href']

    def error(self, text):
        pass
        #_debuglog(text)
class LinkParser(BaseParser):
    FEED_TYPES = ('application/rss+xml',
                  'text/xml',
                  'application/atom+xml',
                  'application/x.atom+xml',
                  'application/x-atom+xml')

    def do_link(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('rel'): return
        rels = attrsD['rel'].split()
        if 'alternate' not in rels: return
        if attrsD.get('type') not in self.FEED_TYPES: return
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

class ALinkParser(BaseParser):
    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
def fix_quirks(content):
    p = re.compile('<!DOCTYPE(.*?)\/>', re.M | re.S)
    return p.sub('', content)

def makeFullURI(uri):
    uri = uri.strip()
    if uri.startswith('feed://'):
        uri = 'http://' + uri.split('feed://', 1).pop()
    for x in ['http', 'https', 'file']:
        if uri.startswith('%s://' % x):
            return uri
    return 'http://%s' % uri
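# For example (illustrative inputs):
#
#     makeFullURI('scripting.com')              -> 'http://scripting.com'
#     makeFullURI('feed://example.org/rss.xml') -> 'http://example.org/rss.xml'
#     makeFullURI('https://example.org/')       -> 'https://example.org/'  (unchanged)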
def getLinks(data, baseuri):
    p = LinkParser(baseuri)
    p.feed(data)
    return p.links

def getALinks(data, baseuri):
    p = ALinkParser(baseuri)
    p.feed(data)
    return p.links

def getLocalLinks(links, baseuri):
    baseuri = baseuri.lower()
    return [l for l in links if l.lower().startswith(baseuri)]
def isFeedLink(link):
    # ".atom" is five characters, so compare full suffixes rather than link[-4:]
    link = link.lower()
    return link.endswith("feed/") or link.endswith(('.rss', '.rdf', '.xml', '.atom'))
def isXMLRelatedLink(link):
    link = link.lower()
    return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')

r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)
def tryBrokenRedirect(data):
    if '<newLocation' in data:
        newuris = r_brokenRedirect.findall(data)
        if newuris: return newuris[0].strip()

def couldBeFeedData(data):
    data = data.lower()
    if data.count('<html'): return 0
    return data.count('<rss') + data.count('<rdf') + data.count('<feed')
def isFeed(uri):
    _debuglog('seeing if %s is a feed' % uri)
    protocol = urlparse.urlparse(uri)
    if protocol[0] not in ('http', 'https'):
        # keep the same (score, uri, data) shape as the normal path so that
        # process() can filter this entry out without raising
        return (0, uri, '')
    data = _gatekeeper.get(uri)
    _debuglog("isFeed -- %s" % uri)
    return (couldBeFeedData(data), uri, data)
def sortFeeds(feed1Info, feed2Info):
    return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank'])

def getFeedsFromSyndic8(uri):
    feeds = []
    try:
        server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
        feedids = server.syndic8.FindFeeds(uri)
        infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
        infolist.sort(sortFeeds)
        feeds = [f['dataurl'] for f in infolist if f['status']=='Syndicated']
        _debuglog('found %s feeds through Syndic8' % len(feeds))
    except:
        pass
    return feeds
def feeds(uri, all=False, querySyndic8=False, _recurs=None):
    if _recurs is None: _recurs = [uri]

    fulluri = makeFullURI(uri)

    try:
        data = _gatekeeper.get(fulluri, check=False)
    except:
        return []

    # is this already a feed?
    if couldBeFeedData(data):
        return [(fulluri, data)]

    newuri = tryBrokenRedirect(data)

    if newuri and newuri not in _recurs:
        _recurs.append(newuri)
        return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)

    # nope, it's a page, try LINK tags first
    _debuglog('looking for LINK tags')
    data = fix_quirks(data)

    links = set()

    try:
        linktag_links = set(getLinks(data, fulluri))
    except Exception, e:
        linktag_links = set()
        _debuglog("Exception in getLinks: %s" % e)

    _debuglog('found %s feeds through LINK tags' % len(linktag_links))
    #outfeeds = process(outfeeds)

    links = links.union(getALinks(data, fulluri))
    links = links.union(getLocalLinks(links, fulluri))

    suffixes = [   # filenames used by popular software:
        'atom.xml',   # blogger, TypePad
        'index.atom', # MT, apparently
        'index.rdf',  # MT
        'rss.xml',    # Dave Winer/Manila
        'index.xml',  # MT
        'index.rss'   # Slash
    ]
    #links = links.union([urlparse.urljoin(fulluri, x) for x in suffixes])

    links -= linktag_links

    return process(linktag_links | set([url for url in links if isFeedLink(url) or isXMLRelatedLink(url)]))

    # Note: the code below is unreachable (it follows the return above) and
    # refers to `outfeeds`, which this version never defines.
    #if all or not outfeeds:
    #    _debuglog('no A tags, guessing')
    #    #link_list = list(set( + link_list))
    #    #outfeeds.extend(process(link_list))
    #if (all or not outfeeds) and querySyndic8:
    #    # still no luck, search Syndic8 for feeds (requires xmlrpclib)
    #    _debuglog('still no luck, searching Syndic8')
    #    #outfeeds.extend(getFeedsFromSyndic8(uri))
    ##if hasattr(__builtins__, 'set') or __builtins__.has_key('set'):
    ##    outfeeds = list(set(outfeeds))
    #return outfeeds
getFeeds = feeds # backwards-compatibility

def process(urls):
    # keep only the candidates whose content looks like a feed, as (uri, data) pairs
    #print "PROCESSING %s" % str(urls)
    return [(a[1], a[2]) for a in [isFeed(_url) for _url in urls] if a[0] > 0]

def feed(uri):
    #todo: give preference to certain feed formats
    feedlist = feeds(uri)
    if feedlist:
        return feedlist[0]
    else:
        return None
##### test harness ######

def test():
    uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
    failed = []
    count = 0
    while 1:
        data = _gatekeeper.get(uri)
        if data.find('Atom autodiscovery test') == -1: break
        sys.stdout.write('.')
        sys.stdout.flush()
        count += 1
        links = getLinks(data, uri)
        if not links:
            print '\n*** FAILED ***', uri, 'could not find link'
            failed.append(uri)
        elif len(links) > 1:
            print '\n*** FAILED ***', uri, 'found too many links'
            failed.append(uri)
        else:
            atomdata = urllib.urlopen(links[0]).read()
            if atomdata.find('<link rel="alternate"') == -1:
                print '\n*** FAILED ***', uri, 'retrieved something that is not a feed'
                failed.append(uri)
            else:
                backlink = atomdata.split('href="').pop().split('"')[0]
                if backlink != uri:
                    print '\n*** FAILED ***', uri, 'retrieved wrong feed'
                    failed.append(uri)
        if data.find('<link rel="next" href="') == -1: break
        uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
    print
    print count, 'tests executed,', len(failed), 'failed'
if __name__ == '__main__':
    args = sys.argv[1:]
    if args and args[0] == '--debug':
        _debug = 1
        args.pop(0)
    if args:
        uri = args[0]
    else:
        uri = 'http://diveintomark.org/'
    if uri == 'test':
        test()
    else:
        # getFeeds() returns (uri, content) pairs in this fork; print one URI per line
        print "\n".join([found_uri for found_uri, content in getFeeds(uri)])