Implemented "mark all as read".
# straw/feedfinder.py
1 """feedfinder: Find the Web feed for a Web page
2 http://www.aaronsw.com/2002/feedfinder/
4 Usage:
5 feed(uri) - returns feed found for a URI
6 feeds(uri) - returns all feeds found for a URI
8 >>> import feedfinder
9 >>> feedfinder.feed('scripting.com')
10 'http://scripting.com/rss.xml'
11 >>>
12 >>> feedfinder.feeds('scripting.com')
13 ['http://delong.typepad.com/sdj/atom.xml',
14 'http://delong.typepad.com/sdj/index.rdf',
15 'http://delong.typepad.com/sdj/rss.xml']
16 >>>
18 Can also use from the command line. Feeds are returned one per line:
20 $ python feedfinder.py diveintomark.org
21 http://diveintomark.org/xml/atom.xml
23 How it works:
24 0. At every step, feeds are minimally verified to make sure they are really feeds.
25 1. If the URI points to a feed, it is simply returned; otherwise
26 the page is downloaded and the real fun begins.
27 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
28 3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
29 ".atom"
30 4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
31 5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
32 ".atom"
33 6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
34 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
35 8. As a last ditch effort, we search Syndic8 for feeds matching the URI
36 """
38 __version__ = "1.371"
39 __date__ = "2006-04-24"
40 __maintainer__ = "Aaron Swartz (me@aaronsw.com)"
41 __author__ = "Mark Pilgrim (http://diveintomark.org)"
42 __copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
43 __license__ = "Python"
44 __credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
45 Also Jason Diamond, Brian Lalor for bug reporting and patches"""
import error
import sgmllib, urlparse, re, sys, robotparser, urllib  # urllib is used by test()
import threading

log = error.get_logger()
class TimeoutError(Exception): pass

def timelimit(timeout):
    """borrowed from web.py"""
    def _1(function):
        def _2(*args, **kw):
            class Dispatch(threading.Thread):
                def __init__(self):
                    threading.Thread.__init__(self)
                    self.result = None
                    self.error = None

                    self.setDaemon(True)
                    self.start()

                def run(self):
                    try:
                        self.result = function(*args, **kw)
                    except:
                        self.error = sys.exc_info()

            c = Dispatch()
            c.join(timeout)
            if c.isAlive():
                raise TimeoutError, 'took too long'
            if c.error:
                raise c.error[0], c.error[1]
            return c.result
        return _2
    return _1
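
# Usage sketch (added for illustration; slow_fetch is a hypothetical example):
# the decorator runs the wrapped call in a daemon thread and raises TimeoutError
# if it has not finished within `timeout` seconds, e.g.
#
#   @timelimit(5)
#   def slow_fetch(url):
#       return urllib.urlopen(url).read()
#
# URLGatekeeper.get() below is wrapped the same way with @timelimit(50).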
# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# Python 2.3 now comes with this module by default, otherwise you can download it
try:
    import xmlrpclib # http://www.pythonware.com/products/xmlrpc/
except ImportError:
    xmlrpclib = None

if not dict:
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

def _debuglog(message):
    log.debug(message)
class URLGatekeeper:
    """a class to track robots.txt rules across multiple servers"""
    def __init__(self):
        self.rpcache = {} # a dictionary of RobotFileParser objects, by domain
        self.version = "feedfinder/" + __version__ + " +http://www.aaronsw.com/2002/feedfinder/"
        self.addheaders = [('User-agent', self.version)]
        robotparser.URLopener.version = self.version
        robotparser.URLopener.addheaders = self.addheaders

    def _getrp(self, url):
        protocol, domain = urlparse.urlparse(url)[:2]
        if self.rpcache.has_key(domain):
            return self.rpcache[domain]
        baseurl = '%s://%s' % (protocol, domain)
        robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
        _debuglog('fetching %s' % robotsurl)
        rp = robotparser.RobotFileParser(robotsurl)
        try:
            rp.read()
        except:
            _debuglog('failed to fetch %s' % robotsurl)
        self.rpcache[domain] = rp
        return rp

    def can_fetch(self, url):
        rp = self._getrp(url)
        allow = rp.can_fetch(self.version, url)
        _debuglog("gatekeeper of %s says %s" % (url, allow))
        return allow

    @timelimit(50)
    def get(self, url, check=True):
        if check and not self.can_fetch(url): return ''
        try:
            content = ""
            if url.startswith("file://"):
                f = open(url[7:], "r")
                content = f.read()
                f.close()
            else:
                import httplib2, os
                import Config
                CACHE_DIR = os.path.join(Config.straw_home(), 'cache')
                h = httplib2.Http(CACHE_DIR)
                resp, content = h.request(url, "GET")
            return content
        except Exception, e:
            _debuglog(e)
            return ''

_gatekeeper = URLGatekeeper()
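
# Illustrative usage of the module-level gatekeeper (added comment; the URL is a
# placeholder, not taken from straw):
#
#   if _gatekeeper.can_fetch('http://example.com/index.xml'):
#       page = _gatekeeper.get('http://example.com/index.xml')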
class BaseParser(sgmllib.SGMLParser):
    def __init__(self, baseuri):
        sgmllib.SGMLParser.__init__(self)
        self.links = []
        self.baseuri = baseuri

    def normalize_attrs(self, attrs):
        def cleanattr(v):
            v = sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v)
            v = v.strip()
            v = v.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"').replace('&amp;', '&')
            return v
        attrs = [(k.lower(), cleanattr(v)) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def do_base(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.baseuri = attrsD['href']

class LinkParser(BaseParser):
    FEED_TYPES = ('application/rss+xml',
                  'text/xml',
                  'application/atom+xml',
                  'application/x.atom+xml',
                  'application/x-atom+xml')

    def do_link(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('rel'): return
        rels = attrsD['rel'].split()
        if 'alternate' not in rels: return
        if attrsD.get('type') not in self.FEED_TYPES: return
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

class ALinkParser(BaseParser):
    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

def makeFullURI(uri):
    uri = uri.strip()
    if uri.startswith('feed://'):
        uri = 'http://' + uri.split('feed://', 1).pop()
    for x in ['http', 'https', 'file']:
        if uri.startswith('%s://' % x):
            return uri
    return 'http://%s' % uri
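
# Expected behaviour of makeFullURI, shown as an informal doctest (added for
# illustration):
#
#   >>> makeFullURI('feed://scripting.com/rss.xml')
#   'http://scripting.com/rss.xml'
#   >>> makeFullURI('scripting.com')
#   'http://scripting.com'
#   >>> makeFullURI('https://example.com/atom.xml')
#   'https://example.com/atom.xml'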
def getLinks(data, baseuri):
    p = LinkParser(baseuri)
    try:
        p.feed(data)
    except sgmllib.SGMLParseError, e:
        _debuglog("got SGMLParseError: %s" % str(e))
    return p.links

def getALinks(data, baseuri):
    p = ALinkParser(baseuri)
    try:
        p.feed(data)
    except sgmllib.SGMLParseError, e:
        _debuglog("got SGMLParseError: %s" % str(e))
    return p.links

def getLocalLinks(links, baseuri):
    baseuri = baseuri.lower()
    return [l for l in links if l.lower().startswith(baseuri)]
def isFeedLink(link):
    # note: '.atom' is five characters long, so it needs its own check
    link = link.lower()
    return link.endswith("feed/") or link[-4:] in ('.rss', '.rdf', '.xml') or link[-5:] == '.atom'

def isXMLRelatedLink(link):
    link = link.lower()
    return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')
r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)

def tryBrokenRedirect(data):
    if '<newLocation' in data:
        newuris = r_brokenRedirect.findall(data)
        if newuris: return newuris[0].strip()

def couldBeFeedData(data):
    if not data:
        return False

    data = data.lower()
    if data.count('<html'): return 0
    return data.count('<rss') + data.count('<rdf') + data.count('<feed')
def isFeed(uri):
    _debuglog('seeing if %s is a feed' % uri)
    protocol = urlparse.urlparse(uri)
    if protocol[0] not in ('http', 'https'): return 0
    data = _gatekeeper.get(uri)
    _debuglog("isFeed -- %s" % uri)
    return (couldBeFeedData(data), uri, data)
def sortFeeds(feed1Info, feed2Info):
    return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank'])

def getFeedsFromSyndic8(uri):
    feeds = []
    try:
        server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
        feedids = server.syndic8.FindFeeds(uri)
        infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank', 'status', 'dataurl'])
        infolist.sort(sortFeeds)
        feeds = [f['dataurl'] for f in infolist if f['status'] == 'Syndicated']
        _debuglog('found %s feeds through Syndic8' % len(feeds))
    except:
        pass
    return feeds
def urls(uri, uri_data):
    #if _recurs is None: _recurs = [uri]
    """fulluri = makeFullURI(uri)

    try:
        data = _gatekeeper.get(fulluri, check=False)
    except:
        return []"""

    # is this already a feed?
    if couldBeFeedData(uri_data):
        result = set()
        result.add((uri, uri_data))
        return result

    """newuri = tryBrokenRedirect(data)

    if newuri and newuri not in _recurs:
        _recurs.append(newuri)
        return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)"""

    # nope, it's a page, try LINK tags first
    _debuglog('looking for LINK tags')

    links = set()

    try:
        linktag_links = set(getLinks(uri_data, uri))
    except Exception, e:
        linktag_links = set()
        _debuglog("Exception in getLinks: %s" % e)

    _debuglog('found %s feeds through LINK tags' % len(linktag_links))
    #outfeeds = process(outfeeds)

    links = links.union(getALinks(uri_data, uri))
    links = links.union(getLocalLinks(links, uri))

    suffixes = [ # filenames used by popular software:
        'atom.xml',   # blogger, TypePad
        'index.atom', # MT, apparently
        'index.rdf',  # MT
        'rss.xml',    # Dave Winer/Manila
        'index.xml',  # MT
        'index.rss'   # Slash
    ]
    #links = links.union([urlparse.urljoin(fulluri, x) for x in suffixes])

    links -= linktag_links

    return linktag_links | set([url for url in links if isFeedLink(url) or isXMLRelatedLink(url)])
def process(urls):
    #print "PROCESSING %s" % str(urls)
    return [(a[1], a[2]) for a in [isFeed(_url) for _url in urls] if a[0] > 0]
##### test harness ######

def test():
    uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
    failed = []
    count = 0
    while 1:
        data = _gatekeeper.get(uri)
        if data.find('Atom autodiscovery test') == -1: break
        sys.stdout.write('.')
        sys.stdout.flush()
        count += 1
        links = getLinks(data, uri)
        if not links:
            print '\n*** FAILED ***', uri, 'could not find link'
            failed.append(uri)
        elif len(links) > 1:
            print '\n*** FAILED ***', uri, 'found too many links'
            failed.append(uri)
        else:
            atomdata = urllib.urlopen(links[0]).read()
            if atomdata.find('<link rel="alternate"') == -1:
                print '\n*** FAILED ***', uri, 'retrieved something that is not a feed'
                failed.append(uri)
            else:
                backlink = atomdata.split('href="').pop().split('"')[0]
                if backlink != uri:
                    print '\n*** FAILED ***', uri, 'retrieved wrong feed'
                    failed.append(uri)
        if data.find('<link rel="next" href="') == -1: break
        uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
    print
    print count, 'tests executed,', len(failed), 'failed'
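
# Minimal command-line sketch, added for illustration only. The module docstring
# describes running "python feedfinder.py <uri>", but this straw variant exposes
# urls()/process() rather than feed()/feeds(), so the glue below is an assumed
# entry point, not straw's own.
if __name__ == '__main__':
    if len(sys.argv) > 1 and sys.argv[1] == 'test':
        test()
    elif len(sys.argv) > 1:
        target = makeFullURI(sys.argv[1])
        page = _gatekeeper.get(target, check=False)
        if couldBeFeedData(page):
            # the URI itself already points to a feed
            print target
        else:
            # otherwise print every candidate that verifies as a feed, one per line
            for feed_uri, feed_data in process(urls(target, page)):
                print feed_uri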