Work on feed discovery dialog.
[straw/fork.git] / straw / feedfinder.py
blob 98f771624ef14b79b52fe774c9ad1b0275b3a0ed
1 """feedfinder: Find the Web feed for a Web page
2 http://www.aaronsw.com/2002/feedfinder/
4 Usage:
5 feed(uri) - returns feed found for a URI
6 feeds(uri) - returns all feeds found for a URI
8 >>> import feedfinder
9 >>> feedfinder.feed('scripting.com')
10 'http://scripting.com/rss.xml'
11 >>>
12 >>> feedfinder.feeds('scripting.com')
13 ['http://delong.typepad.com/sdj/atom.xml',
14 'http://delong.typepad.com/sdj/index.rdf',
15 'http://delong.typepad.com/sdj/rss.xml']
16 >>>
18 Can also use from the command line. Feeds are returned one per line:
20 $ python feedfinder.py diveintomark.org
21 http://diveintomark.org/xml/atom.xml
23 How it works:
24 0. At every step, feeds are minimally verified to make sure they are really feeds.
25 1. If the URI points to a feed, it is simply returned; otherwise
26 the page is downloaded and the real fun begins.
27 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
28 3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
29 ".atom"
30 4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
31 5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
32 ".atom"
33 6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
34 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
35 8. As a last ditch effort, we search Syndic8 for feeds matching the URI
36 """
__version__ = "1.371"
__date__ = "2006-04-24"
__maintainer__ = "Aaron Swartz (me@aaronsw.com)"
__author__ = "Mark Pilgrim (http://diveintomark.org)"
__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
__license__ = "Python"
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond, Brian Lalor for bug reporting and patches"""
_debug = 1

import sgmllib, urllib, urlparse, re, sys, robotparser
import threading
class TimeoutError(Exception): pass

def timelimit(timeout):
    """borrowed from web.py"""
    def _1(function):
        def _2(*args, **kw):
            class Dispatch(threading.Thread):
                def __init__(self):
                    threading.Thread.__init__(self)
                    self.result = None
                    self.error = None

                    self.setDaemon(True)
                    self.start()

                def run(self):
                    try:
                        self.result = function(*args, **kw)
                    except:
                        self.error = sys.exc_info()

            c = Dispatch()
            c.join(timeout)
            if c.isAlive():
                raise TimeoutError, 'took too long'
            if c.error:
                raise c.error[0], c.error[1]
            return c.result
        return _2
    return _1
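# Illustrative example of applying the decorator above (not used elsewhere in
# this file): the wrapped call runs in a daemon thread, and TimeoutError is
# raised if it has not returned within `timeout` seconds.
#
#     @timelimit(5)
#     def slow_fetch(url):
#         return urllib.urlopen(url).read()
#
#     try:
#         data = slow_fetch('http://example.org/')
#     except TimeoutError:
#         data = ''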
# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# Python 2.3 now comes with this module by default, otherwise you can download it
try:
    import xmlrpclib # http://www.pythonware.com/products/xmlrpc/
except ImportError:
    xmlrpclib = None

if not dict:
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc
def _debuglog(message):
    if _debug: print message
class URLGatekeeper:
    """a class to track robots.txt rules across multiple servers"""
    def __init__(self):
        self.rpcache = {} # a dictionary of RobotFileParser objects, by domain
        self.urlopener = urllib.FancyURLopener()
        self.urlopener.version = "feedfinder/" + __version__ + " " + self.urlopener.version + " +http://www.aaronsw.com/2002/feedfinder/"
        _debuglog(self.urlopener.version)
        self.urlopener.addheaders = [('User-agent', self.urlopener.version)]
        robotparser.URLopener.version = self.urlopener.version
        robotparser.URLopener.addheaders = self.urlopener.addheaders

    def _getrp(self, url):
        protocol, domain = urlparse.urlparse(url)[:2]
        if self.rpcache.has_key(domain):
            return self.rpcache[domain]
        baseurl = '%s://%s' % (protocol, domain)
        robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
        _debuglog('fetching %s' % robotsurl)
        rp = robotparser.RobotFileParser(robotsurl)
        try:
            rp.read()
        except:
            pass
        self.rpcache[domain] = rp
        return rp

    def can_fetch(self, url):
        rp = self._getrp(url)
        allow = rp.can_fetch(self.urlopener.version, url)
        _debuglog("gatekeeper of %s says %s" % (url, allow))
        return allow

    @timelimit(50)
    def get(self, url, check=True):
        if check and not self.can_fetch(url): return ''
        try:
            content = ""

            if url.startswith("file://"):
                f = open(url[7:], "r")
                content = f.read()
                f.close()
            else:
                import httplib2, os
                import Config
                CACHE_DIR = os.path.join(Config.straw_home(), 'cache')
                h = httplib2.Http(CACHE_DIR)
                resp, content = h.request(url, "GET")

            return content
            #return self.urlopener.open(url).read()
        except Exception, e:
            _debuglog(e)
            return ''

_gatekeeper = URLGatekeeper()
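# Illustrative use of the shared gatekeeper above (hypothetical URL; note that
# get() already consults can_fetch() itself unless check=False is passed):
#
#     if _gatekeeper.can_fetch('http://example.org/index.rdf'):
#         content = _gatekeeper.get('http://example.org/index.rdf')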
class BaseParser(sgmllib.SGMLParser):
    def __init__(self, baseuri):
        sgmllib.SGMLParser.__init__(self)
        self.links = []
        self.baseuri = baseuri

    def normalize_attrs(self, attrs):
        def cleanattr(v):
            v = sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v)
            v = v.strip()
            v = v.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"').replace('&amp;', '&')
            return v
        attrs = [(k.lower(), cleanattr(v)) for k, v in attrs]
        attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs]
        return attrs

    def do_base(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.baseuri = attrsD['href']

    def error(self, text):
        pass
        #_debuglog(text)
class LinkParser(BaseParser):
    FEED_TYPES = ('application/rss+xml',
                  'text/xml',
                  'application/atom+xml',
                  'application/x.atom+xml',
                  'application/x-atom+xml')

    def do_link(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('rel'): return
        rels = attrsD['rel'].split()
        if 'alternate' not in rels: return
        if attrsD.get('type') not in self.FEED_TYPES: return
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

class ALinkParser(BaseParser):
    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
def fix_quirks(content):
    p = re.compile('<!DOCTYPE(.*?)\/>', re.M | re.S)
    return p.sub('', content)

def makeFullURI(uri):
    uri = uri.strip()
    if uri.startswith('feed://'):
        uri = 'http://' + uri.split('feed://', 1).pop()
    for x in ['http', 'https', 'file']:
        if uri.startswith('%s://' % x):
            return uri
    return 'http://%s' % uri
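# For example (illustrative inputs):
#
#     makeFullURI('scripting.com')              -> 'http://scripting.com'
#     makeFullURI('feed://example.org/rss.xml') -> 'http://example.org/rss.xml'
#     makeFullURI('https://example.org/')       -> 'https://example.org/'  (unchanged)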
def getLinks(data, baseuri):
    p = LinkParser(baseuri)
    p.feed(data)
    return p.links

def getALinks(data, baseuri):
    p = ALinkParser(baseuri)
    p.feed(data)
    return p.links

def getLocalLinks(links, baseuri):
    baseuri = baseuri.lower()
    return [l for l in links if l.lower().startswith(baseuri)]
def isFeedLink(link):
    # ".atom" is five characters, so compare full suffixes rather than link[-4:]
    link = link.lower()
    return link.endswith("feed/") or link.endswith(('.rss', '.rdf', '.xml', '.atom'))
def isXMLRelatedLink(link):
    link = link.lower()
    return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')

r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)
def tryBrokenRedirect(data):
    if '<newLocation' in data:
        newuris = r_brokenRedirect.findall(data)
        if newuris: return newuris[0].strip()

def couldBeFeedData(data):
    data = data.lower()
    if data.count('<html'): return 0
    return data.count('<rss') + data.count('<rdf') + data.count('<feed')
def isFeed(uri):
    _debuglog('seeing if %s is a feed' % uri)
    protocol = urlparse.urlparse(uri)
    if protocol[0] not in ('http', 'https'):
        # keep the same (score, uri, data) shape as the normal path so that
        # process() can filter this entry out without raising
        return (0, uri, '')
    data = _gatekeeper.get(uri)
    _debuglog("isFeed -- %s" % uri)
    return (couldBeFeedData(data), uri, data)
def sortFeeds(feed1Info, feed2Info):
    return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank'])

def getFeedsFromSyndic8(uri):
    feeds = []
    try:
        server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
        feedids = server.syndic8.FindFeeds(uri)
        infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
        infolist.sort(sortFeeds)
        feeds = [f['dataurl'] for f in infolist if f['status']=='Syndicated']
        _debuglog('found %s feeds through Syndic8' % len(feeds))
    except:
        pass
    return feeds
def feeds(uri, all=False, querySyndic8=False, _recurs=None):
    if _recurs is None: _recurs = [uri]

    fulluri = makeFullURI(uri)

    try:
        data = _gatekeeper.get(fulluri, check=False)
    except:
        return []

    # is this already a feed?
    if couldBeFeedData(data):
        return [(fulluri, data)]

    newuri = tryBrokenRedirect(data)

    if newuri and newuri not in _recurs:
        _recurs.append(newuri)
        return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)

    # nope, it's a page, try LINK tags first
    _debuglog('looking for LINK tags')
    data = fix_quirks(data)

    links = set()

    try:
        linktag_links = set(getLinks(data, fulluri))
    except Exception, e:
        linktag_links = set()
        _debuglog("Exception in getLinks: %s" % e)

    _debuglog('found %s feeds through LINK tags' % len(linktag_links))
    #outfeeds = process(outfeeds)

    links = links.union(getALinks(data, fulluri))
    links = links.union(getLocalLinks(links, fulluri))

    suffixes = [   # filenames used by popular software:
        'atom.xml',   # blogger, TypePad
        'index.atom', # MT, apparently
        'index.rdf',  # MT
        'rss.xml',    # Dave Winer/Manila
        'index.xml',  # MT
        'index.rss'   # Slash
    ]
    #links = links.union([urlparse.urljoin(fulluri, x) for x in suffixes])

    links -= linktag_links

    return process(linktag_links | set([url for url in links if isFeedLink(url) or isXMLRelatedLink(url)]))

    # Note: the code below is unreachable (it follows the return above) and
    # refers to `outfeeds`, which this version never defines.
    #if all or not outfeeds:
    #    _debuglog('no A tags, guessing')
    #    #link_list = list(set( + link_list))
    #    #outfeeds.extend(process(link_list))
    #if (all or not outfeeds) and querySyndic8:
    #    # still no luck, search Syndic8 for feeds (requires xmlrpclib)
    #    _debuglog('still no luck, searching Syndic8')
    #    #outfeeds.extend(getFeedsFromSyndic8(uri))
    ##if hasattr(__builtins__, 'set') or __builtins__.has_key('set'):
    ##    outfeeds = list(set(outfeeds))
    #return outfeeds
getFeeds = feeds # backwards-compatibility

def process(urls):
    # keep only the candidates whose content looks like a feed, as (uri, data) pairs
    #print "PROCESSING %s" % str(urls)
    return [(a[1], a[2]) for a in [isFeed(_url) for _url in urls] if a[0] > 0]

def feed(uri):
    #todo: give preference to certain feed formats
    feedlist = feeds(uri)
    if feedlist:
        return feedlist[0]
    else:
        return None
##### test harness ######

def test():
    uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
    failed = []
    count = 0
    while 1:
        data = _gatekeeper.get(uri)
        if data.find('Atom autodiscovery test') == -1: break
        sys.stdout.write('.')
        sys.stdout.flush()
        count += 1
        links = getLinks(data, uri)
        if not links:
            print '\n*** FAILED ***', uri, 'could not find link'
            failed.append(uri)
        elif len(links) > 1:
            print '\n*** FAILED ***', uri, 'found too many links'
            failed.append(uri)
        else:
            atomdata = urllib.urlopen(links[0]).read()
            if atomdata.find('<link rel="alternate"') == -1:
                print '\n*** FAILED ***', uri, 'retrieved something that is not a feed'
                failed.append(uri)
            else:
                backlink = atomdata.split('href="').pop().split('"')[0]
                if backlink != uri:
                    print '\n*** FAILED ***', uri, 'retrieved wrong feed'
                    failed.append(uri)
        if data.find('<link rel="next" href="') == -1: break
        uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
    print
    print count, 'tests executed,', len(failed), 'failed'
if __name__ == '__main__':
    args = sys.argv[1:]
    if args and args[0] == '--debug':
        _debug = 1
        args.pop(0)
    if args:
        uri = args[0]
    else:
        uri = 'http://diveintomark.org/'
    if uri == 'test':
        test()
    else:
        # getFeeds() returns (uri, content) pairs in this fork; print one URI per line
        print "\n".join([found_uri for found_uri, content in getFeeds(uri)])