1 """feedfinder: Find the Web feed for a Web page
2 http://www.aaronsw.com/2002/feedfinder/
5 feed(uri) - returns feed found for a URI
6 feeds(uri) - returns all feeds found for a URI
9 >>> feedfinder.feed('scripting.com')
10 'http://scripting.com/rss.xml'
12 >>> feedfinder.feeds('scripting.com')
13 ['http://delong.typepad.com/sdj/atom.xml',
14 'http://delong.typepad.com/sdj/index.rdf',
15 'http://delong.typepad.com/sdj/rss.xml']
18 Can also use from the command line. Feeds are returned one per line:
20 $ python feedfinder.py diveintomark.org
21 http://diveintomark.org/xml/atom.xml
24 0. At every step, feeds are minimally verified to make sure they are really feeds.
25 1. If the URI points to a feed, it is simply returned; otherwise
26 the page is downloaded and the real fun begins.
27 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
28 3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
30 4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
31 5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
33 6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
34 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
35 8. As a last ditch effort, we search Syndic8 for feeds matching the URI
# Module metadata (the __version__ assignment on original line 38 is not in
# this view; it is referenced by URLGatekeeper below).
__date__ = "2006-04-24"
__maintainer__ = "Aaron Swartz (me@aaronsw.com)"
__author__ = "Mark Pilgrim (http://diveintomark.org)"
__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
__license__ = "Python"
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond, Brian Lalor for bug reporting and patches"""
49 import sgmllib
, urllib
, urlparse
, re
, sys
, robotparser
class TimeoutError(Exception):
    """Raised by timelimit() when a wrapped call exceeds its time budget."""
    pass
# timelimit(timeout): decorator factory (borrowed from web.py) that runs the
# wrapped callable on a Dispatch worker thread and raises TimeoutError
# (Python 2 "raise Exc, arg" form) when the call takes too long; exceptions
# captured on the worker via sys.exc_info() are re-raised on the caller.
# NOTE(review): this listing is corrupted -- original lines 55-56, 58, 60-67,
# 69, 71-74 and 76 are missing, so the inner wrapper, thread start/join and
# the aliveness/timeout check are not visible here; do not infer them.
53 def timelimit(timeout
):
54 """borrowed from web.py"""
# Worker thread recording either the call's result or its exc_info.
57 class Dispatch(threading
.Thread
):
59 threading
.Thread
.__init
__(self
)
68 self
.result
= function(*args
, **kw
)
70 self
.error
= sys
.exc_info()
# Python 2 raise syntax.
75 raise TimeoutError
, 'took too long'
# Re-raise the worker's captured exception type/value (Python 2 form).
77 raise c
.error
[0], c
.error
[1]
82 # XML-RPC support allows feedfinder to query Syndic8 for possible matches.
83 # Python 2.3 now comes with this module by default, otherwise you can download it
85 import xmlrpclib
# http://www.pythonware.com/products/xmlrpc/
def _debuglog(message):
    """Print *message* to stdout when the module-level ``_debug`` flag is truthy.

    Uses the call form ``print(message)``, which behaves identically for a
    single argument under both Python 2 and Python 3, instead of the
    Python-2-only ``print message`` statement.
    """
    if _debug:
        print(message)
# URLGatekeeper.__init__ (partial): builds the per-domain robots.txt parser
# cache and a urllib.FancyURLopener identifying itself as feedfinder, then
# mirrors that identity onto robotparser.URLopener so robots.txt fetches use
# the same User-agent.
# NOTE(review): corrupted listing -- the "class URLGatekeeper:" header
# (original line 99) and "def __init__(self):" (original line 101) are
# missing from this view.
100 """a class to track robots.txt rules across multiple servers"""
102 self
.rpcache
= {} # a dictionary of RobotFileParser objects, by domain
103 self
.urlopener
= urllib
.FancyURLopener()
104 self
.urlopener
.version
= "feedfinder/" + __version__
+ " " + self
.urlopener
.version
+ " +http://www.aaronsw.com/2002/feedfinder/"
105 _debuglog(self
.urlopener
.version
)
106 self
.urlopener
.addheaders
= [('User-agent', self
.urlopener
.version
)]
107 robotparser
.URLopener
.version
= self
.urlopener
.version
108 robotparser
.URLopener
.addheaders
= self
.urlopener
.addheaders
# _getrp(url): return a robotparser.RobotFileParser for url's domain,
# caching one parser per domain in self.rpcache (Python 2 has_key test).
# NOTE(review): corrupted listing -- original lines 118-121 (presumably the
# guarded rp.read() of robots.txt) and 123 (presumably "return rp") are
# missing from this view.
110 def _getrp(self
, url
):
111 protocol
, domain
= urlparse
.urlparse(url
)[:2]
112 if self
.rpcache
.has_key(domain
):
113 return self
.rpcache
[domain
]
114 baseurl
= '%s://%s' % (protocol
, domain
)
115 robotsurl
= urlparse
.urljoin(baseurl
, 'robots.txt')
116 _debuglog('fetching %s' % robotsurl
)
117 rp
= robotparser
.RobotFileParser(robotsurl
)
122 self
.rpcache
[domain
] = rp
# can_fetch(url): ask the cached robots.txt parser whether our User-agent
# may fetch url, logging the verdict.
# NOTE(review): corrupted listing -- original line 129 (presumably
# "return allow") is missing from this view.
125 def can_fetch(self
, url
):
126 rp
= self
._getrp
(url
)
127 allow
= rp
.can_fetch(self
.urlopener
.version
, url
)
128 _debuglog("gatekeeper of %s says %s" % (url
, allow
))
# get(url, check=True): fetch url's body as a string; returns '' when
# robots.txt disallows it (unless check=False).  file:// URLs are read from
# local disk; HTTP requests go through an httplib2 cache under Straw's cache
# directory.
# NOTE(review): corrupted listing -- many original lines (134-136, 139-143,
# 147-148, 150-152) are missing, including the return statements, the error
# handling, and whatever brings os, httplib2 and Config into scope (they are
# not imported in this view).
132 def get(self
, url
, check
=True):
133 if check
and not self
.can_fetch(url
): return ''
137 if url
.startswith("file://"):
138 f
= open(url
[7:], "r")
144 CACHE_DIR
= os
.path
.join(Config
.straw_home(), 'cache')
145 h
= httplib2
.Http(CACHE_DIR
)
146 resp
, content
= h
.request(url
, "GET")
149 #return self.urlopener.open(url).read()
# Module-wide singleton used for all robots.txt-aware fetching below.
_gatekeeper = URLGatekeeper()
# BaseParser: sgmllib.SGMLParser subclass shared by the LINK- and A-tag
# scanners below.  Tracks the document base URI (honouring <base href>) and
# normalizes tag attributes: character references decoded via unichr
# (Python 2), attribute keys lowercased, and rel/type values lowercased.
# NOTE(review): corrupted listing -- original lines 159 (presumably
# "self.links = []"), 161, 163/165/167 (the cleanattr helper invoked on
# line 168 is defined on a missing line), 170-171, 176 and 178-180 are not
# visible here.  The entity-replacement string literals on line 166 appear
# already-decoded, which looks like an extraction artifact -- verify against
# the upstream file before relying on them.
156 class BaseParser(sgmllib
.SGMLParser
):
157 def __init__(self
, baseuri
):
158 sgmllib
.SGMLParser
.__init
__(self
)
160 self
.baseuri
= baseuri
162 def normalize_attrs(self
, attrs
):
164 v
= sgmllib
.charref
.sub(lambda m
: unichr(int(m
.groups()[0])), v
)
166 v
= v
.replace('<', '<').replace('>', '>').replace(''', "'").replace('"', '"').replace('&', '&')
168 attrs
= [(k
.lower(), cleanattr(v
)) for k
, v
in attrs
]
169 attrs
= [(k
, k
in ('rel','type') and v
.lower() or v
) for k
, v
in attrs
]
# <base href="..."> overrides the document base URI for later joins.
172 def do_base(self
, attrs
):
173 attrsD
= dict(self
.normalize_attrs(attrs
))
174 if not attrsD
.has_key('href'): return
175 self
.baseuri
= attrsD
['href']
# error() override -- body (original line 178, presumably "pass") missing.
177 def error(self
, text
):
# LinkParser: collects feed autodiscovery targets -- <link rel="alternate">
# tags whose type is one of the known feed MIME types -- resolved against the
# current base URI into self.links.
# NOTE(review): corrupted listing -- original line 183 (another FEED_TYPES
# entry between 'application/rss+xml' and 'application/atom+xml') and line
# 187 are missing from this view.
181 class LinkParser(BaseParser
):
182 FEED_TYPES
= ('application/rss+xml',
184 'application/atom+xml',
185 'application/x.atom+xml',
186 'application/x-atom+xml')
188 def do_link(self
, attrs
):
189 attrsD
= dict(self
.normalize_attrs(attrs
))
190 if not attrsD
.has_key('rel'): return
# rel may carry several space-separated tokens; require 'alternate'.
191 rels
= attrsD
['rel'].split()
192 if 'alternate' not in rels
: return
193 if attrsD
.get('type') not in self
.FEED_TYPES
: return
194 if not attrsD
.has_key('href'): return
195 self
.links
.append(urlparse
.urljoin(self
.baseuri
, attrsD
['href']))
class ALinkParser(BaseParser):
    """Collects the target of every <a href="..."> tag into ``self.links``,
    resolved to an absolute URL against the document base URI."""

    def start_a(self, attrs):
        """Record one anchor's absolute href; anchors without href are ignored."""
        attrsD = dict(self.normalize_attrs(attrs))
        # 'href' not in d replaces the deprecated dict.has_key() (removed in
        # Python 3, discouraged since Python 2.2); behavior is identical.
        if 'href' not in attrsD:
            return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
def fix_quirks(content):
    """Return *content* with any self-closing ``<!DOCTYPE .../>`` declaration
    removed (a quirk that trips up SGML parsing).

    The pattern is multiline and DOTALL, so a declaration spanning several
    lines is removed too.  Fix: the pattern is now a raw string -- the
    original ``'\\/'`` escape inside a plain string is invalid (it only works
    by accident and warns on modern Pythons); ``/`` needs no escaping.
    """
    doctype_re = re.compile(r'<!DOCTYPE(.*?)/>', re.M | re.S)
    return doctype_re.sub('', content)
# makeFullURI(uri): normalize a user-supplied URI -- a feed:// prefix is
# rewritten to http://, a URI already carrying an http/https/file scheme
# passes through, and anything else gets 'http://' prepended.
# NOTE(review): corrupted listing -- original line 213 (presumably
# "return uri" inside the scheme loop) is missing, so the pass-through
# branch is not visible here.
207 def makeFullURI(uri
):
209 if uri
.startswith('feed://'):
210 uri
= 'http://' + uri
.split('feed://', 1).pop()
211 for x
in ['http', 'https', 'file']:
212 if uri
.startswith('%s://' % x
):
214 return 'http://%s' % uri
# getLinks(data, baseuri): run LinkParser over data to harvest
# autodiscovery LINK-tag feed URLs.
# NOTE(review): corrupted listing -- original lines 218-220 (presumably
# p.feed(data) and "return p.links") are missing from this view.
216 def getLinks(data
, baseuri
):
217 p
= LinkParser(baseuri
)
# getALinks(data, baseuri): run ALinkParser over data to harvest every
# <a href> target as an absolute URL.
# NOTE(review): corrupted listing -- original lines 223-225 (presumably
# p.feed(data) and "return p.links") are missing from this view.
221 def getALinks(data
, baseuri
):
222 p
= ALinkParser(baseuri
)
def getLocalLinks(links, baseuri):
    """Return only the links that live under *baseuri*.

    The comparison is a case-insensitive prefix match; matching links are
    returned unmodified, in their original order.
    """
    prefix = baseuri.lower()
    local = []
    for link in links:
        if link.lower().startswith(prefix):
            local.append(link)
    return local
def isFeedLink(link):
    """Heuristic: does *link* look like a direct feed URL?

    True when the link ends with "feed/" or (case-insensitively) with one of
    the feed extensions .rss, .rdf, .xml or .atom.

    Bug fix: the original tested ``link[-4:].lower() in (...,'.atom')`` --
    a four-character slice can never equal the five-character '.atom', so
    Atom extensions were silently never matched.  endswith() with a tuple
    checks each suffix at its own length.
    """
    if link.endswith("feed/"):
        return True
    return link.lower().endswith(('.rss', '.rdf', '.xml', '.atom'))
# isXMLRelatedLink(link): looser heuristic than isFeedLink -- a truthy count
# of 'rss'/'rdf'/'xml'/'atom' substrings anywhere in the link.
# NOTE(review): corrupted listing -- original line 234 is missing; it
# presumably lowercased link before these case-sensitive count() calls.
# Verify against the upstream file.
233 def isXMLRelatedLink(link
):
235 return link
.count('rss') + link
.count('rdf') + link
.count('xml') + link
.count('atom')
# Some broken servers answer with a pseudo-redirect document instead of an
# HTTP redirect; the new target sits inside a <newLocation> element.
r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)

def tryBrokenRedirect(data):
    """Return the first <newLocation> target found in *data*, stripped of
    surrounding whitespace, or None when there is none."""
    if '<newLocation' not in data:
        return None
    targets = r_brokenRedirect.findall(data)
    if targets:
        return targets[0].strip()
    return None
# couldBeFeedData(data): cheap sniff -- returns 0 for anything containing
# '<html', otherwise a truthy count of '<rss'/'<rdf'/'<feed' root-ish tags.
# NOTE(review): corrupted listing -- original line 244 is missing; it
# presumably lowercased data before these case-sensitive count() calls.
# Verify against the upstream file.
243 def couldBeFeedData(data
):
245 if data
.count('<html'): return 0
246 return data
.count('<rss') + data
.count('<rdf') + data
.count('<feed')
# isFeed(uri) body (partial): fetch uri (http/https only) through the
# gatekeeper and report the tuple (couldBeFeedData(data), uri, data); a
# falsy first element means "not a feed".
# NOTE(review): corrupted listing -- the "def isFeed(uri):" line (original
# 248) is missing from this view; line 253 is a Python 2 print statement.
249 _debuglog('seeing if %s is a feed' % uri
)
250 protocol
= urlparse
.urlparse(uri
)
251 if protocol
[0] not in ('http', 'https'): return 0
252 data
= _gatekeeper
.get(uri
)
253 print "isFeed -- %s" % uri
254 return (couldBeFeedData(data
), uri
, data
)
def sortFeeds(feed1Info, feed2Info):
    """cmp-style comparator ordering feed-info dicts by DESCENDING
    'headlines_rank' (used via ``infolist.sort(sortFeeds)``).

    Returns -1, 0 or 1 exactly like ``cmp(rank2, rank1)`` did, but computed
    with comparison operators so the function no longer depends on the
    Python-2-only ``cmp`` builtin (removed in Python 3).
    """
    rank1 = feed1Info['headlines_rank']
    rank2 = feed2Info['headlines_rank']
    # (a > b) - (a < b) is the canonical cmp() replacement.
    return (rank2 > rank1) - (rank2 < rank1)
# getFeedsFromSyndic8(uri): last-resort lookup -- query the Syndic8 XML-RPC
# service for feeds matching uri, sort by popularity (sortFeeds) and keep
# only entries whose status is 'Syndicated'.  Network I/O.
# NOTE(review): corrupted listing -- original lines 260-261 (presumably
# "feeds = []" and a try:) and 268-270 (except/return) are missing, so the
# error handling and return are not visible here.
259 def getFeedsFromSyndic8(uri
):
262 server
= xmlrpclib
.Server('http://www.syndic8.com/xmlrpc.php')
263 feedids
= server
.syndic8
.FindFeeds(uri
)
264 infolist
= server
.syndic8
.GetFeedInfo(feedids
, ['headlines_rank','status','dataurl'])
# Python 2 comparator-style sort.
265 infolist
.sort(sortFeeds
)
266 feeds
= [f
['dataurl'] for f
in infolist
if f
['status']=='Syndicated']
267 _debuglog('found %s feeds through Syndic8' % len(feeds
))
# feeds(uri, all=False, querySyndic8=False, _recurs=None): the main entry
# point.  Normalizes uri, fetches it (bypassing the robots check for the
# page itself), returns (fulluri, data) immediately when the payload already
# sniffs as a feed, follows at most one <newLocation> pseudo-redirect per
# URI (guarding against loops via the _recurs list), then falls back to
# LINK-tag autodiscovery, <a>-link heuristics, common-filename guesses and,
# optionally, Syndic8.
# NOTE(review): heavily corrupted listing -- original lines 274, 276-277,
# 279-281, 285, 287, 291, 295-298, 300, 303, 306, 309, 313, 315-318, 320,
# 322, 324, 327 and 336-338 are missing.  In particular: the try/except
# around getLinks (only its fallback on line 301-302 is visible), several
# entries of the 'suffixes' guess list, and the conditions surrounding the
# return on line 323 are not visible; `outfeeds` is used below but its
# assignment is on a missing line.  Do not infer the missing control flow.
272 def feeds(uri
, all
=False, querySyndic8
=False, _recurs
=None):
273 if _recurs
is None: _recurs
= [uri
]
275 fulluri
= makeFullURI(uri
)
278 data
= _gatekeeper
.get(fulluri
, check
=False)
282 # is this already a feed?
283 if couldBeFeedData(data
):
284 return (fulluri
, data
)
# One-shot broken-redirect follow, loop-guarded by _recurs.
286 newuri
= tryBrokenRedirect(data
)
288 if newuri
and newuri
not in _recurs
:
289 _recurs
.append(newuri
)
290 return feeds(newuri
, all
=all
, querySyndic8
=querySyndic8
, _recurs
=_recurs
)
292 # nope, it's a page, try LINK tags first
293 _debuglog('looking for LINK tags')
294 data
= fix_quirks(data
)
299 linktag_links
= set(getLinks(data
, fulluri
))
# Fallback branch of a try/except whose try: line is missing (295-298).
301 linktag_links
= set()
302 _debuglog("Exception in getLinks: %s" % e
)
304 _debuglog('found %s feeds through LINK tags' % len(links
))
305 #outfeeds = process(outfeeds)
307 links
= links
.union(getALinks(data
, fulluri
))
308 links
= links
.union(getLocalLinks(links
, fulluri
))
310 suffixes
= [ # filenames used by popular software:
311 'atom.xml', # blogger, TypePad
312 'index.atom', # MT, apparently
314 'rss.xml', # Dave Winer/Manila
319 #links = links.union([urlparse.urljoin(fulluri, x) for x in suffixes])
321 links
-= linktag_links
323 return process(linktag_links |
set([url
for url
in links
if isFeedLink(url
) or isXMLRelatedLink(url
)]))
325 if all
or not outfeeds
:
326 _debuglog('no A tags, guessing')
328 #link_list = list(set( + link_list))
329 #outfeeds.extend(process(link_list))
330 if (all
or not outfeeds
) and querySyndic8
:
331 # still no luck, search Syndic8 for feeds (requires xmlrpclib)
332 _debuglog('still no luck, searching Syndic8')
333 #outfeeds.extend(getFeedsFromSyndic8(uri))
334 #if hasattr(__builtins__, 'set') or __builtins__.has_key('set'):
335 # outfeeds = list(set(outfeeds))
# Backwards-compatibility alias for the old public name.
getFeeds = feeds
# process(urls) body (partial): probe every candidate URL with isFeed() and
# keep the (uri, data) pairs for those whose sniff score is positive.
# NOTE(review): corrupted listing -- the "def process(urls):" line
# (original 341) is missing from this view.
342 #print "PROCESSING %s" % str(urls)
343 return [(a
[1], a
[2]) for a
in [isFeed(_url
) for _url
in urls
] if a
[0] > 0]
# feed(uri) (partial): convenience wrapper that collects all feeds for uri
# and is expected to return the single best one.
# NOTE(review): corrupted listing -- the "def feed(uri):" line (original
# ~345) and everything after line 347 (selection from feedlist / the return)
# are missing from this view.
346 #todo: give preference to certain feed formats
347 feedlist
= feeds(uri
)
# Test harness (partial): walks Mark Pilgrim's autodiscovery test suite,
# fetching each page, extracting its LINK-tag feed, fetching that feed and
# checking it links back to the page; prints '.' per pass, reports failures,
# and follows <link rel="next"> to the next test page.
# NOTE(review): heavily corrupted listing -- the enclosing function header,
# the loop scaffolding, the failure bookkeeping (count/failed assignments)
# and several conditionals are on missing lines; Python 2 print statements
# throughout.
353 ##### test harness ######
356 uri
= 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
360 data
= _gatekeeper
.get(uri
)
361 if data
.find('Atom autodiscovery test') == -1: break
362 sys
.stdout
.write('.')
365 links
= getLinks(data
, uri
)
367 print '\n*** FAILED ***', uri
, 'could not find link'
370 print '\n*** FAILED ***', uri
, 'found too many links'
373 atomdata
= urllib
.urlopen(links
[0]).read()
374 if atomdata
.find('<link rel="alternate"') == -1:
375 print '\n*** FAILED ***', uri
, 'retrieved something that is not a feed'
378 backlink
= atomdata
.split('href="').pop().split('"')[0]
380 print '\n*** FAILED ***', uri
, 'retrieved wrong feed'
382 if data
.find('<link rel="next" href="') == -1: break
383 uri
= urlparse
.urljoin(uri
, data
.split('<link rel="next" href="').pop().split('"')[0])
385 print count
, 'tests executed,', len(failed
), 'failed'
# Command-line entry (partial): optional --debug flag, URI taken from argv
# with diveintomark.org as the default, then one discovered feed per line.
# NOTE(review): corrupted listing -- original lines 388 (presumably the
# argv slice into args), 390-394 and 396-398 are missing, so the debug
# toggle body and the argv-supplied-URI branch are not visible here.
387 if __name__
== '__main__':
389 if args
and args
[0] == '--debug':
395 uri
= 'http://diveintomark.org/'
399 print "\n".join(getFeeds(uri
))