1 """feedfinder: Find the Web feed for a Web page
2 http://www.aaronsw.com/2002/feedfinder/
5 feed(uri) - returns feed found for a URI
6 feeds(uri) - returns all feeds found for a URI
9 >>> feedfinder.feed('scripting.com')
10 'http://scripting.com/rss.xml'
12 >>> feedfinder.feeds('scripting.com')
13 ['http://delong.typepad.com/sdj/atom.xml',
14 'http://delong.typepad.com/sdj/index.rdf',
15 'http://delong.typepad.com/sdj/rss.xml']
18 Can also use from the command line. Feeds are returned one per line:
20 $ python feedfinder.py diveintomark.org
21 http://diveintomark.org/xml/atom.xml
24 0. At every step, feeds are minimally verified to make sure they are really feeds.
25 1. If the URI points to a feed, it is simply returned; otherwise
26 the page is downloaded and the real fun begins.
27 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
28 3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
30 4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
31 5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
33 6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
34 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
35 8. As a last ditch effort, we search Syndic8 for feeds matching the URI
# Module metadata.  __version__ is referenced by URLGatekeeper's
# User-Agent string (self.version = "feedfinder/" + __version__); it was
# missing from this chunk, which makes the module-level
# `_gatekeeper = URLGatekeeper()` raise NameError at import time.
# Restored with the canonical feedfinder release number.
__version__ = "1.371"
__date__ = "2006-04-24"
__maintainer__ = "Aaron Swartz (me@aaronsw.com)"
__author__ = "Mark Pilgrim (http://diveintomark.org)"
__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
__license__ = "Python"
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond, Brian Lalor for bug reporting and patches"""
48 import sgmllib
, urlparse
, re
, sys
, robotparser
# Module-level logger.  NOTE(review): `error` is presumably the straw
# project's error/logging module — its import is not visible in this
# chunk; confirm against the file's full import list.
51 log
= error
.get_logger()
class TimeoutError(Exception):
    """Raised by timelimit() when the wrapped call exceeds its deadline."""
    pass
# Decorator factory: run the wrapped function in a worker thread and raise
# TimeoutError if it does not finish within `timeout` seconds ("borrowed
# from web.py").  NOTE(review): this chunk is heavily garbled — the nested
# closure definitions, the Thread.run() body, the join()/isAlive() check
# and the final `return c.result` are missing from view.
54 def timelimit(timeout
):
55 """borrowed from web.py"""
# Worker thread capturing either the function's result or the exception
# info raised inside it.
58 class Dispatch(threading
.Thread
):
60 threading
.Thread
.__init
__(self
)
69 self
.result
= function(*args
, **kw
)
71 self
.error
= sys
.exc_info()
# Timed out: the worker is still alive after join(timeout) (py2 raise syntax).
76 raise TimeoutError
, 'took too long'
# Re-raise the exception captured inside the worker thread (py2 raise syntax).
78 raise c
.error
[0], c
.error
[1]
83 # XML-RPC support allows feedfinder to query Syndic8 for possible matches.
84 # Python 2.3 now comes with this module by default, otherwise you can download it
86 import xmlrpclib
# http://www.pythonware.com/products/xmlrpc/
# Debug logging hook used throughout this module.  NOTE(review): the body
# is missing from this chunk — canonical feedfinder prints `message` only
# when a module-level _debug flag is set; verify against the full file.
97 def _debuglog(message
):
# NOTE(review): the `class URLGatekeeper:` header and the `def __init__`
# line are missing from this chunk; what follows is the class docstring
# and the surviving (garbled) method bodies.
101 """a class to track robots.txt rules across multiple servers"""
# Per-domain cache of RobotFileParser objects.
103 self
.rpcache
= {} # a dictionary of RobotFileParser objects, by domain
# User-Agent advertised to servers; depends on module-level __version__.
104 self
.version
= "feedfinder/" + __version__
+ " +http://www.aaronsw.com/2002/feedfinder/"
105 self
.addheaders
= [('User-agent', self
.version
)]
# Propagate our identity to robotparser's global URLopener (py2 robotparser).
106 robotparser
.URLopener
.version
= self
.version
107 robotparser
.URLopener
.addheaders
= self
.addheaders
# Return the (possibly cached) RobotFileParser for the domain of `url`.
109 def _getrp(self
, url
):
110 protocol
, domain
= urlparse
.urlparse(url
)[:2]
111 if self
.rpcache
.has_key(domain
):
112 return self
.rpcache
[domain
]
113 baseurl
= '%s://%s' % (protocol
, domain
)
114 robotsurl
= urlparse
.urljoin(baseurl
, 'robots.txt')
115 _debuglog('fetching %s' % robotsurl
)
116 rp
= robotparser
.RobotFileParser(robotsurl
)
# NOTE(review): the try:/rp.read() lines (original 117-119) are missing
# from view; this _debuglog is presumably the except-branch body.
120 _debuglog('failed to fetch %s' % robotsurl
)
121 self
.rpcache
[domain
] = rp
# Ask the cached robots.txt whether our user-agent may fetch `url`.
124 def can_fetch(self
, url
):
125 rp
= self
._getrp
(url
)
126 allow
= rp
.can_fetch(self
.version
, url
)
127 _debuglog("gatekeeper of %s says %s" % (url
, allow
))
# Fetch `url`, honouring robots.txt unless check=False; returns '' on refusal.
131 def get(self
, url
, check
=True):
132 if check
and not self
.can_fetch(url
): return ''
# Local file:// URLs are read straight from disk.
136 if url
.startswith("file://"):
137 f
= open(url
[7:], "r")
# HTTP fetches go through httplib2 with an on-disk cache in straw's home
# directory.  NOTE(review): os/Config/httplib2 imports are not visible in
# this chunk; neither is the return of `content` nor the error path.
143 CACHE_DIR
= os
.path
.join(Config
.straw_home(), 'cache')
144 h
= httplib2
.Http(CACHE_DIR
)
145 resp
, content
= h
.request(url
, "GET")
# Module-level singleton through which all page/feed fetches are routed.
152 _gatekeeper
= URLGatekeeper()
# Base SGML parser: tracks the page base URI (<base href>) and provides
# the attribute normalisation shared by LinkParser and ALinkParser.
154 class BaseParser(sgmllib
.SGMLParser
):
155 def __init__(self
, baseuri
):
156 sgmllib
.SGMLParser
.__init
__(self
)
# NOTE(review): original line 157 (canonically `self.links = []`, the
# list both subclasses append to) is missing from this chunk.
158 self
.baseuri
= baseuri
# Lowercase attribute names, clean attribute values, and lowercase the
# values of rel/type.  NOTE(review): the `def cleanattr(v):` line and its
# return are missing from view.
160 def normalize_attrs(self
, attrs
):
# Decode numeric character references in attribute values (py2 unichr).
162 v
= sgmllib
.charref
.sub(lambda m
: unichr(int(m
.groups()[0])), v
)
# NOTE(review): this line was mangled by entity decoding during
# extraction — the first argument of each replace() was originally the
# entity text '&lt;', '&gt;', '&apos;', '&quot;', '&amp;' respectively.
164 v
= v
.replace('<', '<').replace('>', '>').replace(''', "'").replace('"', '"').replace('&', '&')
166 attrs
= [(k
.lower(), cleanattr(v
)) for k
, v
in attrs
]
167 attrs
= [(k
, k
in ('rel','type') and v
.lower() or v
) for k
, v
in attrs
]
# <base href=...> rebases all subsequently resolved links.
170 def do_base(self
, attrs
):
171 attrsD
= dict(self
.normalize_attrs(attrs
))
172 if not attrsD
.has_key('href'): return
173 self
.baseuri
= attrsD
['href']
# Feed autodiscovery: collects the targets of
# <link rel="alternate" type=<feed MIME type> href=...> tags.
175 class LinkParser(BaseParser
):
# MIME types accepted as feeds.  NOTE(review): original line 177
# (canonically 'text/xml',) is missing from this chunk.
176 FEED_TYPES
= ('application/rss+xml',
178 'application/atom+xml',
179 'application/x.atom+xml',
180 'application/x-atom+xml')
# Append each qualifying LINK href, resolved against the page base URI.
182 def do_link(self
, attrs
):
183 attrsD
= dict(self
.normalize_attrs(attrs
))
184 if not attrsD
.has_key('rel'): return
185 rels
= attrsD
['rel'].split()
186 if 'alternate' not in rels
: return
187 if attrsD
.get('type') not in self
.FEED_TYPES
: return
188 if not attrsD
.has_key('href'): return
189 self
.links
.append(urlparse
.urljoin(self
.baseuri
, attrsD
['href']))
class ALinkParser(BaseParser):
    """Collects the target of every <a href=...> on the page, resolved
    to an absolute URI against the current base URI."""
    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        href = attrsD.get('href')
        if href is None:
            return
        self.links.append(urlparse.urljoin(self.baseuri, href))
def makeFullURI(uri):
    """Normalise *uri* to a fully-qualified URI.

    - 'feed://x' is rewritten to 'http://x'
    - URIs already carrying an http/https/file scheme are returned
      unchanged (restores the `return uri` that was missing, leaving the
      scheme-check `if` without a body)
    - anything else gets an 'http://' prefix
    """
    uri = uri.strip()
    if uri.startswith('feed://'):
        uri = 'http://' + uri.split('feed://', 1).pop()
    for scheme in ('http', 'https', 'file'):
        if uri.startswith('%s://' % scheme):
            # Already fully qualified — hand it back untouched.
            return uri
    return 'http://%s' % uri
# Run LinkParser over `data` to collect autodiscovery (<link>) feed URLs.
# NOTE(review): the `try:` / `p.feed(data)` lines and the final
# `return p.links` are missing from this chunk; only the except branch
# that logs SGML parse errors survives.
206 def getLinks(data
, baseuri
):
207 p
= LinkParser(baseuri
)
211 except sgmllib
.SGMLParseError
, e
:
212 _debuglog("got SGMLParseError: %s" % str(e
))
# Run ALinkParser over `data` to collect every <a href> URL on the page.
# NOTE(review): the `try:` / `p.feed(data)` lines and the final
# `return p.links` are missing from this chunk; only the except branch
# that logs SGML parse errors survives.
216 def getALinks(data
, baseuri
):
217 p
= ALinkParser(baseuri
)
221 except sgmllib
.SGMLParseError
, e
:
222 _debuglog("got SGMLParseError: %s" % str(e
))
def getLocalLinks(links, baseuri):
    """Return the subset of *links* hosted under *baseuri*.

    Matching is a case-insensitive prefix test; input order is preserved.
    """
    prefix = baseuri.lower()
    local = []
    for link in links:
        if link.lower().startswith(prefix):
            local.append(link)
    return local
def isFeedLink(link):
    """Heuristic: does *link* end like a direct feed URL?

    True for links ending in 'feed/' or with a .rss/.rdf/.xml/.atom
    suffix (case-insensitive).

    Fix: the original tested `link[-4:].lower() in (..., '.atom')`, but a
    four-character slice can never equal the five-character '.atom', so
    Atom URLs were never matched; use endswith with a suffix tuple.
    """
    return link.endswith("feed/") or link.lower().endswith(('.rss', '.rdf', '.xml', '.atom'))
def isXMLRelatedLink(link):
    """Count feed-ish substrings ('rss', 'rdf', 'xml', 'atom') in *link*.

    Used as a truthy score by urls().  The link is lowercased first so
    uppercase spellings count too, matching isFeedLink's case handling
    (the lowercasing line was lost from this chunk of the source).
    """
    link = link.lower()
    return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')
r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)

def tryBrokenRedirect(data):
    """Handle Radio UserLand's non-HTTP '<newLocation>' pseudo-redirect.

    Returns the stripped redirect target when *data* contains one,
    otherwise None.
    """
    if '<newLocation' not in data:
        return None
    targets = r_brokenRedirect.findall(data)
    if not targets:
        return None
    return targets[0].strip()
def couldBeFeedData(data):
    """Cheap feed sniff: score *data* by feed-ish root tags.

    Returns 0 for anything containing '<html', otherwise the number of
    '<rss' / '<rdf' / '<feed' occurrences.  The document is lowercased
    first so uppercase markup ('<HTML', '<RSS') is recognised — the
    lowercasing line was lost from this chunk of the source.
    """
    data = data.lower()
    if data.count('<html'): return 0
    return data.count('<rss') + data.count('<rdf') + data.count('<feed')
def isFeed(uri):
    """Fetch *uri* and decide whether its payload looks like a feed.

    Returns a (score, uri, data) tuple where score > 0 means feed-like.
    (The `def isFeed(uri):` header was missing from this chunk and is
    restored here — the body clearly belongs to it.)

    Fixes:
    - non-http(s) URIs previously returned a bare 0, which crashes
      process() when it indexes a[0]/a[1]/a[2]; return the same tuple
      shape as the success path instead.
    - a stray debug `print` statement is routed through _debuglog.
    """
    _debuglog('seeing if %s is a feed' % uri)
    protocol = urlparse.urlparse(uri)
    if protocol[0] not in ('http', 'https'):
        # Same tuple shape as the success path so callers can index it.
        return (0, uri, '')
    data = _gatekeeper.get(uri)
    _debuglog("isFeed -- %s" % uri)
    return (couldBeFeedData(data), uri, data)
def sortFeeds(feed1Info, feed2Info):
    """cmp-style comparator for list.sort(): orders feed-info dicts by
    descending 'headlines_rank' (highest rank first)."""
    rank1 = feed1Info['headlines_rank']
    rank2 = feed2Info['headlines_rank']
    # Spelled-out equivalent of py2 cmp(rank2, rank1): -1, 0 or 1.
    return (rank2 > rank1) - (rank2 < rank1)
262 def getFeedsFromSyndic8(uri
):
265 server
= xmlrpclib
.Server('http://www.syndic8.com/xmlrpc.php')
266 feedids
= server
.syndic8
.FindFeeds(uri
)
267 infolist
= server
.syndic8
.GetFeedInfo(feedids
, ['headlines_rank','status','dataurl'])
268 infolist
.sort(sortFeeds
)
269 feeds
= [f
['dataurl'] for f
in infolist
if f
['status']=='Syndicated']
270 _debuglog('found %s feeds through Syndic8' % len(feeds
))
# Core feed discovery for straw: given a page URI and its already-fetched
# body, return candidate feeds — (uri, data) tuples for pages that are
# themselves feeds, plus feed-looking URLs harvested from the markup.
# NOTE(review): heavily garbled — the `result` set initialisation, the
# try/except around getLinks, the `links` initialisation, several suffix
# entries and other lines are missing from this chunk.
275 def urls(uri
, uri_data
):
276 #if _recurs is None: _recurs = [uri]
278 """fulluri = makeFullURI(uri)
281 data = _gatekeeper.get(fulluri, check=False)
285 # is this already a feed?
286 if couldBeFeedData(uri_data
):
288 result
.add((uri
, uri_data
))
# Dead code kept from upstream feedfinder (broken-redirect recursion).
291 """newuri = tryBrokenRedirect(data)
293 if newuri and newuri not in _recurs:
294 _recurs.append(newuri)
295 return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)"""
297 # nope, it's a page, try LINK tags first
298 _debuglog('looking for LINK tags')
303 linktag_links
= set(getLinks(uri_data
, uri
))
# Fallback when getLinks raised (the except line itself is missing here).
305 linktag_links
= set()
306 _debuglog("Exception in getLinks: %s" % e
)
308 _debuglog('found %s feeds through LINK tags' % len(links
))
309 #outfeeds = process(outfeeds)
# Widen the candidate set with <a> links, then same-server links.
311 links
= links
.union(getALinks(uri_data
, uri
))
312 links
= links
.union(getLocalLinks(links
, uri
))
# Well-known feed filenames to try by guessing (list truncated in chunk).
314 suffixes
= [ # filenames used by popular software:
315 'atom.xml', # blogger, TypePad
316 'index.atom', # MT, apparently
318 'rss.xml', # Dave Winer/Manila
323 #links = links.union([urlparse.urljoin(fulluri, x) for x in suffixes])
325 links
-= linktag_links
# Autodiscovered links always qualify; other links must look feed-ish
# by suffix (isFeedLink) or substring (isXMLRelatedLink).
327 return linktag_links |
set([url
for url
in links
if isFeedLink(url
) or isXMLRelatedLink(url
)])
# Body of process(urls) — the `def process(urls):` header (original line
# 329) is missing from this chunk.  Fetches every candidate URL via
# isFeed() and keeps the (uri, data) pairs whose feed score is positive.
330 #print "PROCESSING %s" % str(urls)
331 return [(a
[1], a
[2]) for a
in [isFeed(_url
) for _url
in urls
] if a
[0] > 0]
# Self-test: walks Mark Pilgrim's autodiscovery test suite, following
# <link rel="next"> from page to page and checking that exactly one feed
# link is found and that the feed links back to the page.  NOTE(review):
# the `def test():` header, the loop/while lines, the failure-list
# bookkeeping and the `if __name__ == '__main__':` dispatch are missing
# from this chunk.
333 ##### test harness ######
336 uri
= 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
340 data
= _gatekeeper
.get(uri
)
# Stop when the fetched page is not part of the test suite.
341 if data
.find('Atom autodiscovery test') == -1: break
# Progress dot per test page.
342 sys
.stdout
.write('.')
345 links
= getLinks(data
, uri
)
347 print '\n*** FAILED ***', uri
, 'could not find link'
350 print '\n*** FAILED ***', uri
, 'found too many links'
# Fetch the discovered feed and sanity-check its contents.
353 atomdata
= urllib
.urlopen(links
[0]).read()
354 if atomdata
.find('<link rel="alternate"') == -1:
355 print '\n*** FAILED ***', uri
, 'retrieved something that is not a feed'
# The feed must link back to the page it was discovered on.
358 backlink
= atomdata
.split('href="').pop().split('"')[0]
360 print '\n*** FAILED ***', uri
, 'retrieved wrong feed'
# Advance to the next test page, if any.
362 if data
.find('<link rel="next" href="') == -1: break
363 uri
= urlparse
.urljoin(uri
, data
.split('<link rel="next" href="').pop().split('"')[0])
365 print count
, 'tests executed,', len(failed
), 'failed'
), 'failed'