1 """feedfinder: Find the Web feed for a Web page
2 http://www.aaronsw.com/2002/feedfinder/
5 feed(uri) - returns feed found for a URI
6 feeds(uri) - returns all feeds found for a URI
9 >>> feedfinder.feed('scripting.com')
10 'http://scripting.com/rss.xml'
12 >>> feedfinder.feeds('scripting.com')
13 ['http://delong.typepad.com/sdj/atom.xml',
14 'http://delong.typepad.com/sdj/index.rdf',
15 'http://delong.typepad.com/sdj/rss.xml']
18 Can also use from the command line. Feeds are returned one per line:
20 $ python feedfinder.py diveintomark.org
21 http://diveintomark.org/xml/atom.xml
24 0. At every step, feeds are minimally verified to make sure they are really feeds.
25 1. If the URI points to a feed, it is simply returned; otherwise
26 the page is downloaded and the real fun begins.
27 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
28 3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
30 4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
31 5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
33 6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
34 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
35 8. As a last ditch effort, we search Syndic8 for feeds matching the URI
__date__ = "2006-04-24"
__maintainer__ = "Aaron Swartz (me@aaronsw.com)"
__author__ = "Mark Pilgrim (http://diveintomark.org)"
__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
__license__ = "Python"
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond, Brian Lalor for bug reporting and patches"""
_debug = 0

import sgmllib, urllib, urlparse, re, sys, robotparser
import threading
class TimeoutError(Exception): pass
def timelimit(timeout):
    """borrowed from web.py"""
    def _1(function):
        def _2(*args, **kw):
            class Dispatch(threading.Thread):
                def __init__(self):
                    threading.Thread.__init__(self)
                    self.result = None
                    self.error = None

                    self.setDaemon(True)
                    self.start()

                def run(self):
                    try:
                        self.result = function(*args, **kw)
                    except:
                        self.error = sys.exc_info()

            c = Dispatch()
            c.join(timeout)
            if c.isAlive():
                raise TimeoutError, 'took too long'
            if c.error:
                raise c.error[0], c.error[1]
            return c.result
        return _2
    return _1
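
# Illustrative sketch (not part of the original module): timelimit() is a decorator
# factory; the function name `fetch_with_timeout` below is hypothetical.
#
#     @timelimit(10)
#     def fetch_with_timeout(url):
#         return _gatekeeper.get(url)
#
# If the wrapped call runs longer than 10 seconds, TimeoutError is raised instead
# of returning a result.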
# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# Python 2.3 now comes with this module by default, otherwise you can download it
try:
    import xmlrpclib # http://www.pythonware.com/products/xmlrpc/
except ImportError:
    xmlrpclib = None
def _debuglog(message):
    if _debug: print message
100 """a class to track robots.txt rules across multiple servers"""
102 self
.rpcache
= {} # a dictionary of RobotFileParser objects, by domain
103 self
.urlopener
= urllib
.FancyURLopener()
104 self
.urlopener
.version
= "feedfinder/" + __version__
+ " " + self
.urlopener
.version
+ " +http://www.aaronsw.com/2002/feedfinder/"
105 _debuglog(self
.urlopener
.version
)
106 self
.urlopener
.addheaders
= [('User-agent', self
.urlopener
.version
)]
107 robotparser
.URLopener
.version
= self
.urlopener
.version
108 robotparser
.URLopener
.addheaders
= self
.urlopener
.addheaders
    def _getrp(self, url):
        protocol, domain = urlparse.urlparse(url)[:2]
        if self.rpcache.has_key(domain):
            return self.rpcache[domain]
        baseurl = '%s://%s' % (protocol, domain)
        robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
        _debuglog('fetching %s' % robotsurl)
        rp = robotparser.RobotFileParser(robotsurl)
        try:
            rp.read()
        except:
            _debuglog('failed to fetch %s' % robotsurl)
        self.rpcache[domain] = rp
        return rp
    def can_fetch(self, url):
        rp = self._getrp(url)
        allow = rp.can_fetch(self.urlopener.version, url)
        _debuglog("gatekeeper of %s says %s" % (url, allow))
        return allow
    def get(self, url, check=True):
        if check and not self.can_fetch(url): return ''
        try:
            if url.startswith("file://"):
                f = open(url[7:], "r")
                data = f.read()
                f.close()
                return data
            # Straw-specific: fetch through an httplib2 cache (os, httplib2 and
            # Config come from the host application)
            CACHE_DIR = os.path.join(Config.straw_home(), 'cache')
            h = httplib2.Http(CACHE_DIR)
            resp, content = h.request(url, "GET")
            return content
        except:
            return ''
        #return self.urlopener.open(url).read()
_gatekeeper = URLGatekeeper()
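
# Illustrative sketch (not in the original source): all fetches go through the
# module-level gatekeeper, which consults robots.txt before downloading.  The
# example URL is hypothetical.
#
#     if _gatekeeper.can_fetch('http://example.com/index.html'):
#         html = _gatekeeper.get('http://example.com/index.html')
#     html = _gatekeeper.get('http://example.com/', check=False)  # skip robots.txt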
class BaseParser(sgmllib.SGMLParser):
    def __init__(self, baseuri):
        sgmllib.SGMLParser.__init__(self)
        self.links = []
        self.baseuri = baseuri

    def normalize_attrs(self, attrs):
        def cleanattr(v):
            v = sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v)
            v = v.strip()
            v = v.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"').replace('&amp;', '&')
            return v
        attrs = [(k.lower(), cleanattr(v)) for k, v in attrs]
        attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs]
        return attrs

    def do_base(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.baseuri = attrsD['href']
class LinkParser(BaseParser):
    FEED_TYPES = ('application/rss+xml',
                  'text/xml',
                  'application/atom+xml',
                  'application/x.atom+xml',
                  'application/x-atom+xml')
    def do_link(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('rel'): return
        rels = attrsD['rel'].split()
        if 'alternate' not in rels: return
        if attrsD.get('type') not in self.FEED_TYPES: return
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
class ALinkParser(BaseParser):
    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
def makeFullURI(uri):
    if uri.startswith('feed://'):
        uri = 'http://' + uri.split('feed://', 1).pop()
    for x in ['http', 'https', 'file']:
        if uri.startswith('%s://' % x):
            return uri
    return 'http://%s' % uri
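
# Illustrative examples (not in the original source) of how makeFullURI normalizes
# its input:
#
#     >>> makeFullURI('feed://scripting.com/rss.xml')
#     'http://scripting.com/rss.xml'
#     >>> makeFullURI('scripting.com')
#     'http://scripting.com'
#     >>> makeFullURI('https://example.org/')
#     'https://example.org/'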
def getLinks(data, baseuri):
    p = LinkParser(baseuri)
    try:
        p.feed(data)
    except sgmllib.SGMLParseError, e:
        _debuglog("got SGMLParseError: %s" % str(e))
    return p.links
def getALinks(data, baseuri):
    p = ALinkParser(baseuri)
    try:
        p.feed(data)
    except sgmllib.SGMLParseError, e:
        _debuglog("got SGMLParseError: %s" % str(e))
    return p.links
def getLocalLinks(links, baseuri):
    baseuri = baseuri.lower()
    return [l for l in links if l.lower().startswith(baseuri)]
def isFeedLink(link):
    # ".atom" is five characters long, so it is checked separately from the
    # four-character suffixes
    link = link.lower()
    return link.endswith("feed/") or link[-4:] in ('.rss', '.rdf', '.xml') or link[-5:] == '.atom'
def isXMLRelatedLink(link):
    link = link.lower()
    return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')
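
# Illustrative examples (not in the original source):
#
#     >>> isFeedLink('http://example.org/index.rdf')   # obvious feed filename
#     True
#     >>> isXMLRelatedLink('http://example.org/rss_news.html')  # merely mentions "rss"
#     1
#
# isFeedLink() matches names that clearly denote a feed, while isXMLRelatedLink()
# counts looser hints and is only used as a fallback.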
r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)
def tryBrokenRedirect(data):
    if '<newLocation' in data:
        newuris = r_brokenRedirect.findall(data)
        if newuris: return newuris[0].strip()
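
# Illustrative sketch (not in the original source): some hosts answer a feed
# request with a small XML "redirect" document instead of an HTTP redirect, e.g.
#
#     <newLocation>http://example.org/feeds/atom.xml</newLocation>
#
# tryBrokenRedirect() extracts and returns that new location, or None when no
# such marker is present.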
def couldBeFeedData(data):
    data = data.lower()
    if data.count('<html'): return 0
    return data.count('<rss') + data.count('<rdf') + data.count('<feed')
def isFeed(uri):
    _debuglog('seeing if %s is a feed' % uri)
    protocol = urlparse.urlparse(uri)
    if protocol[0] not in ('http', 'https'): return 0
    data = _gatekeeper.get(uri)
    print "isFeed -- %s" % uri
    return (couldBeFeedData(data), uri, data)
def sortFeeds(feed1Info, feed2Info):
    return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank'])
def getFeedsFromSyndic8(uri):
    feeds = []
    try:
        server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
        feedids = server.syndic8.FindFeeds(uri)
        infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
        infolist.sort(sortFeeds)
        feeds = [f['dataurl'] for f in infolist if f['status']=='Syndicated']
        _debuglog('found %s feeds through Syndic8' % len(feeds))
    except:
        pass
    return feeds
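
# Illustrative sketch (not in the original source): the Syndic8 lookup is a plain
# XML-RPC round trip and can be exercised on its own.  The URI below is
# hypothetical and the call needs network access.
#
#     candidates = getFeedsFromSyndic8('http://example.org/')
#     # -> list of feed data URLs, sorted by Syndic8's 'headlines_rank' and
#     #    restricted to feeds whose status is 'Syndicated'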
def urls(uri, uri_data):
    #if _recurs is None: _recurs = [uri]
    """fulluri = makeFullURI(uri)

    data = _gatekeeper.get(fulluri, check=False)
    """

    # is this already a feed?
    if couldBeFeedData(uri_data):
        result = set()
        result.add((uri, uri_data))
        return result

    """newuri = tryBrokenRedirect(data)
    if newuri and newuri not in _recurs:
        _recurs.append(newuri)
        return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)"""

    # nope, it's a page, try LINK tags first
    _debuglog('looking for LINK tags')
    try:
        linktag_links = set(getLinks(uri_data, uri))
    except Exception, e:
        linktag_links = set()
        _debuglog("Exception in getLinks: %s" % e)

    links = set(linktag_links)
    _debuglog('found %s feeds through LINK tags' % len(links))
    #outfeeds = process(outfeeds)

    links = links.union(getALinks(uri_data, uri))
    links = links.union(getLocalLinks(links, uri))

    suffixes = [ # filenames used by popular software:
        'atom.xml', # blogger, TypePad
        'index.atom', # MT, apparently
        'index.rdf', # MT
        'rss.xml', # Dave Winer/Manila
        'index.xml', # MT
        'index.rss' # Slash
    ]
    #links = links.union([urlparse.urljoin(fulluri, x) for x in suffixes])

    links -= linktag_links

    return linktag_links | set([url for url in links if isFeedLink(url) or isXMLRelatedLink(url)])
332 #print "PROCESSING %s" % str(urls)
333 return [(a
[1], a
[2]) for a
in [isFeed(_url
) for _url
in urls
] if a
[0] > 0]
##### test harness ######

def test():
    uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
    failed = []
    count = 0
    while 1:
        data = _gatekeeper.get(uri)
        if data.find('Atom autodiscovery test') == -1: break
        sys.stdout.write('.')
        sys.stdout.flush()
        count += 1
        links = getLinks(data, uri)
        if not links:
            print '\n*** FAILED ***', uri, 'could not find link'
            failed.append(uri)
        elif len(links) > 1:
            print '\n*** FAILED ***', uri, 'found too many links'
            failed.append(uri)
        else:
            atomdata = urllib.urlopen(links[0]).read()
            if atomdata.find('<link rel="alternate"') == -1:
                print '\n*** FAILED ***', uri, 'retrieved something that is not a feed'
                failed.append(uri)
            else:
                backlink = atomdata.split('href="').pop().split('"')[0]
                if backlink != uri:
                    print '\n*** FAILED ***', uri, 'retrieved wrong feed'
                    failed.append(uri)
        if data.find('<link rel="next" href="') == -1: break
        uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
    print
    print count, 'tests executed,', len(failed), 'failed'