Implemented "mark all as read".
# straw/feedfinder.py
1 """feedfinder: Find the Web feed for a Web page
2 http://www.aaronsw.com/2002/feedfinder/
4 Usage:
5 feed(uri) - returns feed found for a URI
6 feeds(uri) - returns all feeds found for a URI
8 >>> import feedfinder
9 >>> feedfinder.feed('scripting.com')
10 'http://scripting.com/rss.xml'
11 >>>
12 >>> feedfinder.feeds('scripting.com')
13 ['http://delong.typepad.com/sdj/atom.xml',
14 'http://delong.typepad.com/sdj/index.rdf',
15 'http://delong.typepad.com/sdj/rss.xml']
16 >>>
18 Can also use from the command line. Feeds are returned one per line:
20 $ python feedfinder.py diveintomark.org
21 http://diveintomark.org/xml/atom.xml
23 How it works:
24 0. At every step, feeds are minimally verified to make sure they are really feeds.
25 1. If the URI points to a feed, it is simply returned; otherwise
26 the page is downloaded and the real fun begins.
27 2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
28 3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
29 ".atom"
30 4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
31 5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
32 ".atom"
33 6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
34 7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
35 8. As a last ditch effort, we search Syndic8 for feeds matching the URI
36 """
38 __version__ = "1.371"
39 __date__ = "2006-04-24"
40 __maintainer__ = "Aaron Swartz (me@aaronsw.com)"
41 __author__ = "Mark Pilgrim (http://diveintomark.org)"
42 __copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
43 __license__ = "Python"
44 __credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
45 Also Jason Diamond, Brian Lalor for bug reporting and patches"""
import error
import sgmllib, urlparse, re, sys, robotparser, urllib  # urllib is used by test()
import threading

log = error.get_logger()
class TimeoutError(Exception): pass

def timelimit(timeout):
    """borrowed from web.py"""
    def _1(function):
        def _2(*args, **kw):
            class Dispatch(threading.Thread):
                def __init__(self):
                    threading.Thread.__init__(self)
                    self.result = None
                    self.error = None

                    self.setDaemon(True)
                    self.start()

                def run(self):
                    try:
                        self.result = function(*args, **kw)
                    except:
                        self.error = sys.exc_info()

            c = Dispatch()
            c.join(timeout)
            if c.isAlive():
                raise TimeoutError, 'took too long'
            if c.error:
                raise c.error[0], c.error[1]
            return c.result
        return _2
    return _1
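
# Usage sketch (added for illustration; slow_fetch is a hypothetical example):
# the decorator runs the wrapped call in a daemon thread and raises TimeoutError
# if it has not finished within `timeout` seconds, e.g.
#
#   @timelimit(5)
#   def slow_fetch(url):
#       return urllib.urlopen(url).read()
#
# URLGatekeeper.get() below is wrapped the same way with @timelimit(50).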
# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# Python 2.3 now comes with this module by default, otherwise you can download it
try:
    import xmlrpclib # http://www.pythonware.com/products/xmlrpc/
except ImportError:
    xmlrpclib = None

if not dict:
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

def _debuglog(message):
    log.debug(message)
class URLGatekeeper:
    """a class to track robots.txt rules across multiple servers"""
    def __init__(self):
        self.rpcache = {} # a dictionary of RobotFileParser objects, by domain
        self.version = "feedfinder/" + __version__ + " +http://www.aaronsw.com/2002/feedfinder/"
        self.addheaders = [('User-agent', self.version)]
        robotparser.URLopener.version = self.version
        robotparser.URLopener.addheaders = self.addheaders

    def _getrp(self, url):
        protocol, domain = urlparse.urlparse(url)[:2]
        if self.rpcache.has_key(domain):
            return self.rpcache[domain]
        baseurl = '%s://%s' % (protocol, domain)
        robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
        _debuglog('fetching %s' % robotsurl)
        rp = robotparser.RobotFileParser(robotsurl)
        try:
            rp.read()
        except:
            _debuglog('failed to fetch %s' % robotsurl)
        self.rpcache[domain] = rp
        return rp

    def can_fetch(self, url):
        rp = self._getrp(url)
        allow = rp.can_fetch(self.version, url)
        _debuglog("gatekeeper of %s says %s" % (url, allow))
        return allow

    @timelimit(50)
    def get(self, url, check=True):
        if check and not self.can_fetch(url): return ''
        try:
            content = ""
            if url.startswith("file://"):
                f = open(url[7:], "r")
                content = f.read()
                f.close()
            else:
                import httplib2, os
                import Config
                CACHE_DIR = os.path.join(Config.straw_home(), 'cache')
                h = httplib2.Http(CACHE_DIR)
                resp, content = h.request(url, "GET")
            return content
        except Exception, e:
            _debuglog(e)
            return ''

_gatekeeper = URLGatekeeper()
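
# Illustrative usage of the module-level gatekeeper (added comment; the URL is a
# placeholder, not taken from straw):
#
#   if _gatekeeper.can_fetch('http://example.com/index.xml'):
#       page = _gatekeeper.get('http://example.com/index.xml')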
class BaseParser(sgmllib.SGMLParser):
    def __init__(self, baseuri):
        sgmllib.SGMLParser.__init__(self)
        self.links = []
        self.baseuri = baseuri

    def normalize_attrs(self, attrs):
        def cleanattr(v):
            v = sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v)
            v = v.strip()
            v = v.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"').replace('&amp;', '&')
            return v
        attrs = [(k.lower(), cleanattr(v)) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def do_base(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.baseuri = attrsD['href']

class LinkParser(BaseParser):
    FEED_TYPES = ('application/rss+xml',
                  'text/xml',
                  'application/atom+xml',
                  'application/x.atom+xml',
                  'application/x-atom+xml')

    def do_link(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('rel'): return
        rels = attrsD['rel'].split()
        if 'alternate' not in rels: return
        if attrsD.get('type') not in self.FEED_TYPES: return
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

class ALinkParser(BaseParser):
    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

def makeFullURI(uri):
    uri = uri.strip()
    if uri.startswith('feed://'):
        uri = 'http://' + uri.split('feed://', 1).pop()
    for x in ['http', 'https', 'file']:
        if uri.startswith('%s://' % x):
            return uri
    return 'http://%s' % uri
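
# Expected behaviour of makeFullURI, shown as an informal doctest (added for
# illustration):
#
#   >>> makeFullURI('feed://scripting.com/rss.xml')
#   'http://scripting.com/rss.xml'
#   >>> makeFullURI('scripting.com')
#   'http://scripting.com'
#   >>> makeFullURI('https://example.com/atom.xml')
#   'https://example.com/atom.xml'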
def getLinks(data, baseuri):
    p = LinkParser(baseuri)
    try:
        p.feed(data)
    except sgmllib.SGMLParseError, e:
        _debuglog("got SGMLParseError: %s" % str(e))
    return p.links

def getALinks(data, baseuri):
    p = ALinkParser(baseuri)
    try:
        p.feed(data)
    except sgmllib.SGMLParseError, e:
        _debuglog("got SGMLParseError: %s" % str(e))
    return p.links

def getLocalLinks(links, baseuri):
    baseuri = baseuri.lower()
    return [l for l in links if l.lower().startswith(baseuri)]
def isFeedLink(link):
    # note: '.atom' is five characters long, so it needs its own check
    link = link.lower()
    return link.endswith("feed/") or link[-4:] in ('.rss', '.rdf', '.xml') or link[-5:] == '.atom'

def isXMLRelatedLink(link):
    link = link.lower()
    return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')
r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)

def tryBrokenRedirect(data):
    if '<newLocation' in data:
        newuris = r_brokenRedirect.findall(data)
        if newuris: return newuris[0].strip()

def couldBeFeedData(data):
    if not data:
        return False

    data = data.lower()
    if data.count('<html'): return 0
    return data.count('<rss') + data.count('<rdf') + data.count('<feed')
def isFeed(uri):
    _debuglog('seeing if %s is a feed' % uri)
    protocol = urlparse.urlparse(uri)
    if protocol[0] not in ('http', 'https'): return 0
    data = _gatekeeper.get(uri)
    _debuglog("isFeed -- %s" % uri)
    return (couldBeFeedData(data), uri, data)
def sortFeeds(feed1Info, feed2Info):
    return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank'])

def getFeedsFromSyndic8(uri):
    feeds = []
    try:
        server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
        feedids = server.syndic8.FindFeeds(uri)
        infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank', 'status', 'dataurl'])
        infolist.sort(sortFeeds)
        feeds = [f['dataurl'] for f in infolist if f['status'] == 'Syndicated']
        _debuglog('found %s feeds through Syndic8' % len(feeds))
    except:
        pass
    return feeds
def urls(uri, uri_data):
    #if _recurs is None: _recurs = [uri]
    """fulluri = makeFullURI(uri)

    try:
        data = _gatekeeper.get(fulluri, check=False)
    except:
        return []"""

    # is this already a feed?
    if couldBeFeedData(uri_data):
        result = set()
        result.add((uri, uri_data))
        return result

    """newuri = tryBrokenRedirect(data)

    if newuri and newuri not in _recurs:
        _recurs.append(newuri)
        return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)"""

    # nope, it's a page, try LINK tags first
    _debuglog('looking for LINK tags')

    links = set()

    try:
        linktag_links = set(getLinks(uri_data, uri))
    except Exception, e:
        linktag_links = set()
        _debuglog("Exception in getLinks: %s" % e)

    _debuglog('found %s feeds through LINK tags' % len(linktag_links))
    #outfeeds = process(outfeeds)

    links = links.union(getALinks(uri_data, uri))
    links = links.union(getLocalLinks(links, uri))

    suffixes = [ # filenames used by popular software:
        'atom.xml',   # blogger, TypePad
        'index.atom', # MT, apparently
        'index.rdf',  # MT
        'rss.xml',    # Dave Winer/Manila
        'index.xml',  # MT
        'index.rss'   # Slash
    ]
    #links = links.union([urlparse.urljoin(fulluri, x) for x in suffixes])

    links -= linktag_links

    return linktag_links | set([url for url in links if isFeedLink(url) or isXMLRelatedLink(url)])
def process(urls):
    #print "PROCESSING %s" % str(urls)
    return [(a[1], a[2]) for a in [isFeed(_url) for _url in urls] if a[0] > 0]
##### test harness ######

def test():
    uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
    failed = []
    count = 0
    while 1:
        data = _gatekeeper.get(uri)
        if data.find('Atom autodiscovery test') == -1: break
        sys.stdout.write('.')
        sys.stdout.flush()
        count += 1
        links = getLinks(data, uri)
        if not links:
            print '\n*** FAILED ***', uri, 'could not find link'
            failed.append(uri)
        elif len(links) > 1:
            print '\n*** FAILED ***', uri, 'found too many links'
            failed.append(uri)
        else:
            atomdata = urllib.urlopen(links[0]).read()
            if atomdata.find('<link rel="alternate"') == -1:
                print '\n*** FAILED ***', uri, 'retrieved something that is not a feed'
                failed.append(uri)
            else:
                backlink = atomdata.split('href="').pop().split('"')[0]
                if backlink != uri:
                    print '\n*** FAILED ***', uri, 'retrieved wrong feed'
                    failed.append(uri)
        if data.find('<link rel="next" href="') == -1: break
        uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
    print
    print count, 'tests executed,', len(failed), 'failed'
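
# Minimal command-line sketch, added for illustration only. The module docstring
# describes running "python feedfinder.py <uri>", but this straw variant exposes
# urls()/process() rather than feed()/feeds(), so the glue below is an assumed
# entry point, not straw's own.
if __name__ == '__main__':
    if len(sys.argv) > 1 and sys.argv[1] == 'test':
        test()
    elif len(sys.argv) > 1:
        target = makeFullURI(sys.argv[1])
        page = _gatekeeper.get(target, check=False)
        if couldBeFeedData(page):
            # the URI itself already points to a feed
            print target
        else:
            # otherwise print every candidate that verifies as a feed, one per line
            for feed_uri, feed_data in process(urls(target, page)):
                print feed_uri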