Lib/robotparser.py

   1 """ robotparser.py
   2
   3     Copyright (C) 2000  Bastian Kleineidam
   4
   5     You can choose between two licenses when using this package:
   6     1) GNU GPLv2
   7     2) PSF license for Python 2.2
   8
   9     The robots.txt Exclusion Protocol is implemented as specified in
  10     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
  11 """
  12 import urlparse,urllib
  13
  14 __all__ = ["RobotFileParser"]
  15
  16 debug = 0
  17
  18 def _debug(msg):
  19     if debug: print msg
  20
  21
  22 class RobotFileParser:
  23     """ This class provides a set of methods to read, parse and answer
  24     questions about a single robots.txt file.
  25
  26     """
  27
  28     def __init__(self, url=''):
  29         self.entries = []
  30         self.default_entry = None
  31         self.disallow_all = False
  32         self.allow_all = False
  33         self.set_url(url)
  34         self.last_checked = 0
  35
  36     def mtime(self):
  37         """Returns the time the robots.txt file was last fetched.
  38
  39         This is useful for long-running web spiders that need to
  40         check for new robots.txt files periodically.
  41
  42         """
  43         return self.last_checked
  44
  45     def modified(self):
  46         """Sets the time the robots.txt file was last fetched to the
  47         current time.
  48
  49         """
  50         import time
  51         self.last_checked = time.time()
  52
  53     def set_url(self, url):
  54         """Sets the URL referring to a robots.txt file."""
  55         self.url = url
  56         self.host, self.path = urlparse.urlparse(url)[1:3]
  57
  58     def read(self):
  59         """Reads the robots.txt URL and feeds it to the parser."""
  60         opener = URLopener()
  61         f = opener.open(self.url)
  62         lines = []
  63         line = f.readline()
  64         while line:
  65             lines.append(line.strip())
  66             line = f.readline()
  67         self.errcode = opener.errcode
  68         if self.errcode in (401, 403):
  69             self.disallow_all = True
  70             _debug("disallow all")
  71         elif self.errcode >= 400:
  72             self.allow_all = True
  73             _debug("allow all")
  74         elif self.errcode == 200 and lines:
  75             _debug("parse lines")
  76             self.parse(lines)
  77
  78     def _add_entry(self, entry):
  79         if "*" in entry.useragents:
  80             # the default entry is considered last
  81             self.default_entry = entry
  82         else:
  83             self.entries.append(entry)
  84
  85     def parse(self, lines):
  86         """parse the input lines from a robots.txt file.
  87            We allow that a user-agent: line is not preceded by
  88            one or more blank lines."""
  89         state = 0
  90         linenumber = 0
  91         entry = Entry()
  92
  93         for line in lines:
  94             linenumber = linenumber + 1
  95             if not line:
  96                 if state==1:
  97                     _debug("line %d: warning: you should insert"
  98                            " allow: or disallow: directives below any"
  99                            " user-agent: line" % linenumber)
 100                     entry = Entry()
 101                     state = 0
 102                 elif state==2:
 103                     self._add_entry(entry)
 104                     entry = Entry()
 105                     state = 0
 106             # remove optional comment and strip line
 107             i = line.find('#')
 108             if i>=0:
 109                 line = line[:i]
 110             line = line.strip()
 111             if not line:
 112                 continue
 113             line = line.split(':', 1)
 114             if len(line) == 2:
 115                 line[0] = line[0].strip().lower()
 116                 line[1] = urllib.unquote(line[1].strip())
 117                 if line[0] == "user-agent":
 118                     if state==2:
 119                         _debug("line %d: warning: you should insert a blank"
 120                                " line before any user-agent"
 121                                " directive" % linenumber)
 122                         self._add_entry(entry)
 123                         entry = Entry()
 124                     entry.useragents.append(line[1])
 125                     state = 1
 126                 elif line[0] == "disallow":
 127                     if state==0:
 128                         _debug("line %d: error: you must insert a user-agent:"
 129                                " directive before this line" % linenumber)
 130                     else:
 131                         entry.rulelines.append(RuleLine(line[1], False))
 132                         state = 2
 133                 elif line[0] == "allow":
 134                     if state==0:
 135                         _debug("line %d: error: you must insert a user-agent:"
 136                                " directive before this line" % linenumber)
 137                     else:
 138                         entry.rulelines.append(RuleLine(line[1], True))
 139                 else:
 140                     _debug("line %d: warning: unknown key %s" % (linenumber,
 141                                line[0]))
 142             else:
 143                 _debug("line %d: error: malformed line %s"%(linenumber, line))
 144         if state==2:
 145             self.entries.append(entry)
 146         _debug("Parsed rules:\n%s" % str(self))
 147
 148
 149     def can_fetch(self, useragent, url):
 150         """using the parsed robots.txt decide if useragent can fetch url"""
 151         _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
 152                (useragent, url))
 153         if self.disallow_all:
 154             return False
 155         if self.allow_all:
 156             return True
 157         # search for given user agent matches
 158         # the first match counts
 159         url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
 160         for entry in self.entries:
 161             if entry.applies_to(useragent):
 162                 return entry.allowance(url)
 163         # try the default entry last
 164         if self.default_entry:
 165             return self.default_entry.allowance(url)
 166         # agent not found ==> access granted
 167         return True
 168
 169
 170     def __str__(self):
 171         return ''.join([str(entry) + "\n" for entry in self.entries])
 172
 173
 174 class RuleLine:
 175     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
 176        (allowance==False) followed by a path."""
 177     def __init__(self, path, allowance):
 178         if path == '' and not allowance:
 179             # an empty value means allow all
 180             allowance = True
 181         self.path = urllib.quote(path)
 182         self.allowance = allowance
 183
 184     def applies_to(self, filename):
 185         return self.path=="*" or filename.startswith(self.path)
 186
 187     def __str__(self):
 188         return (self.allowance and "Allow" or "Disallow")+": "+self.path
 189
 190
 191 class Entry:
 192     """An entry has one or more user-agents and zero or more rulelines"""
 193     def __init__(self):
 194         self.useragents = []
 195         self.rulelines = []
 196
 197     def __str__(self):
 198         ret = []
 199         for agent in self.useragents:
 200             ret.extend(["User-agent: ", agent, "\n"])
 201         for line in self.rulelines:
 202             ret.extend([str(line), "\n"])
 203         return ''.join(ret)
 204
 205     def applies_to(self, useragent):
 206         """check if this entry applies to the specified agent"""
 207         # split the name token and make it lower case
 208         useragent = useragent.split("/")[0].lower()
 209         for agent in self.useragents:
 210             if agent=='*':
 211                 # we have the catch-all agent
 212                 return True
 213             agent = agent.lower()
 214             if agent in useragent:
 215                 return True
 216         return False
 217
 218     def allowance(self, filename):
 219         """Preconditions:
 220         - our agent applies to this entry
 221         - filename is URL decoded"""
 222         for line in self.rulelines:
 223             _debug((filename, str(line), line.allowance))
 224             if line.applies_to(filename):
 225                 return line.allowance
 226         return True
 227
 228 class URLopener(urllib.FancyURLopener):
 229     def __init__(self, *args):
 230         urllib.FancyURLopener.__init__(self, *args)
 231         self.errcode = 200
 232
 233     def http_error_default(self, url, fp, errcode, errmsg, headers):
 234         self.errcode = errcode
 235         return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
 236                                                         errmsg, headers)
 237
 238 def _check(a,b):
 239     if not b:
 240         ac = "access denied"
 241     else:
 242         ac = "access allowed"
 243     if a!=b:
 244         print "failed"
 245     else:
 246         print "ok (%s)" % ac
 247     print
 248
 249 def _test():
 250     global debug
 251     rp = RobotFileParser()
 252     debug = 1
 253
 254     # robots.txt that exists, gotten to by redirection
 255     rp.set_url('http://www.musi-cal.com/robots.txt')
 256     rp.read()
 257
 258     # test for re.escape
 259     _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
 260     # this should match the first rule, which is a disallow
 261     _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
 262     # various cherry pickers
 263     _check(rp.can_fetch('CherryPickerSE',
 264                        'http://www.musi-cal.com/cgi-bin/event-search'
 265                        '?city=San+Francisco'), 0)
 266     _check(rp.can_fetch('CherryPickerSE/1.0',
 267                        'http://www.musi-cal.com/cgi-bin/event-search'
 268                        '?city=San+Francisco'), 0)
 269     _check(rp.can_fetch('CherryPickerSE/1.5',
 270                        'http://www.musi-cal.com/cgi-bin/event-search'
 271                        '?city=San+Francisco'), 0)
 272     # case sensitivity
 273     _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
 274     _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
 275     # substring test
 276     _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
 277     # tests for catch-all * agent
 278     _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
 279     _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
 280     _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
 281     _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
 282
 283     # robots.txt that does not exist
 284     rp.set_url('http://www.lycos.com/robots.txt')
 285     rp.read()
 286     _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
 287
 288 if __name__ == '__main__':
 289     _test()