robotparser.py

   1 """ robotparser.py
   2
   3     Copyright (C) 2000  Bastian Kleineidam
   4
   5     You can choose between two licenses when using this package:
   6     1) GNU GPLv2
   7     2) PSF license for Python 2.2
   8
   9     The robots.txt Exclusion Protocol is implemented as specified in
  10     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
  11 """
  12 import urlparse,urllib
  13
  14 __all__ = ["RobotFileParser"]
  15
  16 debug = 0
  17
  18 def _debug(msg):
  19     if debug: print msg
  20
  21
  22 class RobotFileParser:
  23     """ This class provides a set of methods to read, parse and answer
  24     questions about a single robots.txt file.
  25
  26     """
  27
  28     def __init__(self, url=''):
  29         self.entries = []
  30         self.default_entry = None
  31         self.disallow_all = False
  32         self.allow_all = False
  33         self.set_url(url)
  34         self.last_checked = 0
  35
  36     def mtime(self):
  37         """Returns the time the robots.txt file was last fetched.
  38
  39         This is useful for long-running web spiders that need to
  40         check for new robots.txt files periodically.
  41
  42         """
  43         return self.last_checked
  44
  45     def modified(self):
  46         """Sets the time the robots.txt file was last fetched to the
  47         current time.
  48
  49         """
  50         import time
  51         self.last_checked = time.time()
  52
  53     def set_url(self, url):
  54         """Sets the URL referring to a robots.txt file."""
  55         self.url = url
  56         self.host, self.path = urlparse.urlparse(url)[1:3]
  57
  58     def read(self):
  59         """Reads the robots.txt URL and feeds it to the parser."""
  60         opener = URLopener()
  61         f = opener.open(self.url)
  62         lines = []
  63         line = f.readline()
  64         while line:
  65             lines.append(line.strip())
  66             line = f.readline()
  67         self.errcode = opener.errcode
  68         if self.errcode == 401 or self.errcode == 403:
  69             self.disallow_all = True
  70             _debug("disallow all")
  71         elif self.errcode >= 400:
  72             self.allow_all = True
  73             _debug("allow all")
  74         elif self.errcode == 200 and lines:
  75             _debug("parse lines")
  76             self.parse(lines)
  77
  78     def _add_entry(self, entry):
  79         if "*" in entry.useragents:
  80             # the default entry is considered last
  81             self.default_entry = entry
  82         else:
  83             self.entries.append(entry)
  84
  85     def parse(self, lines):
  86         """parse the input lines from a robots.txt file.
  87            We allow that a user-agent: line is not preceded by
  88            one or more blank lines."""
  89         state = 0
  90         linenumber = 0
  91         entry = Entry()
  92
  93         for line in lines:
  94             linenumber = linenumber + 1
  95             if not line:
  96                 if state==1:
  97                     _debug("line %d: warning: you should insert"
  98                            " allow: or disallow: directives below any"
  99                            " user-agent: line" % linenumber)
 100                     entry = Entry()
 101                     state = 0
 102                 elif state==2:
 103                     self._add_entry(entry)
 104                     entry = Entry()
 105                     state = 0
 106             # remove optional comment and strip line
 107             i = line.find('#')
 108             if i>=0:
 109                 line = line[:i]
 110             line = line.strip()
 111             if not line:
 112                 continue
 113             line = line.split(':', 1)
 114             if len(line) == 2:
 115                 line[0] = line[0].strip().lower()
 116                 line[1] = urllib.unquote(line[1].strip())
 117                 if line[0] == "user-agent":
 118                     if state==2:
 119                         _debug("line %d: warning: you should insert a blank"
 120                                " line before any user-agent"
 121                                " directive" % linenumber)
 122                         self._add_entry(entry)
 123                         entry = Entry()
 124                     entry.useragents.append(line[1])
 125                     state = 1
 126                 elif line[0] == "disallow":
 127                     if state==0:
 128                         _debug("line %d: error: you must insert a user-agent:"
 129                                " directive before this line" % linenumber)
 130                     else:
 131                         entry.rulelines.append(RuleLine(line[1], False))
 132                         state = 2
 133                 elif line[0] == "allow":
 134                     if state==0:
 135                         _debug("line %d: error: you must insert a user-agent:"
 136                                " directive before this line" % linenumber)
 137                     else:
 138                         entry.rulelines.append(RuleLine(line[1], True))
 139                 else:
 140                     _debug("line %d: warning: unknown key %s" % (linenumber,
 141                                line[0]))
 142             else:
 143                 _debug("line %d: error: malformed line %s"%(linenumber, line))
 144         if state==2:
 145             self.entries.append(entry)
 146         _debug("Parsed rules:\n%s" % str(self))
 147
 148
 149     def can_fetch(self, useragent, url):
 150         """using the parsed robots.txt decide if useragent can fetch url"""
 151         _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
 152                (useragent, url))
 153         if self.disallow_all:
 154             return False
 155         if self.allow_all:
 156             return True
 157         # search for given user agent matches
 158         # the first match counts
 159         url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
 160         for entry in self.entries:
 161             if entry.applies_to(useragent):
 162                 return entry.allowance(url)
 163         # try the default entry last
 164         if self.default_entry:
 165             return self.default_entry.allowance(url)
 166         # agent not found ==> access granted
 167         return True
 168
 169
 170     def __str__(self):
 171         ret = ""
 172         for entry in self.entries:
 173             ret = ret + str(entry) + "\n"
 174         return ret
 175
 176
 177 class RuleLine:
 178     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
 179        (allowance==False) followed by a path."""
 180     def __init__(self, path, allowance):
 181         if path == '' and not allowance:
 182             # an empty value means allow all
 183             allowance = True
 184         self.path = urllib.quote(path)
 185         self.allowance = allowance
 186
 187     def applies_to(self, filename):
 188         return self.path=="*" or filename.startswith(self.path)
 189
 190     def __str__(self):
 191         return (self.allowance and "Allow" or "Disallow")+": "+self.path
 192
 193
 194 class Entry:
 195     """An entry has one or more user-agents and zero or more rulelines"""
 196     def __init__(self):
 197         self.useragents = []
 198         self.rulelines = []
 199
 200     def __str__(self):
 201         ret = ""
 202         for agent in self.useragents:
 203             ret = ret + "User-agent: "+agent+"\n"
 204         for line in self.rulelines:
 205             ret = ret + str(line) + "\n"
 206         return ret
 207
 208     def applies_to(self, useragent):
 209         """check if this entry applies to the specified agent"""
 210         # split the name token and make it lower case
 211         useragent = useragent.split("/")[0].lower()
 212         for agent in self.useragents:
 213             if agent=='*':
 214                 # we have the catch-all agent
 215                 return True
 216             agent = agent.lower()
 217             if agent in useragent:
 218                 return True
 219         return False
 220
 221     def allowance(self, filename):
 222         """Preconditions:
 223         - our agent applies to this entry
 224         - filename is URL decoded"""
 225         for line in self.rulelines:
 226             _debug((filename, str(line), line.allowance))
 227             if line.applies_to(filename):
 228                 return line.allowance
 229         return True
 230
 231 class URLopener(urllib.FancyURLopener):
 232     def __init__(self, *args):
 233         urllib.FancyURLopener.__init__(self, *args)
 234         self.errcode = 200
 235
 236     def prompt_user_passwd(self, host, realm):
 237         ## If robots.txt file is accessible only with a password,
 238         ## we act as if the file wasn't there.
 239         return None, None
 240
 241     def http_error_default(self, url, fp, errcode, errmsg, headers):
 242         self.errcode = errcode
 243         return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
 244                                                         errmsg, headers)
 245
 246 def _check(a,b):
 247     if not b:
 248         ac = "access denied"
 249     else:
 250         ac = "access allowed"
 251     if a!=b:
 252         print "failed"
 253     else:
 254         print "ok (%s)" % ac
 255     print
 256
 257 def _test():
 258     global debug
 259     rp = RobotFileParser()
 260     debug = 1
 261
 262     # robots.txt that exists, gotten to by redirection
 263     rp.set_url('http://www.musi-cal.com/robots.txt')
 264     rp.read()
 265
 266     # test for re.escape
 267     _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
 268     # this should match the first rule, which is a disallow
 269     _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
 270     # various cherry pickers
 271     _check(rp.can_fetch('CherryPickerSE',
 272                        'http://www.musi-cal.com/cgi-bin/event-search'
 273                        '?city=San+Francisco'), 0)
 274     _check(rp.can_fetch('CherryPickerSE/1.0',
 275                        'http://www.musi-cal.com/cgi-bin/event-search'
 276                        '?city=San+Francisco'), 0)
 277     _check(rp.can_fetch('CherryPickerSE/1.5',
 278                        'http://www.musi-cal.com/cgi-bin/event-search'
 279                        '?city=San+Francisco'), 0)
 280     # case sensitivity
 281     _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
 282     _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
 283     # substring test
 284     _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
 285     # tests for catch-all * agent
 286     _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
 287     _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
 288     _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
 289     _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
 290
 291     # robots.txt that does not exist
 292     rp.set_url('http://www.lycos.com/robots.txt')
 293     rp.read()
 294     _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
 295
 296 if __name__ == '__main__':
 297     _test()