1 """ robotparser.py
3 Copyright (C) 2000 Bastian Kleineidam
5 You can choose between two licenses when using this package:
6 1) GNU GPLv2
7 2) PSF license for Python 2.2
9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
11 """

import urllib.error    # needed for the HTTPError handling in read()
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)
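
    # Note: the catch-all "*" entry is deliberately kept out of self.entries;
    # can_fetch() consults it only after every specific entry has failed to match.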

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)
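
    # Illustrative sketch (not part of the original module): parse() runs the
    # state machine above over lines such as the following made-up record:
    #
    #     User-agent: SpiderBot     -> state 1, collect the agent name
    #     Allow: /private/public    -> state 2, append an allowing RuleLine
    #     Disallow: /private        -> state 2, append a disallowing RuleLine
    #     (blank line)              -> state 2 -> 0, store the entry via _add_entry()
    #
    # "SpiderBot" and the paths are invented example values, not data from this file.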

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True
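
    # Example usage (hypothetical URL and agent name, shown only to illustrate
    # the typical call sequence around can_fetch):
    #
    #     rp = RobotFileParser()
    #     rp.set_url("http://www.example.com/robots.txt")
    #     rp.read()
    #     rp.can_fetch("SpiderBot", "http://www.example.com/private/page.html")
    #
    # The call returns False when the matching entry (or the "*" default entry)
    # disallows the path, and True otherwise.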

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
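
# Illustrative values (not part of the original module): RuleLine matches by
# plain prefix comparison on the quoted path, e.g.
#
#     RuleLine("/private", False).applies_to("/private/data.html")  -> True
#     RuleLine("", False).allowance  -> True (an empty Disallow allows everything)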


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False
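
    # Note: the comparison above is a substring test on the lower-cased name
    # token, so an entry for "bot" also applies to a caller identifying itself
    # as "SpiderBot/1.0" (only the part before the first "/" is compared).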

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
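

# A minimal, self-contained usage sketch. It is not part of the original module;
# the robots.txt lines and agent names below are invented for illustration, and
# parse() is fed literal lines so that no network access is required.
if __name__ == "__main__":
    sample = [
        "User-agent: *",
        "Disallow: /private",
        "Allow: /public",
        "",
        "User-agent: FriendlyBot",
        "Disallow:",
    ]
    rp = RobotFileParser()
    rp.parse(sample)
    rp.modified()
    # FriendlyBot has its own entry whose empty Disallow allows everything.
    print(rp.can_fetch("FriendlyBot", "http://www.example.com/private/page.html"))  # True
    # Any other agent falls back to the "*" entry and is blocked under /private.
    print(rp.can_fetch("OtherBot", "http://www.example.com/private/page.html"))     # False
    print(rp.can_fetch("OtherBot", "http://www.example.com/public/index.html"))     # True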