1 """ robotparser.py
3 Copyright (C) 2000 Bastian Kleineidam
5 You can choose between two licenses when using this package:
6 1) GNU GPLv2
7 2) PSF license for Python 2.2
9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
11 """
12 import urlparse
13 import urllib
15 __all__ = ["RobotFileParser"]
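
# A minimal usage sketch (illustrative only; the URL and agent name below are
# made-up examples): point the parser at a site's robots.txt, fetch it, and
# ask whether a given user agent may retrieve a URL.
#
#     rp = RobotFileParser("http://www.example.com/robots.txt")
#     rp.read()
#     rp.can_fetch("ExampleBot", "http://www.example.com/private/page.html")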


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()
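
    # Illustrative sketch of how mtime()/modified() support periodic
    # refreshing in a long-running crawler (the 24-hour interval and the
    # name `rp` are arbitrary examples, not part of this module):
    #
    #     if time.time() - rp.mtime() > 24 * 3600:
    #         rp.read()
    #         rp.modified()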

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        self.errcode = opener.errcode
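        # Interpret the status code: 401/403 means the file is protected,
        # so treat everything as disallowed; any other error means there is
        # no usable robots.txt, so allow everything; only a 200 response
        # with content is actually parsed.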
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        A user-agent: line does not have to be preceded by one or more
        blank lines.
        """
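        # parser state: 0 = expecting a user-agent line,
        #               1 = inside a block of user-agent lines,
        #               2 = inside a block of rule lines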
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
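        # the file may end without a trailing blank line, so flush the
        # final entry if it has accumulated any rule lines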
        if state == 2:
            self.entries.append(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
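        # reduce the URL to its quoted path component so it can be compared
        # against the (quoted) paths stored in the rule lines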
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
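        # "*" matches every path; otherwise the rule applies when the quoted
        # filename starts with the rule's (quoted) path prefix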
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
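            # a substring match suffices: the agent token from robots.txt
            # only has to appear somewhere in the requesting agent's name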
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
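        # assume success until an HTTP error handler records a real code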
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)