add release date
[python/dscho.git] / Lib / robotparser.py
blob447563fe654d86db1fe8e26c83bfd088d24bf378
1 """ robotparser.py
3 Copyright (C) 2000 Bastian Kleineidam
5 You can choose between two licenses when using this package:
6 1) GNU GPLv2
7 2) PSF license for Python 2.2
9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
11 """
12 import urlparse
13 import urllib
15 __all__ = ["RobotFileParser"]
class RobotFileParser:
    """Read, parse and answer questions about a single robots.txt file.

    Instance attributes:
      url, host, path -- the robots.txt URL and its network location / path
      entries         -- records that name specific user agents, in file order
      default_entry   -- the record for the catch-all agent "*", or None
      disallow_all    -- set by read() when the fetch returned 401 or 403
      allow_all       -- set by read() for any other status >= 400
      last_checked    -- timestamp stored by modified(); 0 until then
      errcode         -- HTTP status seen by read(); only exists after read()
    """

    def __init__(self, url=''):
        """Create a parser for the robots.txt file located at url."""
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.  The value is
        whatever modified() stored last, or 0 if it was never called.
        """
        return self.last_checked

    def modified(self):
        """Set the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url
        # urlparse() yields (scheme, netloc, path, ...); keep netloc and path.
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Read the robots.txt URL and feed it to the parser.

        A 401/403 answer disallows everything, any other status >= 400
        allows everything, and a 200 answer with content is parsed.
        """
        opener = URLopener()
        f = opener.open(self.url)
        try:
            lines = [line.strip() for line in f]
        finally:
            # Release the response even if reading it fails part-way.
            f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        """File a finished record as the default or as a named entry."""
        if "*" in entry.useragents:
            # The default entry is considered last by can_fetch().  When a
            # file has several "*" records the first one wins, matching the
            # first-match rule used for named agents.
            if self.default_entry is None:
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.

        lines is an iterable of strings; an empty string ends the
        current record.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        for line in lines:
            if not line:
                # A blank line ends the record.  A record that has agents
                # but no rules (state 1) is dropped.
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
                continue
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            fields = line.split(':', 1)
            if len(fields) != 2:
                # Not a "field: value" line; ignore it.
                continue
            key = fields[0].strip().lower()
            value = urllib.unquote(fields[1].strip())
            if key == "user-agent":
                if state == 2:
                    # A new record starts without a separating blank line.
                    self._add_entry(entry)
                    entry = Entry()
                entry.useragents.append(value)
                state = 1
            elif key == "disallow":
                # Rules that precede any user-agent line are ignored.
                if state != 0:
                    entry.rulelines.append(RuleLine(value, False))
                    state = 2
            elif key == "allow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, True))
                    state = 2
        if state == 2:
            # The file ended inside a record.  Route it through _add_entry()
            # so a trailing "*" record becomes the default entry.
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt decide if useragent can fetch url.

        Only the path component of url is compared against the rules.
        Returns True when access is allowed, False otherwise.
        """
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Normalise the quoting of the path; an empty path means "/".
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        # search for given user agent matches
        # the first match counts
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry is not None:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        """Return the parsed records, default entry last, as text."""
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return ''.join([str(entry) + "\n" for entry in entries])
class RuleLine:
    """One "Allow:" or "Disallow:" line of a robots.txt record.

    path is stored URL-quoted; allowance is True for an "Allow:" rule
    and False for a "Disallow:" rule.
    """

    def __init__(self, path, allowance):
        if path == '':
            if not allowance:
                # "Disallow:" with an empty value means allow all
                allowance = True
        self.allowance = allowance
        self.path = urllib.quote(path)

    def applies_to(self, filename):
        """Return True when this rule covers filename."""
        if self.path == "*":
            return True
        return filename.startswith(self.path)

    def __str__(self):
        if self.allowance:
            label = "Allow"
        else:
            label = "Disallow"
        return label + ": " + self.path
class Entry:
    """A robots.txt record: one or more user-agents and zero or more
    rule lines."""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        agent_text = ["User-agent: " + agent + "\n"
                      for agent in self.useragents]
        rule_text = [str(rule) + "\n" for rule in self.rulelines]
        return ''.join(agent_text + rule_text)

    def applies_to(self, useragent):
        """Check whether this entry applies to the specified agent."""
        # Compare only the lower-cased name token before any "/version".
        name = useragent.split("/")[0].lower()
        for agent in self.useragents:
            # "*" is the catch-all agent; otherwise the listed agent must
            # occur as a substring of the name token.
            if agent == '*' or agent.lower() in name:
                return True
        return False

    def allowance(self, filename):
        """Return the allowance of the first rule matching filename.

        Preconditions:
        - our agent applies to this entry
        - filename is URL decoded

        Returns True when no rule matches.
        """
        for rule in self.rulelines:
            if rule.applies_to(filename):
                return rule.allowance
        return True
class URLopener(urllib.FancyURLopener):
    """FancyURLopener that records the HTTP error code of a request.

    errcode starts out as 200 and is replaced by the error code passed
    to http_error_default().
    """

    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        # If the robots.txt file is accessible only with a password,
        # we act as if the file wasn't there: supply no credentials.
        return (None, None)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # Remember the status, then defer to the stock handler.
        self.errcode = errcode
        base_handler = urllib.FancyURLopener.http_error_default
        return base_handler(self, url, fp, errcode, errmsg, headers)