1 """ robotparser.py
3 Copyright (C) 2000 Bastian Kleineidam
5 You can choose between two licenses when using this package:
6 1) GNU GPLv2
7 2) PSF license for Python 2.2
9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
11 """

import urllib.error    # needed for the HTTPError handling in read()
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)
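
    # Note: the catch-all "*" entry is deliberately kept out of self.entries;
    # can_fetch() consults it only after every specific entry has failed to match.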

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)
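
    # Illustrative sketch (not part of the original module): parse() runs the
    # state machine above over lines such as the following made-up record:
    #
    #     User-agent: SpiderBot     -> state 1, collect the agent name
    #     Allow: /private/public    -> state 2, append an allowing RuleLine
    #     Disallow: /private        -> state 2, append a disallowing RuleLine
    #     (blank line)              -> state 2 -> 0, store the entry via _add_entry()
    #
    # "SpiderBot" and the paths are invented example values, not data from this file.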

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True
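
    # Example usage (hypothetical URL and agent name, shown only to illustrate
    # the typical call sequence around can_fetch):
    #
    #     rp = RobotFileParser()
    #     rp.set_url("http://www.example.com/robots.txt")
    #     rp.read()
    #     rp.can_fetch("SpiderBot", "http://www.example.com/private/page.html")
    #
    # The call returns False when the matching entry (or the "*" default entry)
    # disallows the path, and True otherwise.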

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
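
# Illustrative values (not part of the original module): RuleLine matches by
# plain prefix comparison on the quoted path, e.g.
#
#     RuleLine("/private", False).applies_to("/private/data.html")  -> True
#     RuleLine("", False).allowance  -> True (an empty Disallow allows everything)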


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False
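
    # Note: the comparison above is a substring test on the lower-cased name
    # token, so an entry for "bot" also applies to a caller identifying itself
    # as "SpiderBot/1.0" (only the part before the first "/" is compared).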

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
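

# A minimal, self-contained usage sketch. It is not part of the original module;
# the robots.txt lines and agent names below are invented for illustration, and
# parse() is fed literal lines so that no network access is required.
if __name__ == "__main__":
    sample = [
        "User-agent: *",
        "Disallow: /private",
        "Allow: /public",
        "",
        "User-agent: FriendlyBot",
        "Disallow:",
    ]
    rp = RobotFileParser()
    rp.parse(sample)
    rp.modified()
    # FriendlyBot has its own entry whose empty Disallow allows everything.
    print(rp.can_fetch("FriendlyBot", "http://www.example.com/private/page.html"))  # True
    # Any other agent falls back to the "*" entry and is blocked under /private.
    print(rp.can_fetch("OtherBot", "http://www.example.com/private/page.html"))     # False
    print(rp.can_fetch("OtherBot", "http://www.example.com/public/index.html"))     # True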