import unittest, StringIO, robotparser
from test import test_support

class RobotTestCase(unittest.TestCase):

    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        # A url given as an (agent, url) tuple overrides the default agent.
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    # Parse the robots.txt text and register one test case per URL.
    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))
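
# A minimal sketch (illustrative, not part of the original suite) of the
# flow each generated test case exercises, reusing the module-level imports
# above; the agent name "anybot" is made up for the example:
#
#   >>> rp = robotparser.RobotFileParser()
#   >>> rp.parse(StringIO.StringIO("User-agent: *\nDisallow: /tmp/\n").readlines())
#   >>> rp.can_fetch("anybot", "/tmp/x.html")
#   False
#   >>> rp.can_fetch("anybot", "/index.html")
#   True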

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9.  This file is incorrect because "Googlebot" is a substring of
#     "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

# 11.  Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")

# 13.  Google also got the order wrong in #8.  You need to specify the
#      URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")

class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        test_support.requires('network')
        # XXX it depends on an external resource which could be unavailable
        url = 'http://mueblesmoraleda.com'
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        try:
            parser.read()
        except IOError:
            self.skipTest('%s is unavailable' % url)
        self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)

    def testPythonOrg(self):
        test_support.requires('network')
        parser = robotparser.RobotFileParser(
            "http://www.python.org/robots.txt")
        parser.read()
        self.assertTrue(parser.can_fetch("*",
                                         "http://www.python.org/robots.txt"))

def test_main():
    test_support.run_unittest(tests)
    test_support.run_unittest(NetworkTestCase)

if __name__=='__main__':
    test_support.verbose = 1
    test_main()