Completely get rid of StringIO.py and cStringIO.c. I had to fix a few tests and modules beyond what Christian did, and invent a few conventions. E.g. in elementtree, I chose to write/return Unicode strings when no encoding is given, but bytes when an explicit encoding is given. Also mimetools was made to always assume binary files.
144 lines
3.1 KiB
Python
144 lines
3.1 KiB
Python
import unittest, robotparser
|
|
import io
|
|
from test import test_support
|
|
|
|
class RobotTestCase(unittest.TestCase):
    """A single can_fetch() expectation checked against a parsed robots.txt.

    Each instance asserts that ``parser.can_fetch(agent, url)`` is true
    (when *good* is true) or false (when *good* is false) for one URL.
    """

    # The constructor requires five arguments, so name-based collectors
    # (e.g. pytest) cannot instantiate this class on their own; instances
    # must be created explicitly and added to a suite.
    __test__ = False

    def __init__(self, index, parser, url, good, agent):
        """Record the expectation to verify in runTest().

        index  -- number of the robots.txt example; only used by __str__
        parser -- object providing can_fetch(agent, url)
        url    -- URL string, or an (agent, url) tuple that overrides *agent*
        good   -- true if the fetch must be allowed, false if it must not
        agent  -- user agent used when *url* is not a tuple
        """
        unittest.TestCase.__init__(self)
        verdict = "good" if good else "bad"
        self.str = "RobotTest(%d, %s, %s)" % (index, verdict, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        """Ask the parser about the URL and assert the expected answer."""
        if isinstance(self.url, tuple):
            # A tuple entry carries its own user agent.
            agent, url = self.url
        else:
            agent, url = self.agent, self.url
        allowed = self.parser.can_fetch(agent, url)
        # assertTrue/assertFalse replace the failUnless/failIf aliases,
        # which no longer exist in current unittest releases.
        if self.good:
            self.assertTrue(allowed)
        else:
            self.assertFalse(allowed)

    def __str__(self):
        """Return the label built in __init__."""
        return self.str
|
|
|
|
# Suite collecting every case generated by RobotTest(); run by test_main().
tests = unittest.TestSuite()


def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    """Parse *robots_txt* once and queue one test case per listed URL.

    index      -- example number, forwarded to each RobotTestCase
    robots_txt -- text of the robots.txt file to parse
    good_urls  -- entries for which can_fetch() is expected to be true
    bad_urls   -- entries for which can_fetch() is expected to be false
    agent      -- user agent forwarded to each RobotTestCase

    Entries may be URL strings or (agent, url) tuples; they are passed to
    RobotTestCase unchanged.  Cases are appended to the module-level
    ``tests`` suite, good URLs first.
    """
    rules = robotparser.RobotFileParser()
    rules.parse(io.StringIO(robots_txt).readlines())

    # 1 marks URLs that must be fetchable, 0 those that must be refused.
    for expected, urls in ((1, good_urls), (0, bad_urls)):
        for entry in urls:
            tests.addTest(RobotTestCase(index, rules, entry, expected, agent))
|
|
|
|
# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. One wildcard record with three Disallow rules; two of the rules carry
#    trailing comments.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = [
    '/',
    '/test.html',
]
bad = [
    '/cyberworld/map/index.html',
    '/tmp/xxx',
    '/foo.html',
]

RobotTest(1, doc, good_urls=good, bad_urls=bad)
|
|
|
|
# 2. A wildcard record plus a "cybermapper" record whose Disallow is empty.
#    The (agent, url) tuple checks the same URL under that agent.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = [
    '/',
    '/test.html',
    ('cybermapper', '/cyberworld/map/index.html'),
]
bad = [
    '/cyberworld/map/index.html',
]

RobotTest(2, doc, good_urls=good, bad_urls=bad)
|
|
|
|
# 3. "Disallow: /" for every agent: no URL is expected to be fetchable.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = [
    '/cyberworld/map/index.html',
    '/',
    '/tmp/',
]

RobotTest(3, doc, good_urls=good, bad_urls=bad)
|
|
|
|
# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4. and 5. The same "figtree" record, checked once with the bare agent
#    name and once with a longer user-agent string.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = []  # XFAIL '/a/b.html'
bad = [
    '/tmp',
    '/tmp.html',
    '/tmp/a.html',
    '/a%3cd.html',
    '/a%3Cd.html',
    '/a%2fb.html',
    '/~joe/index.html',
]

RobotTest(4, doc, good_urls=good, bad_urls=bad, agent='figtree')
RobotTest(5, doc, good_urls=good, bad_urls=bad,
          agent='FigTree Robot libwww-perl/5.04')
|
|
|
|
# 6. Wildcard record; here "/tmp/" is disallowed with a trailing slash, so
#    the plain "/tmp" is listed among the URLs expected to be fetchable.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = [
    '/tmp',
]  # XFAIL: '/a%2fb.html'
bad = [
    '/tmp/',
    '/tmp/a.html',
    '/a%3cd.html',
    '/a%3Cd.html',
    "/a/b.html",
    '/%7Ejoe/index.html',
]

RobotTest(6, doc, good_urls=good, bad_urls=bad)
|
|
|
|
# From bug report #523041

# 7. A "Disallow: /." rule under a "User-Agent" header with a capital A.
doc = """
User-Agent: *
Disallow: /.
"""

good = [
    '/foo.html',
]
# Bug report says "/" should be denied, but that is not in the RFC
bad = []

RobotTest(7, doc, good_urls=good, bad_urls=bad)
|
|
|
|
def test_main():
    """Run the module-level ``tests`` suite via test_support.run_unittest()."""
    test_support.run_unittest(tests)


if __name__ == '__main__':
    # The support module's flag is spelled in lowercase; assigning to
    # "Verbose" only created an unused attribute and changed nothing.
    test_support.verbose = 1
    test_main()
|