svn+ssh://pythondev@svn.python.org/python/trunk ........ r65012 | jesse.noller | 2008-07-16 15:24:06 +0200 (Wed, 16 Jul 2008) | 2 lines Apply patch for issue 3090: ARCHFLAGS parsing incorrect ........ r65035 | georg.brandl | 2008-07-16 23:19:28 +0200 (Wed, 16 Jul 2008) | 2 lines #3045: fix pydoc behavior for TEMP path with spaces. ........ r65037 | georg.brandl | 2008-07-16 23:31:41 +0200 (Wed, 16 Jul 2008) | 2 lines #1608818: errno can get set by every call to readdir(). ........ r65038 | georg.brandl | 2008-07-17 00:04:20 +0200 (Thu, 17 Jul 2008) | 2 lines #3305: self->stream can be NULL. ........ r65039 | georg.brandl | 2008-07-17 00:09:17 +0200 (Thu, 17 Jul 2008) | 2 lines #3345: fix docstring. ........ r65040 | georg.brandl | 2008-07-17 00:33:18 +0200 (Thu, 17 Jul 2008) | 2 lines #3312: fix two sqlite3 crashes. ........ r65048 | georg.brandl | 2008-07-17 01:35:54 +0200 (Thu, 17 Jul 2008) | 2 lines #3388: add a paragraph about using "with" for file objects. ........ r65057 | gregory.p.smith | 2008-07-17 05:13:05 +0200 (Thu, 17 Jul 2008) | 2 lines news note for r63052 ........ r65077 | jesse.noller | 2008-07-17 23:01:05 +0200 (Thu, 17 Jul 2008) | 3 lines Fix issue 3395, update _debugInfo to be _debug_info ........ r65091 | ronald.oussoren | 2008-07-18 07:48:03 +0200 (Fri, 18 Jul 2008) | 2 lines Last bit of a fix for issue3381 (addon for my patch in r65061) ........ r65092 | vinay.sajip | 2008-07-18 10:59:06 +0200 (Fri, 18 Jul 2008) | 1 line Issue #3389: Allow resolving dotted names for handlers in logging configuration files. Thanks to Philip Jenvey for the patch. ........ r65093 | vinay.sajip | 2008-07-18 11:00:00 +0200 (Fri, 18 Jul 2008) | 1 line Issue #3389: Allow resolving dotted names for handlers in logging configuration files. Thanks to Philip Jenvey for the patch. ........ r65094 | vinay.sajip | 2008-07-18 11:00:35 +0200 (Fri, 18 Jul 2008) | 1 line Issue #3389: Allow resolving dotted names for handlers in logging configuration files. 
Thanks to Philip Jenvey for the patch. ........ r65095 | vinay.sajip | 2008-07-18 11:01:10 +0200 (Fri, 18 Jul 2008) | 1 line Issue #3389: Allow resolving dotted names for handlers in logging configuration files. Thanks to Philip Jenvey for the patch. ........ r65097 | georg.brandl | 2008-07-18 12:20:59 +0200 (Fri, 18 Jul 2008) | 2 lines Remove duplicate entry in __all__. ........ r65098 | georg.brandl | 2008-07-18 12:29:30 +0200 (Fri, 18 Jul 2008) | 2 lines Correct attribute name. ........ r65099 | georg.brandl | 2008-07-18 13:15:06 +0200 (Fri, 18 Jul 2008) | 3 lines Document the different meaning of precision for {:f} and {:g}. Also document how inf and nan are formatted. #3404. ........ r65127 | raymond.hettinger | 2008-07-19 02:42:03 +0200 (Sat, 19 Jul 2008) | 1 line Improve accuracy of gamma test function ........ r65128 | raymond.hettinger | 2008-07-19 02:43:00 +0200 (Sat, 19 Jul 2008) | 1 line Add recipe to the itertools docs. ........ r65131 | georg.brandl | 2008-07-19 12:08:55 +0200 (Sat, 19 Jul 2008) | 2 lines #3378: in case of no memory, don't leak even more memory. :) ........ r65133 | georg.brandl | 2008-07-19 14:39:10 +0200 (Sat, 19 Jul 2008) | 3 lines #3302: fix segfaults when passing None for arguments that can't be NULL for the C functions. ........ r65134 | georg.brandl | 2008-07-19 14:46:12 +0200 (Sat, 19 Jul 2008) | 2 lines #3303: fix crash with invalid Py_DECREF in strcoll(). ........ r65135 | georg.brandl | 2008-07-19 15:00:22 +0200 (Sat, 19 Jul 2008) | 3 lines #3319: don't raise ZeroDivisionError if number of rounds is so low that benchtime is zero. ........ r65136 | georg.brandl | 2008-07-19 15:09:42 +0200 (Sat, 19 Jul 2008) | 3 lines #3323: mention that if inheriting from a class without __slots__, the subclass will have a __dict__ available too. ........ r65139 | georg.brandl | 2008-07-19 15:48:44 +0200 (Sat, 19 Jul 2008) | 2 lines Add ordering info for findall and finditer. ........ 
r65149 | raymond.hettinger | 2008-07-20 01:21:57 +0200 (Sun, 20 Jul 2008) | 1 line Fix compress() recipe in docs to use itertools. ........ r65150 | raymond.hettinger | 2008-07-20 01:58:47 +0200 (Sun, 20 Jul 2008) | 1 line Clean-up itertools docs and recipes. ........ r65151 | gregory.p.smith | 2008-07-20 02:22:08 +0200 (Sun, 20 Jul 2008) | 9 lines fix issue3120 - don't truncate handles on 64-bit Windows. This is still messy, realistically PC/_subprocess.c should never cast pointers to python numbers and back at all. I don't have a 64-bit windows build environment because microsoft apparently thinks that should cost money. Time to watch the buildbots. It builds and passes tests on 32-bit windows. ........ r65155 | georg.brandl | 2008-07-20 13:50:29 +0200 (Sun, 20 Jul 2008) | 2 lines #926501: add info where to put the docstring. ........ r65158 | neal.norwitz | 2008-07-20 21:35:23 +0200 (Sun, 20 Jul 2008) | 1 line Fix a couple of names in error messages that were wrong ........ r65159 | neal.norwitz | 2008-07-20 22:39:36 +0200 (Sun, 20 Jul 2008) | 1 line Fix misspeeld method name (negative) ........ r65176 | amaury.forgeotdarc | 2008-07-21 23:36:24 +0200 (Mon, 21 Jul 2008) | 4 lines Increment version number in NEWS file, and move items that were added after 2.6b2. (I thought there was a script to automate this kind of updates) ........ r65177 | amaury.forgeotdarc | 2008-07-22 00:00:38 +0200 (Tue, 22 Jul 2008) | 5 lines Issue2378: pdb would delete free variables when stepping into a class statement. The problem was introduced by r53954, the correction is to restore the symmetry between PyFrame_FastToLocals and PyFrame_LocalsToFast ........ r65178 | benjamin.peterson | 2008-07-22 00:05:34 +0200 (Tue, 22 Jul 2008) | 1 line don't use assert statement ........ r65183 | ronald.oussoren | 2008-07-22 09:06:00 +0200 (Tue, 22 Jul 2008) | 2 lines Fix buglet in fix for issue3381 ........ 
r65184 | ronald.oussoren | 2008-07-22 09:06:33 +0200 (Tue, 22 Jul 2008) | 2 lines Fix build issue on OSX 10.4, somehow this wasn't committed before. ........ r65187 | raymond.hettinger | 2008-07-22 20:54:02 +0200 (Tue, 22 Jul 2008) | 1 line Remove out-of-date section on Exact/Inexact. ........ r65188 | raymond.hettinger | 2008-07-22 21:00:47 +0200 (Tue, 22 Jul 2008) | 1 line Tuples now have both count() and index(). ........ r65189 | raymond.hettinger | 2008-07-22 21:03:05 +0200 (Tue, 22 Jul 2008) | 1 line Fix credits for math.sum() ........ r65190 | raymond.hettinger | 2008-07-22 21:18:50 +0200 (Tue, 22 Jul 2008) | 1 line One more attribution. ........ r65192 | benjamin.peterson | 2008-07-23 01:44:37 +0200 (Wed, 23 Jul 2008) | 1 line remove unneeded import ........ r65194 | benjamin.peterson | 2008-07-23 15:25:06 +0200 (Wed, 23 Jul 2008) | 1 line use isinstance ........
167 lines
3.9 KiB
Python
167 lines
3.9 KiB
Python
import io
|
|
import unittest
|
|
import urllib.robotparser
|
|
from test import support
|
|
|
|
class RobotTestCase(unittest.TestCase):
    """One can_fetch() expectation checked against an already-parsed robots.txt.

    Each instance pairs a single URL with the verdict the parser is expected
    to give for it.  Instances are built with explicit constructor arguments
    instead of being discovered by test-method name.
    """

    # The constructor requires arguments, so name-based collectors (pytest,
    # nose) cannot instantiate this class on their own; opt out of collection.
    __test__ = False

    def __init__(self, index, parser, url, good, agent):
        """Record the expectation to verify in runTest().

        index  -- number of the robots.txt example; used only in the label
        parser -- object whose can_fetch(agent, url) is queried
        url    -- URL string, or an (agent, url) tuple that overrides *agent*
        good   -- true if the URL is expected to be fetchable
        agent  -- user-agent used when *url* is a plain string
        """
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        # No docstring on purpose: unittest would append its first line to
        # the label returned by __str__ in verbose output.
        if isinstance(self.url, tuple):
            # A tuple entry carries its own user-agent.
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        allowed = self.parser.can_fetch(agent, url)
        # assertTrue/assertFalse replace the failUnless/failIf aliases,
        # which no longer exist in current unittest releases.
        if self.good:
            self.assertTrue(allowed)
        else:
            self.assertFalse(allowed)

    def __str__(self):
        """Return the "RobotTest(index, good|bad, url)" label built in __init__."""
        return self.str


# Suite that accumulates every generated RobotTestCase; test_main() runs it.
tests = unittest.TestSuite()


def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    """Parse *robots_txt* once and queue one test case per listed URL.

    index      -- example number passed through to each test case
    robots_txt -- robots.txt content as a single string
    good_urls  -- entries expected to be fetchable
    bad_urls   -- entries expected to be refused
    agent      -- user-agent passed through to each test case

    The cases are appended to the module-level ``tests`` suite, good URLs
    first and bad URLs after them.
    """
    robot_parser = urllib.robotparser.RobotFileParser()
    robot_parser.parse(io.StringIO(robots_txt).readlines())

    # 1 flags URLs expected to be allowed, 0 those expected to be denied.
    for expected, urls in ((1, good_urls), (0, bad_urls)):
        for url in urls:
            tests.addTest(
                RobotTestCase(index, robot_parser, url, expected, agent))


# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. A single wildcard record listing three disallowed paths.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = [
    '/',
    '/test.html',
]
bad = [
    '/cyberworld/map/index.html',
    '/tmp/xxx',
    '/foo.html',
]

RobotTest(1, doc, good, bad)

# 2. A wildcard record plus a separate record for "cybermapper" with an
# empty Disallow; blank lines separate the records.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = [
    '/',
    '/test.html',
    # (agent, url) entry: queried as "cybermapper" instead of the default.
    ('cybermapper', '/cyberworld/map/index.html'),
]
bad = [
    '/cyberworld/map/index.html',
]

RobotTest(2, doc, good, bad)

# 3. Everything is disallowed for every agent.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = [
    '/cyberworld/map/index.html',
    '/',
    '/tmp/',
]

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4. and 5.  One record for "figtree", queried first with the bare agent
# name and then with a longer user-agent string.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = []  # XFAIL '/a/b.html'
bad = [
    '/tmp',
    '/tmp.html',
    '/tmp/a.html',
    '/a%3cd.html',
    '/a%3Cd.html',
    '/a%2fb.html',
    '/~joe/index.html',
]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6. Wildcard record; '/tmp/' is disallowed but '/tmp' itself is not.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = [
    '/tmp',
]  # XFAIL: '/a%2fb.html'
bad = [
    '/tmp/',
    '/tmp/a.html',
    '/a%3cd.html',
    '/a%3Cd.html',
    "/a/b.html",
    '/%7Ejoe/index.html',
]

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7. A "Disallow: /." rule must not block ordinary paths.
doc = """
User-Agent: *
Disallow: /.
"""

good = [
    '/foo.html',
]
# Bug report says "/" should be denied, but that is not in the RFC
bad = []

RobotTest(7, doc, good, bad)

class NetworkTestCase(unittest.TestCase):
    """Checks that read real robots.txt files over the network.

    Every test is skipped unless the 'network' test resource is enabled.
    """

    def setUp(self):
        # Report an explicit skip rather than returning silently, so a run
        # without the network resource is not counted as a pass.
        if not support.is_resource_enabled('network'):
            self.skipTest("'network' resource not enabled")

    def testPasswordProtectedSite(self):
        # whole site is password-protected.
        url = 'http://mueblesmoraleda.com'
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)

    def testPythonOrg(self):
        # The robots.txt URL itself is expected to be fetchable.
        parser = urllib.robotparser.RobotFileParser(
            "http://www.python.org/robots.txt")
        parser.read()
        self.assertTrue(parser.can_fetch("*",
                                         "http://www.python.org/robots.txt"))


def test_main():
    """Run the network checks, then the generated robots.txt suite.

    Each is handed to support.run_unittest() in its own call, in that order.
    """
    for target in (NetworkTestCase, tests):
        support.run_unittest(target)


if __name__ == "__main__":
    # Direct execution: set the support module's verbose flag, then run.
    support.verbose = 1
    test_main()