bpo-21475: Support the Sitemap extension in robotparser (GH-6883)
commit 5db5c0669e (parent 7a1c027501)
Doc/library/urllib.robotparser.rst
@@ -76,6 +76,15 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
 
       .. versionadded:: 3.6
 
+   .. method:: site_maps()
+
+      Returns the contents of the ``Sitemap`` parameter from
+      ``robots.txt`` in the form of a :func:`list`. If there is no such
+      parameter or the ``robots.txt`` entry for this parameter has
+      invalid syntax, return ``None``.
+
+      .. versionadded:: 3.8
+
 The following example demonstrates basic use of the :class:`RobotFileParser`
 class::
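
For context, a minimal usage sketch of the new method (not part of the patch; the robots.txt content and URLs are made up for illustration, and the file is parsed from memory instead of being fetched with set_url()/read()):

import io
import urllib.robotparser

robots_txt = """\
User-agent: *
Disallow: /private/
Sitemap: https://www.example.com/sitemap.xml
"""

parser = urllib.robotparser.RobotFileParser()
parser.parse(io.StringIO(robots_txt).readlines())

print(parser.site_maps())
# ['https://www.example.com/sitemap.xml']
print(parser.can_fetch("*", "https://www.example.com/private/page.html"))
# False

# A file with no Sitemap lines yields None rather than an empty list,
# matching the documented behaviour above.
empty = urllib.robotparser.RobotFileParser()
empty.parse(["User-agent: *\n", "Disallow:\n"])
print(empty.site_maps())
# None
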
Lib/test/test_robotparser.py
@@ -12,6 +12,7 @@ class BaseRobotTest:
     agent = 'test_robotparser'
     good = []
     bad = []
+    site_maps = None
 
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
@@ -36,6 +37,9 @@ class BaseRobotTest:
             with self.subTest(url=url, agent=agent):
                 self.assertFalse(self.parser.can_fetch(agent, url))
 
+    def test_site_maps(self):
+        self.assertEqual(self.parser.site_maps(), self.site_maps)
+
 
 class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -65,6 +69,23 @@ Disallow:
     bad = ['/cyberworld/map/index.html']
 
 
+class SitemapTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+# robots.txt for http://www.example.com/
+
+User-agent: *
+Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+Request-rate: 3/15
+Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+"""
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
+                 'http://www.google.com/hostednews/sitemap_index.xml']
+
+
 class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 # go away
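
BaseRobotTest.setUp (only its first line is visible in the hunk above) presumably builds a RobotFileParser from the class's robots_txt via parse(), so the new test_site_maps simply compares site_maps() against the site_maps class attribute. A standalone sketch of the same check, reusing the SitemapTest fixture:

import io
import urllib.robotparser

# robots.txt fixture copied from SitemapTest above.
robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
"""

parser = urllib.robotparser.RobotFileParser()
parser.parse(io.StringIO(robots_txt).readlines())

# Both Sitemap URLs are collected, in file order.
assert parser.site_maps() == [
    'http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
    'http://www.google.com/hostednews/sitemap_index.xml',
]
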
Lib/urllib/robotparser.py
@@ -27,6 +27,7 @@ class RobotFileParser:
 
     def __init__(self, url=''):
         self.entries = []
+        self.sitemaps = []
         self.default_entry = None
         self.disallow_all = False
         self.allow_all = False
@@ -141,6 +142,12 @@ class RobotFileParser:
                             and numbers[1].strip().isdigit()):
                         entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                     state = 2
+                elif line[0] == "sitemap":
+                    # According to http://www.sitemaps.org/protocol.html
+                    # "This directive is independent of the user-agent line,
+                    #  so it doesn't matter where you place it in your file."
+                    # Therefore we do not change the state of the parser.
+                    self.sitemaps.append(line[1])
         if state == 2:
             self._add_entry(entry)
 
@@ -189,6 +196,11 @@ class RobotFileParser:
             return entry.req_rate
         return self.default_entry.req_rate
 
+    def site_maps(self):
+        if not self.sitemaps:
+            return None
+        return self.sitemaps
+
     def __str__(self):
         entries = self.entries
         if self.default_entry is not None:
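
As the comment in the new parsing branch notes, Sitemap is independent of any User-agent group, so the parser records it without touching its state machine. A small sketch (with an illustrative, made-up robots.txt) showing that Sitemap lines are collected wherever they appear:

import io
import urllib.robotparser

# Sitemap lines scattered across the file: one before any User-agent block,
# one between blocks, and one inside a specific agent's block.
robots_txt = """\
Sitemap: http://www.example.com/sitemap-root.xml

User-agent: figtree
Disallow: /tmp

Sitemap: http://www.example.com/sitemap-news.xml

User-agent: *
Disallow: /private/
Sitemap: http://www.example.com/sitemap-pages.xml
"""

parser = urllib.robotparser.RobotFileParser()
parser.parse(io.StringIO(robots_txt).readlines())

# All three URLs are returned, in the order they appeared in the file.
print(parser.site_maps())
# ['http://www.example.com/sitemap-root.xml',
#  'http://www.example.com/sitemap-news.xml',
#  'http://www.example.com/sitemap-pages.xml']
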
Misc/ACKS
@@ -109,6 +109,7 @@ Anthony Baxter
 Mike Bayer
 Samuel L. Bayer
 Bo Bayles
+Christopher Beacham AKA Lady Red
 Tommy Beadle
 Donald Beaudry
 David Beazley
@@ -1760,6 +1761,7 @@ Dik Winter
 Blake Winton
 Jean-Claude Wippler
 Stéphane Wirtel
+Peter Wirtz
 Lars Wirzenius
 John Wiseman
 Chris Withers
Misc/NEWS.d (new file)
@@ -0,0 +1,3 @@
+Added support for Site Maps to urllib's ``RobotFileParser`` as
+:meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`.
+Patch by Lady Red, based on patch by Peter Wirtz.