bpo-21475: Support the Sitemap extension in robotparser (GH-6883)

Christopher Beacham 2018-05-16 07:52:07 -07:00 committed by Ned Deily
parent 7a1c027501
commit 5db5c0669e
5 changed files with 47 additions and 0 deletions

Doc/library/urllib.robotparser.rst

@@ -76,6 +76,15 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.

      .. versionadded:: 3.6

   .. method:: site_maps()

      Returns the contents of the ``Sitemap`` parameter from
      ``robots.txt`` in the form of a :func:`list`. If there is no such
      parameter or the ``robots.txt`` entry for this parameter has
      invalid syntax, return ``None``.

      .. versionadded:: 3.8
The following example demonstrates basic use of the :class:`RobotFileParser`
class::

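For context, a minimal usage sketch of the new accessor (not part of the patch; the URL is only an example and is assumed to serve a robots.txt that declares ``Sitemap`` lines):

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("https://www.example.com/robots.txt")  # example URL (assumption)
    rp.read()              # fetch and parse the remote robots.txt
    print(rp.site_maps())  # list of Sitemap URLs, or None if none were declared
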
Lib/test/test_robotparser.py

@@ -12,6 +12,7 @@ class BaseRobotTest:
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
@@ -36,6 +37,9 @@ class BaseRobotTest:
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
@@ -65,6 +69,23 @@ Disallow:
    bad = ['/cyberworld/map/index.html']
class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/
User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
"""
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']
class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away # go away

Lib/urllib/robotparser.py

@@ -27,6 +27,7 @@ class RobotFileParser:
    def __init__(self, url=''):
        self.entries = []
        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
@@ -141,6 +142,12 @@ class RobotFileParser:
                            and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # According to http://www.sitemaps.org/protocol.html
                    # "This directive is independent of the user-agent line,
                    #  so it doesn't matter where you place it in your file."
                    #  Therefore we do not change the state of the parser.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)
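To illustrate the comment above (a sketch, not part of the patch; the sitemap URLs are invented): ``Sitemap`` lines are collected regardless of which ``User-agent`` block they appear in, so ``parse()`` simply accumulates all of them:

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Disallow: /private/",
        "Sitemap: https://www.example.com/sitemap-a.xml",
        "User-agent: examplebot",
        "Sitemap: https://www.example.com/sitemap-b.xml",
    ])
    # Both URLs were recorded, even though they sit under different blocks.
    print(rp.site_maps())
    # ['https://www.example.com/sitemap-a.xml', 'https://www.example.com/sitemap-b.xml']
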
@@ -189,6 +196,11 @@ class RobotFileParser:
                return entry.req_rate
        return self.default_entry.req_rate

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
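A small sketch of the ``None`` branch above (again not part of the patch): with no ``Sitemap`` directive in the parsed file, the new accessor returns ``None`` rather than an empty list:

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.parse(["User-agent: *", "Disallow: /tmp/"])
    print(rp.site_maps())  # None: no Sitemap directive was seen
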

Misc/ACKS

@@ -109,6 +109,7 @@ Anthony Baxter
Mike Bayer
Samuel L. Bayer
Bo Bayles
Christopher Beacham AKA Lady Red
Tommy Beadle
Donald Beaudry
David Beazley
@@ -1760,6 +1761,7 @@ Dik Winter
Blake Winton
Jean-Claude Wippler
Stéphane Wirtel
Peter Wirtz
Lars Wirzenius
John Wiseman
Chris Withers

Misc/NEWS.d (new file)

@@ -0,0 +1,3 @@
Added support for Site Maps to urllib's ``RobotFileParser`` as
:meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`.
Patch by Lady Red, based on patch by Peter Wirtz.