The motivation for adding this option is that the functionality it provides used to be provided by sgmllib in Python2, and was used by, for example, BeautifulSoup. Without this option, the Python3 version of BeautifulSoup and the many programs that use it are crippled. The original patch was by 'kxroberto'. I modified it heavily but kept his heuristics and test. I also added additional heuristics to fix #975556, #1046092, and part of #6191. This patch should be completely backward compatible: the behavior with the default strict=True is unchanged.
459 lines
16 KiB
Python
459 lines
16 KiB
Python
"""A parser for HTML and XHTML."""
|
|
|
|
# This file is based on sgmllib.py, but the API is slightly different.
|
|
|
|
# XXX There should be a way to distinguish between PCDATA (parsed
|
|
# character data -- the normal case), RCDATA (replaceable character
|
|
# data -- only char and entity references and end tags are special)
|
|
# and CDATA (character data -- only end tags are special).
|
|
|
|
|
|
import _markupbase
|
|
import re
|
|
|
|
# Regular expressions used for parsing
|
|
|
|
# All patterns are raw strings so that regex escapes such as \s and \Z are
# never interpreted as (invalid) string-literal escapes.

# What to scan for next: any markup start in normal mode, only a possible
# end tag (or a '<' at the very end of the buffer) in CDATA mode.
interesting_normal = re.compile(r'[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of an entity or character reference that may still be incomplete.
incomplete = re.compile(r'&[a-zA-Z#]')

# Both reference patterns also consume the single character that follows
# the reference (the terminator, which is ';' in well-formed input).
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile(r'<[a-zA-Z]')
piclose = re.compile(r'>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9:_]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
attrfind_tolerant = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
# Same as above, but whitespace before an attribute name is optional and a
# value may be followed by commas.
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile(r'>')
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
|
|
|
|
|
|
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error description (must be truthy).
        lineno: first element of *position*, or None when unknown.
        offset: second element of *position* (zero-based column), or None.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        line, column = position[0], position[1]
        self.msg = msg
        self.lineno = line
        self.offset = column

    def __str__(self):
        """Return the message, followed by the line/column when known."""
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            # The stored offset is zero-based; report a one-based column.
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
|
|
|
|
|
|
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements after whose start tag the parser switches to CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=True):
        """Initialize and reset this instance.

        If strict is set to True (the default), errors are raised when invalid
        HTML is encountered.  If set to False, an attempt is instead made to
        continue parsing, making "best guesses" about the intended meaning, in
        a fashion similar to what browsers typically do.
        """
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag (None if none yet
    # or if the last attempt found an incomplete tag).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Scan only for a possible end tag until CDATA mode is cleared."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Go back to scanning for both '<' and '&'."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of the buffered input as possible.

        Dispatches to the parse_*() helpers and handle_*() callbacks and
        leaves any unprocessed tail in self.rawdata.  *end* is true when
        called from close(); constructs that are still incomplete are then
        reported through self.error() (strict mode) or passed on as data.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not complete yet.
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # Tolerant mode: pass the broken construct on as data,
                    # up to and including the next '>', else up to the
                    # next '<', else just the single '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The pattern consumed one character after the
                    # reference; give it back unless it was the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        # Indices are relative to the current position i,
                        # not to the start of the buffer.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # Step over the '&'; whatever follows it is
                            # flushed as data after the loop.
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the '<?...>' starting at index *i* and call handle_pi().

        Returns the index just past the closing '>', or -1 if no '>' has
        been buffered yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index *i* and call the matching handler.

        Returns the index where parsing should resume, or a negative value
        if the tag is not complete yet.  Text that does not form a valid
        tag raises through self.error() in strict mode and is passed to
        handle_data() otherwise.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                # Bound the search by endpos: an unbounded search would
                # pick up words from the text after the tag and push k
                # past endpos, turning a valid tag into "junk".
                m = attrfind_tolerant.search(rawdata, k, endpos)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Only quoted values are unquoted and unescaped.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the end index of the start tag at *i*, or -1 if incomplete.

        In strict mode malformed input is reported through self.error();
        in tolerant mode the index where matching stopped is returned.
        """
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # NOTE(review): the test above is always true when
                # nextchar == "/", so the statements below are never
                # reached; kept as-is to preserve existing behavior.
                # else bogus input
                if self.strict:
                    self.updatepos(i, j + 1)
                    self.error("malformed empty start tag")
                if j > i:
                    return j
                else:
                    return i + 1
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at index *i* and call handle_endtag().

        Returns the index where parsing should resume, or -1 if no '>' has
        been buffered yet.  A malformed end tag raises through self.error()
        in strict mode and is passed to handle_data() otherwise.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:j],))
            # Stop the data chunk at the next '<' if one comes before
            # the '>' found above.
            k = rawdata.find('<', i + 1, j)
            if k > i:
                j = k
            if j <= i:
                j = i + 1
            self.handle_data(rawdata[i:j])
            return j
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag.../>'; by default start tag followed by end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lowercased tag name and a list of (name, value)."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lowercased tag name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the entity name (text after '&')."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of text found between markup."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and the closing '>'."""
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration; an error only in strict mode."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    # Class-wide cache of entity name -> replacement text, built lazily
    # by unescape().
    entitydefs = None

    def unescape(self, s):
        """Return *s* with entity and character references replaced.

        Only references terminated by ';' are considered.  Unknown entity
        names and numeric references that are malformed or out of range
        are left unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        c = int(digits[1:], 16)
                    else:
                        c = int(digits)
                    return chr(c)
                except (ValueError, OverflowError):
                    # Not a usable number (e.g. '&#xyz;') or not a valid
                    # code point: keep the original text.
                    return match.group(0)
            else:
                # Cannot use name2codepoint directly, because HTMLParser
                # supports apos, which is not part of HTML 4
                import html.entities
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos': "'"}
                    for k, v in html.entities.name2codepoint.items():
                        entitydefs[k] = chr(v)
                try:
                    return self.entitydefs[ref]
                except KeyError:
                    return '&' + ref + ';'

        # flags must be passed by keyword: the fourth positional
        # parameter of re.sub() is count.
        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s, flags=re.ASCII)
|