In CDATA mode, make sure entity-reference syntax is not interpreted;
entity references are not allowed in that mode. Do a better job of scanning <!DOCTYPE ...> declarations; based on the code in HTMLParser.py.
This commit is contained in:
parent
e16c7aee4b
commit
fb38c76e0f
@ -5,7 +5,8 @@
|
|||||||
# XXX There should be a way to distinguish between PCDATA (parsed
|
# XXX There should be a way to distinguish between PCDATA (parsed
|
||||||
# character data -- the normal case), RCDATA (replaceable character
|
# character data -- the normal case), RCDATA (replaceable character
|
||||||
# data -- only char and entity references and end tags are special)
|
# data -- only char and entity references and end tags are special)
|
||||||
# and CDATA (character data -- only end tags are special).
|
# and CDATA (character data -- only end tags are special). RCDATA is
|
||||||
|
# not supported at all.
|
||||||
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
@ -34,6 +35,9 @@ endbracket = re.compile('[<>]')
|
|||||||
special = re.compile('<![^<>]*>')
|
special = re.compile('<![^<>]*>')
|
||||||
commentopen = re.compile('<!--')
|
commentopen = re.compile('<!--')
|
||||||
commentclose = re.compile(r'--\s*>')
|
commentclose = re.compile(r'--\s*>')
|
||||||
|
declopen = re.compile('<!')
|
||||||
|
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
|
||||||
|
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
|
||||||
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
|
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
|
||||||
attrfind = re.compile(
|
attrfind = re.compile(
|
||||||
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
|
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
|
||||||
@ -160,6 +164,10 @@ class SGMLParser:
|
|||||||
i = k
|
i = k
|
||||||
continue
|
continue
|
||||||
elif rawdata[i] == '&':
|
elif rawdata[i] == '&':
|
||||||
|
if self.literal:
|
||||||
|
self.handle_data(rawdata[i])
|
||||||
|
i = i+1
|
||||||
|
continue
|
||||||
match = charref.match(rawdata, i)
|
match = charref.match(rawdata, i)
|
||||||
if match:
|
if match:
|
||||||
name = match.group(1)
|
name = match.group(1)
|
||||||
@ -210,11 +218,20 @@ class SGMLParser:
|
|||||||
|
|
||||||
# Internal -- parse declaration.
|
# Internal -- parse declaration.
|
||||||
def parse_declaration(self, i):
|
def parse_declaration(self, i):
|
||||||
|
# This is some sort of declaration; in "HTML as
|
||||||
|
# deployed," this should only be the document type
|
||||||
|
# declaration ("<!DOCTYPE html...>").
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
j = i + 2
|
j = i + 2
|
||||||
|
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
|
||||||
|
if rawdata[j:j+1] in ("-", ""):
|
||||||
|
# Start of comment followed by buffer boundary,
|
||||||
|
# or just a buffer boundary.
|
||||||
|
return -1
|
||||||
|
# in practice, this should look like: ((name|stringlit) S*)+ '>'
|
||||||
n = len(rawdata)
|
n = len(rawdata)
|
||||||
while j < n:
|
while j < n:
|
||||||
c = rawdata[j:j+1]
|
c = rawdata[j]
|
||||||
if c == ">":
|
if c == ">":
|
||||||
# end of declaration syntax
|
# end of declaration syntax
|
||||||
self.handle_decl(rawdata[i+2:j])
|
self.handle_decl(rawdata[i+2:j])
|
||||||
@ -222,15 +239,16 @@ class SGMLParser:
|
|||||||
if c in "\"'":
|
if c in "\"'":
|
||||||
m = declstringlit.match(rawdata, j)
|
m = declstringlit.match(rawdata, j)
|
||||||
if not m:
|
if not m:
|
||||||
# incomplete or an error?
|
return -1 # incomplete
|
||||||
return -1
|
j = m.end()
|
||||||
|
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
||||||
|
m = declname.match(rawdata, j)
|
||||||
|
if not m:
|
||||||
|
return -1 # incomplete
|
||||||
j = m.end()
|
j = m.end()
|
||||||
else:
|
else:
|
||||||
m = decldata.match(rawdata, j)
|
raise SGMLParseError(
|
||||||
if not m:
|
"unexpected char in declaration: %s" % `rawdata[j]`)
|
||||||
# incomplete or an error?
|
|
||||||
return -1
|
|
||||||
j = m.end()
|
|
||||||
# end of buffer between tokens
|
# end of buffer between tokens
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user