Re-factor the SGMLParser class to use the new markupbase.ParserBase class.
Use a new internal method, error(), consistently to raise parse errors; the new base class also uses this. Adjust the parse_comment() method to return the new offset into the buffer instead of the number of characters scanned; this was the only helper method that did it this way, so we have better consistency now. Required to share the new base class. This fixes SF bug #448482 and #453706.
This commit is contained in:
parent
bfc8fea1e0
commit
a3bae3369c
@ -9,6 +9,7 @@
|
|||||||
# not supported at all.
|
# not supported at all.
|
||||||
|
|
||||||
|
|
||||||
|
import markupbase
|
||||||
import re
|
import re
|
||||||
|
|
||||||
__all__ = ["SGMLParser"]
|
__all__ = ["SGMLParser"]
|
||||||
@ -27,24 +28,14 @@ charref = re.compile('&#([0-9]+)[^0-9]')
|
|||||||
starttagopen = re.compile('<[>a-zA-Z]')
|
starttagopen = re.compile('<[>a-zA-Z]')
|
||||||
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
|
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
|
||||||
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
|
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
|
||||||
piopen = re.compile('<\?')
|
|
||||||
piclose = re.compile('>')
|
piclose = re.compile('>')
|
||||||
endtagopen = re.compile('</[<>a-zA-Z]')
|
|
||||||
endbracket = re.compile('[<>]')
|
endbracket = re.compile('[<>]')
|
||||||
special = re.compile('<![^<>]*>')
|
|
||||||
commentopen = re.compile('<!--')
|
|
||||||
commentclose = re.compile(r'--\s*>')
|
commentclose = re.compile(r'--\s*>')
|
||||||
declopen = re.compile('<!')
|
|
||||||
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
|
|
||||||
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
|
|
||||||
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
|
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
|
||||||
attrfind = re.compile(
|
attrfind = re.compile(
|
||||||
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
|
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
|
||||||
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')
|
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')
|
||||||
|
|
||||||
decldata = re.compile(r'[^>\'\"]+')
|
|
||||||
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
|
|
||||||
|
|
||||||
|
|
||||||
class SGMLParseError(RuntimeError):
|
class SGMLParseError(RuntimeError):
|
||||||
"""Exception raised for all parse errors."""
|
"""Exception raised for all parse errors."""
|
||||||
@ -62,7 +53,7 @@ class SGMLParseError(RuntimeError):
|
|||||||
# chunks). Entity references are passed by calling
|
# chunks). Entity references are passed by calling
|
||||||
# self.handle_entityref() with the entity reference as argument.
|
# self.handle_entityref() with the entity reference as argument.
|
||||||
|
|
||||||
class SGMLParser:
|
class SGMLParser(markupbase.ParserBase):
|
||||||
|
|
||||||
def __init__(self, verbose=0):
|
def __init__(self, verbose=0):
|
||||||
"""Initialize and reset this instance."""
|
"""Initialize and reset this instance."""
|
||||||
@ -76,6 +67,7 @@ class SGMLParser:
|
|||||||
self.lasttag = '???'
|
self.lasttag = '???'
|
||||||
self.nomoretags = 0
|
self.nomoretags = 0
|
||||||
self.literal = 0
|
self.literal = 0
|
||||||
|
markupbase.ParserBase.reset(self)
|
||||||
|
|
||||||
def setnomoretags(self):
|
def setnomoretags(self):
|
||||||
"""Enter literal mode (CDATA) till EOF.
|
"""Enter literal mode (CDATA) till EOF.
|
||||||
@ -106,6 +98,9 @@ class SGMLParser:
|
|||||||
"""Handle the remaining data."""
|
"""Handle the remaining data."""
|
||||||
self.goahead(1)
|
self.goahead(1)
|
||||||
|
|
||||||
|
def error(self, message):
|
||||||
|
raise SGMLParseError(message)
|
||||||
|
|
||||||
# Internal -- handle data as far as reasonable. May leave state
|
# Internal -- handle data as far as reasonable. May leave state
|
||||||
# and data to be processed by a subsequent call. If 'end' is
|
# and data to be processed by a subsequent call. If 'end' is
|
||||||
# true, force handling all data as if followed by EOF marker.
|
# true, force handling all data as if followed by EOF marker.
|
||||||
@ -119,9 +114,10 @@ class SGMLParser:
|
|||||||
i = n
|
i = n
|
||||||
break
|
break
|
||||||
match = interesting.search(rawdata, i)
|
match = interesting.search(rawdata, i)
|
||||||
if match: j = match.start(0)
|
if match: j = match.start()
|
||||||
else: j = n
|
else: j = n
|
||||||
if i < j: self.handle_data(rawdata[i:j])
|
if i < j:
|
||||||
|
self.handle_data(rawdata[i:j])
|
||||||
i = j
|
i = j
|
||||||
if i == n: break
|
if i == n: break
|
||||||
if rawdata[i] == '<':
|
if rawdata[i] == '<':
|
||||||
@ -134,36 +130,31 @@ class SGMLParser:
|
|||||||
if k < 0: break
|
if k < 0: break
|
||||||
i = k
|
i = k
|
||||||
continue
|
continue
|
||||||
if endtagopen.match(rawdata, i):
|
if rawdata.startswith("</", i):
|
||||||
k = self.parse_endtag(i)
|
k = self.parse_endtag(i)
|
||||||
if k < 0: break
|
if k < 0: break
|
||||||
i = k
|
i = k
|
||||||
self.literal = 0
|
self.literal = 0
|
||||||
continue
|
continue
|
||||||
if commentopen.match(rawdata, i):
|
|
||||||
if self.literal:
|
if self.literal:
|
||||||
self.handle_data(rawdata[i])
|
if n > (i + 1):
|
||||||
|
self.handle_data("<")
|
||||||
i = i+1
|
i = i+1
|
||||||
|
else:
|
||||||
|
# incomplete
|
||||||
|
break
|
||||||
continue
|
continue
|
||||||
|
if rawdata.startswith("<!--", i):
|
||||||
k = self.parse_comment(i)
|
k = self.parse_comment(i)
|
||||||
if k < 0: break
|
if k < 0: break
|
||||||
i = i+k
|
i = k
|
||||||
continue
|
|
||||||
if piopen.match(rawdata, i):
|
|
||||||
if self.literal:
|
|
||||||
self.handle_data(rawdata[i])
|
|
||||||
i = i+1
|
|
||||||
continue
|
continue
|
||||||
|
if rawdata.startswith("<?", i):
|
||||||
k = self.parse_pi(i)
|
k = self.parse_pi(i)
|
||||||
if k < 0: break
|
if k < 0: break
|
||||||
i = i+k
|
i = i+k
|
||||||
continue
|
continue
|
||||||
match = special.match(rawdata, i)
|
if rawdata.startswith("<!", i):
|
||||||
if match:
|
|
||||||
if self.literal:
|
|
||||||
self.handle_data(rawdata[i])
|
|
||||||
i = i+1
|
|
||||||
continue
|
|
||||||
# This is some sort of declaration; in "HTML as
|
# This is some sort of declaration; in "HTML as
|
||||||
# deployed," this should only be the document type
|
# deployed," this should only be the document type
|
||||||
# declaration ("<!DOCTYPE html...>").
|
# declaration ("<!DOCTYPE html...>").
|
||||||
@ -191,7 +182,7 @@ class SGMLParser:
|
|||||||
if rawdata[i-1] != ';': i = i-1
|
if rawdata[i-1] != ';': i = i-1
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
raise SGMLParseError('neither < nor & ??')
|
self.error('neither < nor & ??')
|
||||||
# We get here only if incomplete matches but
|
# We get here only if incomplete matches but
|
||||||
# nothing else
|
# nothing else
|
||||||
match = incomplete.match(rawdata, i)
|
match = incomplete.match(rawdata, i)
|
||||||
@ -212,59 +203,26 @@ class SGMLParser:
|
|||||||
# XXX if end: check for empty stack
|
# XXX if end: check for empty stack
|
||||||
|
|
||||||
# Internal -- parse comment, return length or -1 if not terminated
|
# Internal -- parse comment, return length or -1 if not terminated
|
||||||
def parse_comment(self, i):
|
def parse_comment(self, i, report=1):
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
if rawdata[i:i+4] != '<!--':
|
if rawdata[i:i+4] != '<!--':
|
||||||
raise SGMLParseError('unexpected call to parse_comment()')
|
self.error('unexpected call to parse_comment()')
|
||||||
match = commentclose.search(rawdata, i+4)
|
match = commentclose.search(rawdata, i+4)
|
||||||
if not match:
|
if not match:
|
||||||
return -1
|
return -1
|
||||||
|
if report:
|
||||||
j = match.start(0)
|
j = match.start(0)
|
||||||
self.handle_comment(rawdata[i+4: j])
|
self.handle_comment(rawdata[i+4: j])
|
||||||
j = match.end(0)
|
return match.end(0)
|
||||||
return j-i
|
|
||||||
|
|
||||||
# Internal -- parse declaration.
|
# Extensions for the DOCTYPE scanner:
|
||||||
def parse_declaration(self, i):
|
_decl_otherchars = '='
|
||||||
# This is some sort of declaration; in "HTML as
|
|
||||||
# deployed," this should only be the document type
|
|
||||||
# declaration ("<!DOCTYPE html...>").
|
|
||||||
rawdata = self.rawdata
|
|
||||||
j = i + 2
|
|
||||||
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
|
|
||||||
if rawdata[j:j+1] in ("-", ""):
|
|
||||||
# Start of comment followed by buffer boundary,
|
|
||||||
# or just a buffer boundary.
|
|
||||||
return -1
|
|
||||||
# in practice, this should look like: ((name|stringlit) S*)+ '>'
|
|
||||||
n = len(rawdata)
|
|
||||||
while j < n:
|
|
||||||
c = rawdata[j]
|
|
||||||
if c == ">":
|
|
||||||
# end of declaration syntax
|
|
||||||
self.handle_decl(rawdata[i+2:j])
|
|
||||||
return j + 1
|
|
||||||
if c in "\"'":
|
|
||||||
m = declstringlit.match(rawdata, j)
|
|
||||||
if not m:
|
|
||||||
return -1 # incomplete
|
|
||||||
j = m.end()
|
|
||||||
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
|
||||||
m = declname.match(rawdata, j)
|
|
||||||
if not m:
|
|
||||||
return -1 # incomplete
|
|
||||||
j = m.end()
|
|
||||||
else:
|
|
||||||
raise SGMLParseError(
|
|
||||||
"unexpected char in declaration: %s" % `rawdata[j]`)
|
|
||||||
# end of buffer between tokens
|
|
||||||
return -1
|
|
||||||
|
|
||||||
# Internal -- parse processing instr, return length or -1 if not terminated
|
# Internal -- parse processing instr, return length or -1 if not terminated
|
||||||
def parse_pi(self, i):
|
def parse_pi(self, i):
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
if rawdata[i:i+2] != '<?':
|
if rawdata[i:i+2] != '<?':
|
||||||
raise SGMLParseError('unexpected call to parse_pi()')
|
self.error('unexpected call to parse_pi()')
|
||||||
match = piclose.search(rawdata, i+2)
|
match = piclose.search(rawdata, i+2)
|
||||||
if not match:
|
if not match:
|
||||||
return -1
|
return -1
|
||||||
@ -311,7 +269,7 @@ class SGMLParser:
|
|||||||
else:
|
else:
|
||||||
match = tagfind.match(rawdata, i+1)
|
match = tagfind.match(rawdata, i+1)
|
||||||
if not match:
|
if not match:
|
||||||
raise SGMLParseError('unexpected call to parse_starttag')
|
self.error('unexpected call to parse_starttag')
|
||||||
k = match.end(0)
|
k = match.end(0)
|
||||||
tag = rawdata[i+1:k].lower()
|
tag = rawdata[i+1:k].lower()
|
||||||
self.lasttag = tag
|
self.lasttag = tag
|
||||||
@ -465,6 +423,7 @@ class SGMLParser:
|
|||||||
def unknown_endtag(self, tag): pass
|
def unknown_endtag(self, tag): pass
|
||||||
def unknown_charref(self, ref): pass
|
def unknown_charref(self, ref): pass
|
||||||
def unknown_entityref(self, ref): pass
|
def unknown_entityref(self, ref): pass
|
||||||
|
def unknown_decl(self, data): pass
|
||||||
|
|
||||||
|
|
||||||
class TestSGMLParser(SGMLParser):
|
class TestSGMLParser(SGMLParser):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user