Sjoerd Mullender writes:

Here is my current version of xmllib.py and the documentation.  This
version has some API changes with respect to the version currently in
Python (also the one in 1.5.2a).
This version supports XML namespaces.
This commit is contained in:
Guido van Rossum 1998-12-18 20:17:13 +00:00
parent 6de7d0c338
commit b083a9fb54
2 changed files with 177 additions and 108 deletions

View File

@ -14,7 +14,28 @@ for parsing text files formatted in XML (eXtended Markup Language).
The \class{XMLParser} class must be instantiated without arguments. The \class{XMLParser} class must be instantiated without arguments.
\end{classdesc} \end{classdesc}
This class provides the following interface methods: This class provides the following interface methods and instance variables:
\begin{memberdesc}{attributes}
A mapping of element names to mappings. The latter mapping maps
attribute names that are valid for the element to the default value of
the attribute, or if there is no default to \code{None}. The default
value is the empty dictionary.
\end{memberdesc}
\begin{memberdesc}{elements}
A mapping of element names to tuples. The tuples contain a function
for handling the start and end tag respectively of the element, or
\code{None} if the method \method{unknown_starttag()} or
\method{unknown_endtag()} is to be called. The default value is the
empty dictionary.
\end{memberdesc}
\begin{memberdesc}{entitydefs}
A mapping of entity names to their values. The default value contains
definitions for \code{'lt'}, \code{'gt'}, \code{'amp'}, \code{'quot'},
and \code{'apos'}.
\end{memberdesc}
\begin{methoddesc}{reset}{} \begin{methoddesc}{reset}{}
Reset the instance. Loses all unprocessed data. This is called Reset the instance. Loses all unprocessed data. This is called
@ -33,7 +54,7 @@ when the close tag matching the last unclosed open tag is encountered.
\begin{methoddesc}{feed}{data} \begin{methoddesc}{feed}{data}
Feed some text to the parser. It is processed insofar as it consists Feed some text to the parser. It is processed insofar as it consists
of complete elements; incomplete data is buffered until more data is of complete tags; incomplete data is buffered until more data is
fed or \method{close()} is called. fed or \method{close()} is called.
\end{methoddesc} \end{methoddesc}
@ -65,29 +86,29 @@ the root element.
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}{handle_starttag}{tag, method, attributes} \begin{methoddesc}{handle_starttag}{tag, method, attributes}
This method is called to handle start tags for which a This method is called to handle start tags for which a start tag
\method{start_\var{tag}()} method has been defined. The \var{tag} handler is defined in the instance variable \member{elements}. The
argument is the name of the tag, and the \var{method} argument is the \var{tag} argument is the name of the tag, and the \var{method}
bound method which should be used to support semantic interpretation argument is the function (method) which should be used to support semantic
of the start tag. The \var{attributes} argument is a dictionary of interpretation of the start tag. The \var{attributes} argument is a
attributes, the key being the \var{name} and the value being the dictionary of attributes, the key being the \var{name} and the value
\var{value} of the attribute found inside the tag's \code{<>} brackets. being the \var{value} of the attribute found inside the tag's
Character and entity references in the \var{value} have \code{<>} brackets. Character and entity references in the
been interpreted. For instance, for the tag \var{value} have been interpreted. For instance, for the start tag
\code{<A HREF="http://www.cwi.nl/">}, this method would be called as \code{<A HREF="http://www.cwi.nl/">}, this method would be called as
\code{handle_starttag('A', self.start_A, \{'HREF': 'http://www.cwi.nl/'\})}. \code{handle_starttag('A', self.elements['A'][0], \{'HREF': 'http://www.cwi.nl/'\})}.
The base implementation simply calls \var{method} with \var{attributes} The base implementation simply calls \var{method} with \var{attributes}
as the only argument. as the only argument.
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}{handle_endtag}{tag, method} \begin{methoddesc}{handle_endtag}{tag, method}
This method is called to handle endtags for which an This method is called to handle endtags for which an end tag handler
\method{end_\var{tag}()} method has been defined. The \var{tag} is defined in the instance variable \member{elements}. The \var{tag}
argument is the name of the tag, and the argument is the name of the tag, and the \var{method} argument is the
\var{method} argument is the bound method which should be used to function (method) which should be used to support semantic
support semantic interpretation of the end tag. If no interpretation of the end tag. For instance, for the endtag
\method{end_\var{tag}()} method is defined for the closing element, this \code{</A>}, this method would be called as \code{handle_endtag('A',
handler is not called. The base implementation simply calls self.elements['A'][1])}. The base implementation simply calls
\var{method}. \var{method}.
\end{methoddesc} \end{methoddesc}
@ -149,7 +170,7 @@ closing delimiter, but not the delimiter itself. For example, the
instruction \samp{<?XML text?>} will cause this method to be called instruction \samp{<?XML text?>} will cause this method to be called
with the arguments \code{'XML'} and \code{'text'}. The default method with the arguments \code{'XML'} and \code{'text'}. The default method
does nothing. Note that if a document starts with \samp{<?xml does nothing. Note that if a document starts with \samp{<?xml
...?>}, \method{handle_xml()} is called to handle it. ...?>}, \method{handle_xml()} is called to handle it.
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}{handle_special}{data} \begin{methoddesc}{handle_special}{data}
@ -196,32 +217,21 @@ intended to be overridden by a derived class; the base class
implementation does nothing. implementation does nothing.
\end{methoddesc} \end{methoddesc}
Apart from overriding or extending the methods listed above, derived \subsection{XML Namespaces}
classes may also define methods and variables of the following form to
define processing of specific tags. Tag names in the input stream are
case dependent; the \var{tag} occurring in method names must be in the
correct case:
\begin{methoddescni}{start_\var{tag}}{attributes} This module has support for XML namespaces as defined in the XML
This method is called to process an opening tag \var{tag}. The Namespaces proposed recommendation.
\var{attributes} argument has the same meaning as described for
\method{handle_starttag()} above. In fact, the base implementation of
\method{handle_starttag()} calls this method.
\end{methoddescni}
\begin{methoddescni}{end_\var{tag}}{} Tag and attribute names that are defined in an XML namespace are
This method is called to process a closing tag \var{tag}. handled as if the name of the tag or attribute consisted of the
\end{methoddescni} namespace (i.e. the URL that defines the namespace) followed by a
space and the name of the tag or attribute. For instance, the tag
\code{<html xmlns='http://www.w3.org/TR/REC-html40'>} is treated as if
the tag name were \code{'http://www.w3.org/TR/REC-html40 html'}, and
the tag \code{<html:a href='http://frob.com'>} inside the above
mentioned element is treated as if the tag name were
\code{'http://www.w3.org/TR/REC-html40 a'} and the attribute name as
if it were \code{'http://www.w3.org/TR/REC-html40 href'}.
\begin{memberdescni}{\var{tag}_attributes} An older draft of the XML Namespaces proposal is also recognized, but
If a class or instance variable \member{\var{tag}_attributes} exists, it triggers a warning.
should be a list or a dictionary. If a list, the elements of the list
are the valid attributes for the element \var{tag}; if a dictionary,
the keys are the valid attributes for the element \var{tag}, and the
values the default values of the attributes, or \code{None} if there
is no default.
In addition to the attributes that were present in the tag, the
attribute dictionary that is passed to \method{handle_starttag()} and
\method{unknown_starttag()} contains values for all attributes that
have a default value.
\end{memberdescni}

View File

@ -5,7 +5,7 @@ import re
import string import string
version = '0.1' version = '0.2'
# Regular expressions used for parsing # Regular expressions used for parsing
@ -64,6 +64,13 @@ commentclose = re.compile('-->')
doubledash = re.compile('--') doubledash = re.compile('--')
attrtrans = string.maketrans(' \r\n\t', ' ') attrtrans = string.maketrans(' \r\n\t', ' ')
# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*' # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
'(?P<local>' + _NCName + ')$')
xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
# XML parser base class -- find tags and call handler functions. # XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close(). # Usage: p = XMLParser(); p.feed(data); ...; p.close().
@ -76,10 +83,11 @@ attrtrans = string.maketrans(' \r\n\t', ' ')
# as argument. # as argument.
class XMLParser: class XMLParser:
attributes = {} # default, to be overridden
elements = {} # default, to be overridden
# Interface -- initialize and reset this instance # Interface -- initialize and reset this instance
def __init__(self, verbose=0): def __init__(self):
self.verbose = verbose
self.reset() self.reset()
# Interface -- reset this instance. Loses all unprocessed data # Interface -- reset this instance. Loses all unprocessed data
@ -92,6 +100,7 @@ class XMLParser:
self.__at_start = 1 self.__at_start = 1
self.__seen_doctype = None self.__seen_doctype = None
self.__seen_starttag = 0 self.__seen_starttag = 0
self.__namespaces = {'xml':None} # xml is implicitly declared
# For derived classes only -- enter literal mode (CDATA) till EOF # For derived classes only -- enter literal mode (CDATA) till EOF
def setnomoretags(self): def setnomoretags(self):
@ -333,7 +342,7 @@ class XMLParser:
if self.stack: if self.stack:
self.syntax_error('missing end tags') self.syntax_error('missing end tags')
while self.stack: while self.stack:
self.finish_endtag(self.stack[-1]) self.finish_endtag(self.stack[-1][0])
# Internal -- parse comment, return length or -1 if not terminated # Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self, i): def parse_comment(self, i):
@ -413,7 +422,7 @@ class XMLParser:
self.handle_cdata(rawdata[i+9:res.start(0)]) self.handle_cdata(rawdata[i+9:res.start(0)])
return res.end(0) return res.end(0)
__xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None} __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
# Internal -- handle a processing instruction tag # Internal -- handle a processing instruction tag
def parse_proc(self, i): def parse_proc(self, i):
rawdata = self.rawdata rawdata = self.rawdata
@ -428,29 +437,45 @@ class XMLParser:
raise RuntimeError, 'unexpected call to parse_proc' raise RuntimeError, 'unexpected call to parse_proc'
k = res.end(0) k = res.end(0)
name = res.group(0) name = res.group(0)
if name == 'xml:namespace':
self.syntax_error('old-fashioned namespace declaration')
# namespace declaration
# this must come after the <?xml?> declaration (if any)
# and before the <!DOCTYPE> (if any).
if self.__seen_doctype or self.__seen_starttag:
self.syntax_error('xml:namespace declaration too late in document')
attrdict, namespace, k = self.parse_attributes(name, k, j)
if namespace:
self.syntax_error('namespace declaration inside namespace declaration')
for attrname in attrdict.keys():
if not self.__xml_namespace_attributes.has_key(attrname):
self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
if not attrdict.has_key('ns') or not attrdict.has_key('prefix'):
self.syntax_error('xml:namespace without required attributes')
prefix = attrdict.get('prefix')
if ncname.match(prefix) is None:
self.syntax_error('xml:namespace illegal prefix value')
return end.end(0)
if self.__namespaces.has_key(prefix):
self.syntax_error('xml:namespace prefix not unique')
self.__namespaces[prefix] = attrdict['ns']
else:
if string.find(string.lower(name), 'xml') >= 0: if string.find(string.lower(name), 'xml') >= 0:
self.syntax_error('illegal processing instruction target name') self.syntax_error('illegal processing instruction target name')
self.handle_proc(name, rawdata[k:j]) self.handle_proc(name, rawdata[k:j])
return end.end(0) return end.end(0)
# Internal -- parse attributes between i and j # Internal -- parse attributes between i and j
def parse_attributes(self, tag, i, j, attributes = None): def parse_attributes(self, tag, i, j):
rawdata = self.rawdata rawdata = self.rawdata
# Now parse the data between i and j into a tag and attrs
attrdict = {} attrdict = {}
try: namespace = {}
# convert attributes list to dictionary
d = {}
for a in attributes:
d[a] = None
attributes = d
except TypeError:
pass
while i < j: while i < j:
res = attrfind.match(rawdata, i) res = attrfind.match(rawdata, i)
if res is None: if res is None:
break break
attrname, attrvalue = res.group('name', 'value') attrname, attrvalue = res.group('name', 'value')
i = res.end(0)
if attrvalue is None: if attrvalue is None:
self.syntax_error("no value specified for attribute `%s'" % attrname) self.syntax_error("no value specified for attribute `%s'" % attrname)
attrvalue = attrname attrvalue = attrname
@ -459,22 +484,19 @@ class XMLParser:
attrvalue = attrvalue[1:-1] attrvalue = attrvalue[1:-1]
else: else:
self.syntax_error("attribute `%s' value not quoted" % attrname) self.syntax_error("attribute `%s' value not quoted" % attrname)
res = xmlns.match(attrname)
if res is not None:
# namespace declaration
ncname = res.group('ncname')
namespace[ncname or ''] = attrvalue or None
continue
if '<' in attrvalue: if '<' in attrvalue:
self.syntax_error("`<' illegal in attribute value") self.syntax_error("`<' illegal in attribute value")
if attributes is not None and not attributes.has_key(attrname):
self.syntax_error("unknown attribute `%s' of element `%s'" %
(attrname, tag))
if attrdict.has_key(attrname): if attrdict.has_key(attrname):
self.syntax_error("attribute `%s' specified twice" % attrname) self.syntax_error("attribute `%s' specified twice" % attrname)
attrvalue = string.translate(attrvalue, attrtrans) attrvalue = string.translate(attrvalue, attrtrans)
attrdict[attrname] = self.translate_references(attrvalue) attrdict[attrname] = self.translate_references(attrvalue)
i = res.end(0) return attrdict, namespace, i
if attributes is not None:
# fill in with default attributes
for key, val in attributes.items():
if val is not None and not attrdict.has_key(key):
attrdict[key] = val
return attrdict, i
# Internal -- handle starttag, return length or -1 if not terminated # Internal -- handle starttag, return length or -1 if not terminated
def parse_starttag(self, i): def parse_starttag(self, i):
@ -487,19 +509,63 @@ class XMLParser:
if tag is None or tag.end(0) != end.end(0): if tag is None or tag.end(0) != end.end(0):
self.syntax_error('garbage in starttag') self.syntax_error('garbage in starttag')
return end.end(0) return end.end(0)
tagname = tag.group('tagname') nstag = tagname = tag.group('tagname')
if not self.__seen_starttag and self.__seen_doctype and \ if not self.__seen_starttag and self.__seen_doctype and \
tagname != self.__seen_doctype: tagname != self.__seen_doctype:
self.syntax_error('starttag does not match DOCTYPE') self.syntax_error('starttag does not match DOCTYPE')
if self.__seen_starttag and not self.stack: if self.__seen_starttag and not self.stack:
self.syntax_error('multiple elements on top level') self.syntax_error('multiple elements on top level')
if hasattr(self, tagname + '_attributes'):
attributes = getattr(self, tagname + '_attributes')
else:
attributes = None
k, j = tag.span('attrs') k, j = tag.span('attrs')
attrdict, k = self.parse_attributes(tagname, k, j, attributes) attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
self.finish_starttag(tagname, attrdict) self.stack.append((tagname, nsdict, nstag))
res = qname.match(tagname)
if res is not None:
prefix, nstag = res.group('prefix', 'local')
if prefix is None:
prefix = ''
ns = None
for t, d, nst in self.stack:
if d.has_key(prefix):
ns = d[prefix]
if ns is None and prefix != '':
ns = self.__namespaces.get(prefix)
if ns is not None:
nstag = ns + ' ' + nstag
elif prefix != '':
nstag = prefix + ':' + nstag # undo split
self.stack[-1] = tagname, nsdict, nstag
# translate namespace of attributes
nattrdict = {}
for key, val in attrdict.items():
res = qname.match(key)
if res is not None:
aprefix, key = res.group('prefix', 'local')
if aprefix is None:
aprefix = ''
ans = None
for t, d, nst in self.stack:
if d.has_key(aprefix):
ans = d[aprefix]
if ans is None and aprefix != '':
ans = self.__namespaces.get(aprefix)
if ans is not None:
key = ans + ' ' + key
elif aprefix != '':
key = aprefix + ':' + key
elif ns is not None:
key = ns + ' ' + key
nattrdict[key] = val
attrdict = nattrdict
attributes = self.attributes.get(nstag)
if attributes is not None:
for key in attrdict.keys():
if not attributes.has_key(key):
self.syntax_error("unknown attribute `%s' in tag `%s'" % (key, tagname))
for key, val in attributes.items():
if val is not None and not attrdict.has_key(key):
attrdict[key] = val
method = self.elements.get(nstag, (None, None))[0]
self.finish_starttag(nstag, attrdict, method)
if tag.group('slash') == '/': if tag.group('slash') == '/':
self.finish_endtag(tagname) self.finish_endtag(tagname)
return tag.end(0) return tag.end(0)
@ -521,7 +587,7 @@ class XMLParser:
else: else:
tag = res.group(0) tag = res.group(0)
if self.literal: if self.literal:
if not self.stack or tag != self.stack[-1]: if not self.stack or tag != self.stack[-1][0]:
self.handle_data(rawdata[i]) self.handle_data(rawdata[i])
return i+1 return i+1
self.literal = 0 self.literal = 0
@ -532,21 +598,14 @@ class XMLParser:
return end.end(0) return end.end(0)
# Internal -- finish processing of start tag # Internal -- finish processing of start tag
# Return -1 for unknown tag, 1 for balanced tag def finish_starttag(self, tagname, attrdict, method):
def finish_starttag(self, tag, attrs): if method is not None:
self.stack.append(tag) self.handle_starttag(tagname, method, attrdict)
methodname = 'start_' + tag
if hasattr(self, methodname):
method = getattr(self, methodname)
self.handle_starttag(tag, method, attrs)
return 1
else: else:
self.unknown_starttag(tag, attrs) self.unknown_starttag(tagname, attrdict)
return -1
# Internal -- finish processing of end tag # Internal -- finish processing of end tag
def finish_endtag(self, tag): def finish_endtag(self, tag):
methodname = 'end_' + tag
if not tag: if not tag:
self.syntax_error('name-less end tag') self.syntax_error('name-less end tag')
found = len(self.stack) - 1 found = len(self.stack) - 1
@ -554,27 +613,27 @@ class XMLParser:
self.unknown_endtag(tag) self.unknown_endtag(tag)
return return
else: else:
if tag not in self.stack: found = -1
for i in range(len(self.stack)):
if tag == self.stack[i][0]:
found = i
if found == -1:
self.syntax_error('unopened end tag') self.syntax_error('unopened end tag')
if hasattr(self, methodname): method = self.elements.get(tag, (None, None))[1]
method = getattr(self, methodname) if method is not None:
self.handle_endtag(tag, method) self.handle_endtag(tag, method)
else: else:
self.unknown_endtag(tag) self.unknown_endtag(tag)
return return
found = len(self.stack)
for i in range(found):
if self.stack[i] == tag:
found = i
while len(self.stack) > found: while len(self.stack) > found:
if found < len(self.stack) - 1: if found < len(self.stack) - 1:
self.syntax_error('missing close tag for %s' % self.stack[-1]) self.syntax_error('missing close tag for %s' % self.stack[-1][2])
tag = self.stack[-1] nstag = self.stack[-1][2]
if hasattr(self, methodname): method = self.elements.get(nstag, (None, None))[1]
method = getattr(self, methodname) if method is not None:
self.handle_endtag(tag, method) self.handle_endtag(nstag, method)
else: else:
self.unknown_endtag(tag) self.unknown_endtag(nstag)
del self.stack[-1] del self.stack[-1]
# Overridable -- handle xml processing instruction # Overridable -- handle xml processing instruction
@ -654,9 +713,9 @@ class XMLParser:
class TestXMLParser(XMLParser): class TestXMLParser(XMLParser):
def __init__(self, verbose=0): def __init__(self):
self.testdata = "" self.testdata = ""
XMLParser.__init__(self, verbose) XMLParser.__init__(self)
def handle_xml(self, encoding, standalone): def handle_xml(self, encoding, standalone):
self.flush() self.flush()