This patch includes test cases and documentation updates, as well as NEWS file updates. This patch also updates the sre modules so that they don't import the string module, breaking direct circular imports.
789 lines
26 KiB
Python
789 lines
26 KiB
Python
#
|
|
# Secret Labs' Regular Expression Engine
|
|
#
|
|
# convert re-style regular expression to sre pattern
|
|
#
|
|
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
|
|
#
|
|
# See the sre.py file for information on usage and redistribution.
|
|
#
|
|
|
|
"""Internal support module for sre"""
|
|
|
|
# XXX: show string offset and offending character for all errors
|
|
|
|
import sys
|
|
|
|
from sre_constants import *
|
|
|
|
SPECIAL_CHARS = ".\\[{()*+?^$|"
|
|
REPEAT_CHARS = "*+?{"
|
|
|
|
DIGITS = tuple("0123456789")
|
|
|
|
OCTDIGITS = tuple("01234567")
|
|
HEXDIGITS = tuple("0123456789abcdefABCDEF")
|
|
|
|
WHITESPACE = tuple(" \t\n\r\v\f")
|
|
|
|
ESCAPES = {
|
|
r"\a": (LITERAL, ord("\a")),
|
|
r"\b": (LITERAL, ord("\b")),
|
|
r"\f": (LITERAL, ord("\f")),
|
|
r"\n": (LITERAL, ord("\n")),
|
|
r"\r": (LITERAL, ord("\r")),
|
|
r"\t": (LITERAL, ord("\t")),
|
|
r"\v": (LITERAL, ord("\v")),
|
|
r"\\": (LITERAL, ord("\\"))
|
|
}
|
|
|
|
CATEGORIES = {
|
|
r"\A": (AT, AT_BEGINNING_STRING), # start of string
|
|
r"\b": (AT, AT_BOUNDARY),
|
|
r"\B": (AT, AT_NON_BOUNDARY),
|
|
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
|
|
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
|
|
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
|
|
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
|
|
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
|
|
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
|
|
r"\Z": (AT, AT_END_STRING), # end of string
|
|
}
|
|
|
|
FLAGS = {
|
|
# standard flags
|
|
"i": SRE_FLAG_IGNORECASE,
|
|
"L": SRE_FLAG_LOCALE,
|
|
"m": SRE_FLAG_MULTILINE,
|
|
"s": SRE_FLAG_DOTALL,
|
|
"x": SRE_FLAG_VERBOSE,
|
|
# extensions
|
|
"t": SRE_FLAG_TEMPLATE,
|
|
"u": SRE_FLAG_UNICODE,
|
|
}
|
|
|
|
class Pattern:
|
|
# master pattern object. keeps track of global attributes
|
|
def __init__(self):
|
|
self.flags = 0
|
|
self.open = []
|
|
self.groups = 1
|
|
self.groupdict = {}
|
|
def opengroup(self, name=None):
|
|
gid = self.groups
|
|
self.groups = gid + 1
|
|
if name is not None:
|
|
ogid = self.groupdict.get(name, None)
|
|
if ogid is not None:
|
|
raise error, ("redefinition of group name %s as group %d; "
|
|
"was group %d" % (repr(name), gid, ogid))
|
|
self.groupdict[name] = gid
|
|
self.open.append(gid)
|
|
return gid
|
|
def closegroup(self, gid):
|
|
self.open.remove(gid)
|
|
def checkgroup(self, gid):
|
|
return gid < self.groups and gid not in self.open
|
|
|
|
class SubPattern:
|
|
# a subpattern, in intermediate form
|
|
def __init__(self, pattern, data=None):
|
|
self.pattern = pattern
|
|
if data is None:
|
|
data = []
|
|
self.data = data
|
|
self.width = None
|
|
def dump(self, level=0):
|
|
nl = 1
|
|
seqtypes = type(()), type([])
|
|
for op, av in self.data:
|
|
print level*" " + op,; nl = 0
|
|
if op == "in":
|
|
# member sublanguage
|
|
print; nl = 1
|
|
for op, a in av:
|
|
print (level+1)*" " + op, a
|
|
elif op == "branch":
|
|
print; nl = 1
|
|
i = 0
|
|
for a in av[1]:
|
|
if i > 0:
|
|
print level*" " + "or"
|
|
a.dump(level+1); nl = 1
|
|
i = i + 1
|
|
elif type(av) in seqtypes:
|
|
for a in av:
|
|
if isinstance(a, SubPattern):
|
|
if not nl: print
|
|
a.dump(level+1); nl = 1
|
|
else:
|
|
print a, ; nl = 0
|
|
else:
|
|
print av, ; nl = 0
|
|
if not nl: print
|
|
def __repr__(self):
|
|
return repr(self.data)
|
|
def __len__(self):
|
|
return len(self.data)
|
|
def __delitem__(self, index):
|
|
del self.data[index]
|
|
def __getitem__(self, index):
|
|
return self.data[index]
|
|
def __setitem__(self, index, code):
|
|
self.data[index] = code
|
|
def __getslice__(self, start, stop):
|
|
return SubPattern(self.pattern, self.data[start:stop])
|
|
def insert(self, index, code):
|
|
self.data.insert(index, code)
|
|
def append(self, code):
|
|
self.data.append(code)
|
|
def getwidth(self):
|
|
# determine the width (min, max) for this subpattern
|
|
if self.width:
|
|
return self.width
|
|
lo = hi = 0L
|
|
UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
|
|
REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
|
|
for op, av in self.data:
|
|
if op is BRANCH:
|
|
i = sys.maxint
|
|
j = 0
|
|
for av in av[1]:
|
|
l, h = av.getwidth()
|
|
i = min(i, l)
|
|
j = max(j, h)
|
|
lo = lo + i
|
|
hi = hi + j
|
|
elif op is CALL:
|
|
i, j = av.getwidth()
|
|
lo = lo + i
|
|
hi = hi + j
|
|
elif op is SUBPATTERN:
|
|
i, j = av[1].getwidth()
|
|
lo = lo + i
|
|
hi = hi + j
|
|
elif op in REPEATCODES:
|
|
i, j = av[2].getwidth()
|
|
lo = lo + long(i) * av[0]
|
|
hi = hi + long(j) * av[1]
|
|
elif op in UNITCODES:
|
|
lo = lo + 1
|
|
hi = hi + 1
|
|
elif op == SUCCESS:
|
|
break
|
|
self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
|
|
return self.width
|
|
|
|
class Tokenizer:
|
|
def __init__(self, string):
|
|
self.string = string
|
|
self.index = 0
|
|
self.__next()
|
|
def __next(self):
|
|
if self.index >= len(self.string):
|
|
self.next = None
|
|
return
|
|
char = self.string[self.index]
|
|
if char[0] == "\\":
|
|
try:
|
|
c = self.string[self.index + 1]
|
|
except IndexError:
|
|
raise error, "bogus escape (end of line)"
|
|
char = char + c
|
|
self.index = self.index + len(char)
|
|
self.next = char
|
|
def match(self, char, skip=1):
|
|
if char == self.next:
|
|
if skip:
|
|
self.__next()
|
|
return 1
|
|
return 0
|
|
def get(self):
|
|
this = self.next
|
|
self.__next()
|
|
return this
|
|
def tell(self):
|
|
return self.index, self.next
|
|
def seek(self, index):
|
|
self.index, self.next = index
|
|
|
|
def isident(char):
|
|
return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
|
|
|
|
def isdigit(char):
|
|
return "0" <= char <= "9"
|
|
|
|
def isname(name):
|
|
# check that group name is a valid string
|
|
if not isident(name[0]):
|
|
return False
|
|
for char in name:
|
|
if not isident(char) and not isdigit(char):
|
|
return False
|
|
return True
|
|
|
|
def _group(escape, groups):
|
|
# check if the escape string represents a valid group
|
|
try:
|
|
gid = int(escape[1:])
|
|
if gid and gid < groups:
|
|
return gid
|
|
except ValueError:
|
|
pass
|
|
return None # not a valid group
|
|
|
|
def _class_escape(source, escape):
|
|
# handle escape code inside character class
|
|
code = ESCAPES.get(escape)
|
|
if code:
|
|
return code
|
|
code = CATEGORIES.get(escape)
|
|
if code:
|
|
return code
|
|
try:
|
|
if escape[1:2] == "x":
|
|
# hexadecimal escape (exactly two digits)
|
|
while source.next in HEXDIGITS and len(escape) < 4:
|
|
escape = escape + source.get()
|
|
escape = escape[2:]
|
|
if len(escape) != 2:
|
|
raise error, "bogus escape: %s" % repr("\\" + escape)
|
|
return LITERAL, int(escape, 16) & 0xff
|
|
elif escape[1:2] in OCTDIGITS:
|
|
# octal escape (up to three digits)
|
|
while source.next in OCTDIGITS and len(escape) < 5:
|
|
escape = escape + source.get()
|
|
escape = escape[1:]
|
|
return LITERAL, int(escape, 8) & 0xff
|
|
if len(escape) == 2:
|
|
return LITERAL, ord(escape[1])
|
|
except ValueError:
|
|
pass
|
|
raise error, "bogus escape: %s" % repr(escape)
|
|
|
|
def _escape(source, escape, state):
|
|
# handle escape code in expression
|
|
code = CATEGORIES.get(escape)
|
|
if code:
|
|
return code
|
|
code = ESCAPES.get(escape)
|
|
if code:
|
|
return code
|
|
try:
|
|
if escape[1:2] == "x":
|
|
# hexadecimal escape
|
|
while source.next in HEXDIGITS and len(escape) < 4:
|
|
escape = escape + source.get()
|
|
if len(escape) != 4:
|
|
raise ValueError
|
|
return LITERAL, int(escape[2:], 16) & 0xff
|
|
elif escape[1:2] == "0":
|
|
# octal escape
|
|
while source.next in OCTDIGITS and len(escape) < 4:
|
|
escape = escape + source.get()
|
|
return LITERAL, int(escape[1:], 8) & 0xff
|
|
elif escape[1:2] in DIGITS:
|
|
# octal escape *or* decimal group reference (sigh)
|
|
if source.next in DIGITS:
|
|
escape = escape + source.get()
|
|
if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
|
|
source.next in OCTDIGITS):
|
|
# got three octal digits; this is an octal escape
|
|
escape = escape + source.get()
|
|
return LITERAL, int(escape[1:], 8) & 0xff
|
|
# got at least one decimal digit; this is a group reference
|
|
group = _group(escape, state.groups)
|
|
if group:
|
|
if not state.checkgroup(group):
|
|
raise error, "cannot refer to open group"
|
|
return GROUPREF, group
|
|
raise ValueError
|
|
if len(escape) == 2:
|
|
return LITERAL, ord(escape[1])
|
|
except ValueError:
|
|
pass
|
|
raise error, "bogus escape: %s" % repr(escape)
|
|
|
|
def _parse_sub(source, state, nested=1):
|
|
# parse an alternation: a|b|c
|
|
|
|
items = []
|
|
itemsappend = items.append
|
|
sourcematch = source.match
|
|
while 1:
|
|
itemsappend(_parse(source, state))
|
|
if sourcematch("|"):
|
|
continue
|
|
if not nested:
|
|
break
|
|
if not source.next or sourcematch(")", 0):
|
|
break
|
|
else:
|
|
raise error, "pattern not properly closed"
|
|
|
|
if len(items) == 1:
|
|
return items[0]
|
|
|
|
subpattern = SubPattern(state)
|
|
subpatternappend = subpattern.append
|
|
|
|
# check if all items share a common prefix
|
|
while 1:
|
|
prefix = None
|
|
for item in items:
|
|
if not item:
|
|
break
|
|
if prefix is None:
|
|
prefix = item[0]
|
|
elif item[0] != prefix:
|
|
break
|
|
else:
|
|
# all subitems start with a common "prefix".
|
|
# move it out of the branch
|
|
for item in items:
|
|
del item[0]
|
|
subpatternappend(prefix)
|
|
continue # check next one
|
|
break
|
|
|
|
# check if the branch can be replaced by a character set
|
|
for item in items:
|
|
if len(item) != 1 or item[0][0] != LITERAL:
|
|
break
|
|
else:
|
|
# we can store this as a character set instead of a
|
|
# branch (the compiler may optimize this even more)
|
|
set = []
|
|
setappend = set.append
|
|
for item in items:
|
|
setappend(item[0])
|
|
subpatternappend((IN, set))
|
|
return subpattern
|
|
|
|
subpattern.append((BRANCH, (None, items)))
|
|
return subpattern
|
|
|
|
def _parse_sub_cond(source, state, condgroup):
|
|
item_yes = _parse(source, state)
|
|
if source.match("|"):
|
|
item_no = _parse(source, state)
|
|
if source.match("|"):
|
|
raise error, "conditional backref with more than two branches"
|
|
else:
|
|
item_no = None
|
|
if source.next and not source.match(")", 0):
|
|
raise error, "pattern not properly closed"
|
|
subpattern = SubPattern(state)
|
|
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
|
|
return subpattern
|
|
|
|
def _parse(source, state):
|
|
# parse a simple pattern
|
|
subpattern = SubPattern(state)
|
|
|
|
# precompute constants into local variables
|
|
subpatternappend = subpattern.append
|
|
sourceget = source.get
|
|
sourcematch = source.match
|
|
_len = len
|
|
PATTERNENDERS = ("|", ")")
|
|
ASSERTCHARS = ("=", "!", "<")
|
|
LOOKBEHINDASSERTCHARS = ("=", "!")
|
|
REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
|
|
|
|
while 1:
|
|
|
|
if source.next in PATTERNENDERS:
|
|
break # end of subpattern
|
|
this = sourceget()
|
|
if this is None:
|
|
break # end of pattern
|
|
|
|
if state.flags & SRE_FLAG_VERBOSE:
|
|
# skip whitespace and comments
|
|
if this in WHITESPACE:
|
|
continue
|
|
if this == "#":
|
|
while 1:
|
|
this = sourceget()
|
|
if this in (None, "\n"):
|
|
break
|
|
continue
|
|
|
|
if this and this[0] not in SPECIAL_CHARS:
|
|
subpatternappend((LITERAL, ord(this)))
|
|
|
|
elif this == "[":
|
|
# character set
|
|
set = []
|
|
setappend = set.append
|
|
## if sourcematch(":"):
|
|
## pass # handle character classes
|
|
if sourcematch("^"):
|
|
setappend((NEGATE, None))
|
|
# check remaining characters
|
|
start = set[:]
|
|
while 1:
|
|
this = sourceget()
|
|
if this == "]" and set != start:
|
|
break
|
|
elif this and this[0] == "\\":
|
|
code1 = _class_escape(source, this)
|
|
elif this:
|
|
code1 = LITERAL, ord(this)
|
|
else:
|
|
raise error, "unexpected end of regular expression"
|
|
if sourcematch("-"):
|
|
# potential range
|
|
this = sourceget()
|
|
if this == "]":
|
|
if code1[0] is IN:
|
|
code1 = code1[1][0]
|
|
setappend(code1)
|
|
setappend((LITERAL, ord("-")))
|
|
break
|
|
elif this:
|
|
if this[0] == "\\":
|
|
code2 = _class_escape(source, this)
|
|
else:
|
|
code2 = LITERAL, ord(this)
|
|
if code1[0] != LITERAL or code2[0] != LITERAL:
|
|
raise error, "bad character range"
|
|
lo = code1[1]
|
|
hi = code2[1]
|
|
if hi < lo:
|
|
raise error, "bad character range"
|
|
setappend((RANGE, (lo, hi)))
|
|
else:
|
|
raise error, "unexpected end of regular expression"
|
|
else:
|
|
if code1[0] is IN:
|
|
code1 = code1[1][0]
|
|
setappend(code1)
|
|
|
|
# XXX: <fl> should move set optimization to compiler!
|
|
if _len(set)==1 and set[0][0] is LITERAL:
|
|
subpatternappend(set[0]) # optimization
|
|
elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
|
|
subpatternappend((NOT_LITERAL, set[1][1])) # optimization
|
|
else:
|
|
# XXX: <fl> should add charmap optimization here
|
|
subpatternappend((IN, set))
|
|
|
|
elif this and this[0] in REPEAT_CHARS:
|
|
# repeat previous item
|
|
if this == "?":
|
|
min, max = 0, 1
|
|
elif this == "*":
|
|
min, max = 0, MAXREPEAT
|
|
|
|
elif this == "+":
|
|
min, max = 1, MAXREPEAT
|
|
elif this == "{":
|
|
here = source.tell()
|
|
min, max = 0, MAXREPEAT
|
|
lo = hi = ""
|
|
while source.next in DIGITS:
|
|
lo = lo + source.get()
|
|
if sourcematch(","):
|
|
while source.next in DIGITS:
|
|
hi = hi + sourceget()
|
|
else:
|
|
hi = lo
|
|
if not sourcematch("}"):
|
|
subpatternappend((LITERAL, ord(this)))
|
|
source.seek(here)
|
|
continue
|
|
if lo:
|
|
min = int(lo)
|
|
if hi:
|
|
max = int(hi)
|
|
if max < min:
|
|
raise error, "bad repeat interval"
|
|
else:
|
|
raise error, "not supported"
|
|
# figure out which item to repeat
|
|
if subpattern:
|
|
item = subpattern[-1:]
|
|
else:
|
|
item = None
|
|
if not item or (_len(item) == 1 and item[0][0] == AT):
|
|
raise error, "nothing to repeat"
|
|
if item[0][0] in REPEATCODES:
|
|
raise error, "multiple repeat"
|
|
if sourcematch("?"):
|
|
subpattern[-1] = (MIN_REPEAT, (min, max, item))
|
|
else:
|
|
subpattern[-1] = (MAX_REPEAT, (min, max, item))
|
|
|
|
elif this == ".":
|
|
subpatternappend((ANY, None))
|
|
|
|
elif this == "(":
|
|
group = 1
|
|
name = None
|
|
condgroup = None
|
|
if sourcematch("?"):
|
|
group = 0
|
|
# options
|
|
if sourcematch("P"):
|
|
# python extensions
|
|
if sourcematch("<"):
|
|
# named group: skip forward to end of name
|
|
name = ""
|
|
while 1:
|
|
char = sourceget()
|
|
if char is None:
|
|
raise error, "unterminated name"
|
|
if char == ">":
|
|
break
|
|
name = name + char
|
|
group = 1
|
|
if not isname(name):
|
|
raise error, "bad character in group name"
|
|
elif sourcematch("="):
|
|
# named backreference
|
|
name = ""
|
|
while 1:
|
|
char = sourceget()
|
|
if char is None:
|
|
raise error, "unterminated name"
|
|
if char == ")":
|
|
break
|
|
name = name + char
|
|
if not isname(name):
|
|
raise error, "bad character in group name"
|
|
gid = state.groupdict.get(name)
|
|
if gid is None:
|
|
raise error, "unknown group name"
|
|
subpatternappend((GROUPREF, gid))
|
|
continue
|
|
else:
|
|
char = sourceget()
|
|
if char is None:
|
|
raise error, "unexpected end of pattern"
|
|
raise error, "unknown specifier: ?P%s" % char
|
|
elif sourcematch(":"):
|
|
# non-capturing group
|
|
group = 2
|
|
elif sourcematch("#"):
|
|
# comment
|
|
while 1:
|
|
if source.next is None or source.next == ")":
|
|
break
|
|
sourceget()
|
|
if not sourcematch(")"):
|
|
raise error, "unbalanced parenthesis"
|
|
continue
|
|
elif source.next in ASSERTCHARS:
|
|
# lookahead assertions
|
|
char = sourceget()
|
|
dir = 1
|
|
if char == "<":
|
|
if source.next not in LOOKBEHINDASSERTCHARS:
|
|
raise error, "syntax error"
|
|
dir = -1 # lookbehind
|
|
char = sourceget()
|
|
p = _parse_sub(source, state)
|
|
if not sourcematch(")"):
|
|
raise error, "unbalanced parenthesis"
|
|
if char == "=":
|
|
subpatternappend((ASSERT, (dir, p)))
|
|
else:
|
|
subpatternappend((ASSERT_NOT, (dir, p)))
|
|
continue
|
|
elif sourcematch("("):
|
|
# conditional backreference group
|
|
condname = ""
|
|
while 1:
|
|
char = sourceget()
|
|
if char is None:
|
|
raise error, "unterminated name"
|
|
if char == ")":
|
|
break
|
|
condname = condname + char
|
|
group = 2
|
|
if isname(condname):
|
|
condgroup = state.groupdict.get(condname)
|
|
if condgroup is None:
|
|
raise error, "unknown group name"
|
|
else:
|
|
try:
|
|
condgroup = int(condname)
|
|
except ValueError:
|
|
raise error, "bad character in group name"
|
|
else:
|
|
# flags
|
|
if not source.next in FLAGS:
|
|
raise error, "unexpected end of pattern"
|
|
while source.next in FLAGS:
|
|
state.flags = state.flags | FLAGS[sourceget()]
|
|
if group:
|
|
# parse group contents
|
|
if group == 2:
|
|
# anonymous group
|
|
group = None
|
|
else:
|
|
group = state.opengroup(name)
|
|
if condgroup:
|
|
p = _parse_sub_cond(source, state, condgroup)
|
|
else:
|
|
p = _parse_sub(source, state)
|
|
if not sourcematch(")"):
|
|
raise error, "unbalanced parenthesis"
|
|
if group is not None:
|
|
state.closegroup(group)
|
|
subpatternappend((SUBPATTERN, (group, p)))
|
|
else:
|
|
while 1:
|
|
char = sourceget()
|
|
if char is None:
|
|
raise error, "unexpected end of pattern"
|
|
if char == ")":
|
|
break
|
|
raise error, "unknown extension"
|
|
|
|
elif this == "^":
|
|
subpatternappend((AT, AT_BEGINNING))
|
|
|
|
elif this == "$":
|
|
subpattern.append((AT, AT_END))
|
|
|
|
elif this and this[0] == "\\":
|
|
code = _escape(source, this, state)
|
|
subpatternappend(code)
|
|
|
|
else:
|
|
raise error, "parser error"
|
|
|
|
return subpattern
|
|
|
|
def parse(str, flags=0, pattern=None):
|
|
# parse 're' pattern into list of (opcode, argument) tuples
|
|
|
|
source = Tokenizer(str)
|
|
|
|
if pattern is None:
|
|
pattern = Pattern()
|
|
pattern.flags = flags
|
|
pattern.str = str
|
|
|
|
p = _parse_sub(source, pattern, 0)
|
|
|
|
tail = source.get()
|
|
if tail == ")":
|
|
raise error, "unbalanced parenthesis"
|
|
elif tail:
|
|
raise error, "bogus characters at end of regular expression"
|
|
|
|
if flags & SRE_FLAG_DEBUG:
|
|
p.dump()
|
|
|
|
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
|
|
# the VERBOSE flag was switched on inside the pattern. to be
|
|
# on the safe side, we'll parse the whole thing again...
|
|
return parse(str, p.pattern.flags)
|
|
|
|
return p
|
|
|
|
def parse_template(source, pattern):
|
|
# parse 're' replacement string into list of literals and
|
|
# group references
|
|
s = Tokenizer(source)
|
|
sget = s.get
|
|
p = []
|
|
a = p.append
|
|
def literal(literal, p=p, pappend=a):
|
|
if p and p[-1][0] is LITERAL:
|
|
p[-1] = LITERAL, p[-1][1] + literal
|
|
else:
|
|
pappend((LITERAL, literal))
|
|
sep = source[:0]
|
|
if type(sep) is type(""):
|
|
makechar = chr
|
|
else:
|
|
makechar = unichr
|
|
while 1:
|
|
this = sget()
|
|
if this is None:
|
|
break # end of replacement string
|
|
if this and this[0] == "\\":
|
|
# group
|
|
if this == "\\g":
|
|
name = ""
|
|
if s.match("<"):
|
|
while 1:
|
|
char = sget()
|
|
if char is None:
|
|
raise error, "unterminated group name"
|
|
if char == ">":
|
|
break
|
|
name = name + char
|
|
if not name:
|
|
raise error, "bad group name"
|
|
try:
|
|
index = int(name)
|
|
except ValueError:
|
|
if not isname(name):
|
|
raise error, "bad character in group name"
|
|
try:
|
|
index = pattern.groupindex[name]
|
|
except KeyError:
|
|
raise IndexError, "unknown group name"
|
|
a((MARK, index))
|
|
elif len(this) > 1 and this[1] in DIGITS:
|
|
code = None
|
|
while 1:
|
|
group = _group(this, pattern.groups+1)
|
|
if group:
|
|
if (s.next not in DIGITS or
|
|
not _group(this + s.next, pattern.groups+1)):
|
|
code = MARK, group
|
|
break
|
|
elif s.next in OCTDIGITS:
|
|
this = this + sget()
|
|
else:
|
|
break
|
|
if not code:
|
|
this = this[1:]
|
|
code = LITERAL, makechar(int(this[-6:], 8) & 0xff)
|
|
if code[0] is LITERAL:
|
|
literal(code[1])
|
|
else:
|
|
a(code)
|
|
else:
|
|
try:
|
|
this = makechar(ESCAPES[this][1])
|
|
except KeyError:
|
|
pass
|
|
literal(this)
|
|
else:
|
|
literal(this)
|
|
# convert template to groups and literals lists
|
|
i = 0
|
|
groups = []
|
|
groupsappend = groups.append
|
|
literals = [None] * len(p)
|
|
for c, s in p:
|
|
if c is MARK:
|
|
groupsappend((i, s))
|
|
# literal[i] is already None
|
|
else:
|
|
literals[i] = s
|
|
i = i + 1
|
|
return groups, literals
|
|
|
|
def expand_template(template, match):
|
|
g = match.group
|
|
sep = match.string[:0]
|
|
groups, literals = template
|
|
literals = literals[:]
|
|
try:
|
|
for index, group in groups:
|
|
literals[index] = s = g(group)
|
|
if s is None:
|
|
raise IndexError
|
|
except IndexError:
|
|
raise error, "empty group"
|
|
return sep.join(literals)
|