Moved gencodec.py to the Tools/unicode/ directory.

Added new support for decoding tables.

Cleaned up the implementation a bit.
This commit is contained in:
Marc-André Lemburg 2005-10-21 13:45:17 +00:00
parent 3144130217
commit c5694c8bf4

View File

@ -15,17 +15,22 @@ lowercase with hyphens replaced by underscores.
The tool also writes marshalled versions of the mapping tables to the The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension). same location (with .mapping extension).
Written by Marc-Andre Lemburg (mal@lemburg.com). Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate
Unicode table maps for decoding.
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000. (c) Copyright Guido van Rossum, 2000.
(c) Copyright Marc-Andre Lemburg, 2005.
"""#" """#"
import re,os,time,marshal import re, os, time, marshal, codecs
# Create numeric tables or character based ones ? # Maximum allowed size of charmap tables
numeric = 1 MAX_TABLE_SIZE = 8192
# Standard undefined Unicode code point
UNI_UNDEFINED = unichr(0xFFFE)
mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)' mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
'\s+' '\s+'
@ -69,8 +74,15 @@ def readmap(filename):
enc2uni = {} enc2uni = {}
identity = [] identity = []
unmapped = range(256) unmapped = range(256)
for i in range(256):
unmapped[i] = i # UTC mapping tables per convention don't include the identity
# mappings for code points 0x00 - 0x1F and 0x7F, unless these are
# explicitly mapped to different characters or undefined
for i in range(32) + [127]:
identity.append(i)
unmapped.remove(i)
enc2uni[i] = (i, 'CONTROL CHARACTER')
for line in lines: for line in lines:
line = line.strip() line = line.strip()
if not line or line[0] == '#': if not line or line[0] == '#':
@ -82,22 +94,23 @@ def readmap(filename):
enc,uni,comment = m.groups() enc,uni,comment = m.groups()
enc = parsecodes(enc) enc = parsecodes(enc)
uni = parsecodes(uni) uni = parsecodes(uni)
if not comment: if comment is None:
comment = '' comment = ''
else: else:
comment = comment[1:] comment = comment[1:].strip()
if enc < 256: if enc < 256:
unmapped.remove(enc) if enc in unmapped:
unmapped.remove(enc)
if enc == uni: if enc == uni:
identity.append(enc) identity.append(enc)
else: enc2uni[enc] = (uni,comment)
enc2uni[enc] = (uni,comment)
else: else:
enc2uni[enc] = (uni,comment) enc2uni[enc] = (uni,comment)
# If there are more identity-mapped entries than unmapped entries, # If there are more identity-mapped entries than unmapped entries,
# it pays to generate an identity dictionary first, and add explicit # it pays to generate an identity dictionary first, and add explicit
# mappings to None for the rest # mappings to None for the rest
if len(identity)>=len(unmapped): if len(identity) >= len(unmapped):
for enc in unmapped: for enc in unmapped:
enc2uni[enc] = (None, "") enc2uni[enc] = (None, "")
enc2uni['IDENTITY'] = 256 enc2uni['IDENTITY'] = 256
@ -112,44 +125,146 @@ def hexrepr(t):
len(t) len(t)
except: except:
return '0x%04x' % t return '0x%04x' % t
return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')' try:
return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
except TypeError, why:
print '* failed to convert %r: %s' % (t, why)
raise
def unicoderepr(t): def python_mapdef_code(varname, map, comments=1):
if t is None: l = []
return 'None' append = l.append
if numeric: if map.has_key("IDENTITY"):
return hexrepr(t) append("%s = codecs.make_identity_dict(range(%d))" %
(varname, map["IDENTITY"]))
append("%s.update({" % varname)
splits = 1
del map["IDENTITY"]
identity = 1
else: else:
try: append("%s = {" % varname)
len(t) splits = 0
except: identity = 0
return repr(unichr(t))
return repr(''.join(map(unichr, t)))
def keyrepr(t): mappings = map.items()
mappings.sort()
if t is None: i = 0
return 'None' for mapkey, mapvalue in mappings:
if numeric: mapcomment = ''
return hexrepr(t) if isinstance(mapkey, tuple):
else: (mapkey, mapcomment) = mapkey
try: if isinstance(mapvalue, tuple):
len(t) (mapvalue, mapcomment) = mapvalue
except: if mapkey is None:
if t < 256: continue
return repr(chr(t)) if (identity and
mapkey == mapvalue and
mapkey < 256):
# No need to include identity mappings, since these
# are already set for the first 256 code points.
continue
key = hexrepr(mapkey)
value = hexrepr(mapvalue)
if mapcomment and comments:
append(' %s: %s,\t# %s' % (key, value, mapcomment))
else:
append(' %s: %s,' % (key, value))
i += 1
if i == 4096:
# Split the definition into parts so that the Python
# parser doesn't dump core
if splits == 0:
append('}')
else: else:
return repr(unichr(t)) append('})')
return repr(''.join(map(chr, t))) append('%s.update({' % varname)
i = 0
splits = splits + 1
if splits == 0:
append('}')
else:
append('})')
def codegen(name,map,comments=1): return l
def python_tabledef_code(varname, map, comments=1):

    """ Returns the source lines defining a decoding table for map.

        The result is a list of strings: an opening '<varname> = ('
        line, one repr()'ed character per key in range(maxkey + 1)
        and a closing ')' line.  Keys which are missing from map or
        which map to None are emitted as UNI_UNDEFINED.

        Keys and values of map may be given as (item, comment)
        tuples; the comment is then appended to the generated line,
        if comments is true (default).  Entries with a None key are
        skipped.

        If map contains the 'IDENTITY' marker, the first 256 code
        points are pre-filled with identity mappings and the marker
        is removed from map (the caller's dictionary is modified).

        Returns None if no table can be generated: the largest key
        exceeds MAX_TABLE_SIZE or a value is a 1-n mapping (tuple).

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        # Remove the marker *before* taking the snapshot of the
        # mappings: otherwise the ('IDENTITY', 256) pair is processed
        # like a regular mapping, the string key ends up in maxkey
        # and the size check below rejects every such map.
        del map['IDENTITY']
    mappings = list(map.items())
    mappings.sort()
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    # NOTE(review): a map with maxkey == MAX_TABLE_SIZE passes this
    # check and yields MAX_TABLE_SIZE + 1 entries -- confirm whether
    # the limit is meant to be inclusive.
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append(' %r\t# %s -> %s' % (mapchar,
                                        hexrepr(key),
                                        mapcomment))
        else:
            append(' %r' % mapchar)
    append(')')
    return l
def codegen(name, map, comments=1):
""" Returns Python source for the given map. """ Returns Python source for the given map.
Comments are included in the source, if comments is true (default). Comments are included in the source, if comments is true (default).
""" """
# Generate code
decoding_map_code = python_mapdef_code(
'decoding_map',
map,
comments=comments)
decoding_table_code = python_tabledef_code(
'decoding_table',
map,
comments=comments)
encoding_map_code = python_mapdef_code(
'encoding_map',
codecs.make_encoding_map(map),
comments=comments)
l = [ l = [
'''\ '''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py. """ Python Character Mapping Codec generated from '%s' with gencodec.py.
@ -167,9 +282,16 @@ class Codec(codecs.Codec):
return codecs.charmap_encode(input,errors,encoding_map) return codecs.charmap_encode(input,errors,encoding_map)
def decode(self,input,errors='strict'): def decode(self,input,errors='strict'):
''' % name
return codecs.charmap_decode(input,errors,decoding_map) ]
if decoding_table_code:
l.append('''\
return codecs.charmap_decode(input,errors,decoding_table)''')
else:
l.append('''\
return codecs.charmap_decode(input,errors,decoding_map)''')
l.append('''
class StreamWriter(Codec,codecs.StreamWriter): class StreamWriter(Codec,codecs.StreamWriter):
pass pass
@ -183,54 +305,21 @@ def getregentry():
return (Codec().encode,Codec().decode,StreamReader,StreamWriter) return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
### Decoding Map ### Decoding Map
''' % name,
]
if map.has_key("IDENTITY"):
l.append("decoding_map = codecs.make_identity_dict(range(%d))"
% map["IDENTITY"])
l.append("decoding_map.update({")
splits = 1
del map["IDENTITY"]
else:
l.append("decoding_map = {")
splits = 0
mappings = map.items()
mappings.sort()
append = l.append
i = 0
for e,value in mappings:
try:
(u,c) = value
except TypeError:
u = value
c = ''
key = keyrepr(e)
if c and comments:
append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
else:
append('\t%s: %s,' % (key,unicoderepr(u)))
i += 1
if i == 4096:
# Split the definition into parts so that the Python
# parser doesn't dump core
if splits == 0:
append('}')
else:
append('})')
append('decoding_map.update({')
i = 0
splits = splits + 1
if splits == 0:
append('}')
else:
append('})')
append('''
### Encoding Map
encoding_map = codecs.make_encoding_map(decoding_map)
''') ''')
l.extend(decoding_map_code)
# Add optional decoding table
if decoding_table_code:
l.append('''
### Decoding Table
''')
l.extend(decoding_table_code)
l.append('''
### Encoding Map
''')
l.extend(encoding_map_code)
return '\n'.join(l) return '\n'.join(l)
def pymap(name,map,pyfile,comments=1): def pymap(name,map,pyfile,comments=1):
@ -253,6 +342,7 @@ def convertdir(dir,prefix='',comments=1):
mapnames = os.listdir(dir) mapnames = os.listdir(dir)
for mapname in mapnames: for mapname in mapnames:
mappathname = os.path.join(dir, mapname)
name = os.path.split(mapname)[1] name = os.path.split(mapname)[1]
name = name.replace('-','_') name = name.replace('-','_')
name = name.split('.')[0] name = name.split('.')[0]
@ -267,10 +357,11 @@ def convertdir(dir,prefix='',comments=1):
if not map: if not map:
print '* map is empty; skipping' print '* map is empty; skipping'
else: else:
pymap(mapname, map, prefix + codefile,comments) pymap(mappathname, map, prefix + codefile,comments)
marshalmap(mapname, map, prefix + marshalfile) marshalmap(mappathname, map, prefix + marshalfile)
except ValueError: except ValueError, why:
print '* conversion failed' print '* conversion failed: %s' % why
raise
def rewritepythondir(dir,prefix='',comments=1): def rewritepythondir(dir,prefix='',comments=1):