Moved gencodec.py to the Tools/unicode/ directory.
Added new support for decoding tables. Cleaned up the implementation a bit.
parent 3144130217
commit c5694c8bf4
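For context: a charmap codec can drive decoding either with a dictionary keyed by byte value (a decoding_map) or, after this change, with an indexable sequence of Unicode characters (a decoding_table). A minimal sketch, assuming a Python 2.x interpreter whose charmap decoder accepts a Unicode string as the mapping argument; the values below are illustrative and not taken from the commit:

import codecs

# Dictionary-style decoding map: byte value -> Unicode code point.
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
    0x80: 0x20ac,   # hypothetical override, cp1252-style EURO SIGN
})

# Table-style decoding map: a 256-character Unicode string indexed by
# byte value; u'\ufffe' marks undefined positions.
decoding_table = (
    u''.join(map(unichr, range(0x80))) +    # identity for 0x00-0x7F
    u'\u20ac' +                             # 0x80 -> EURO SIGN
    u'\ufffe' * 127                         # 0x81-0xFF undefined
)

print repr(codecs.charmap_decode('\x41\x80', 'strict', decoding_map)[0])
print repr(codecs.charmap_decode('\x41\x80', 'strict', decoding_table)[0])
# both print u'A\u20ac'

Presumably the point of emitting a table next to the map is that the decoder can index it directly instead of doing a dictionary lookup per byte, but the commit itself does not spell that out.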
@@ -15,17 +15,22 @@ lowercase with hyphens replaced by underscores.
 The tool also writes marshalled versions of the mapping tables to the
 same location (with .mapping extension).

-Written by Marc-Andre Lemburg (mal@lemburg.com).
+Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate
+Unicode table maps for decoding.

 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
 (c) Copyright Guido van Rossum, 2000.
+(c) Copyright Marc-Andre Lemburg, 2005.

 """#"

-import re,os,time,marshal
+import re, os, time, marshal, codecs

-# Create numeric tables or character based ones ?
-numeric = 1
+# Maximum allowed size of charmap tables
+MAX_TABLE_SIZE = 8192
+
+# Standard undefined Unicode code point
+UNI_UNDEFINED = unichr(0xFFFE)

 mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
                    '\s+'
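The mapRE pattern above (only partially visible in this hunk) targets UTC-style mapping files, where each line pairs an encoded value with a Unicode code point and an optional comment. A simplified, hypothetical parsing sketch for one such line; this is not the regex from the file, which also handles multi-code sequences:

import re

# Hypothetical simplified matcher for lines such as:
#   0x80  0x20AC  # EURO SIGN
line_re = re.compile(r'(0x[0-9A-Fa-f]+)\s+(0x[0-9A-Fa-f]+)?\s*(#.*)?$')

def parse_line(line):
    m = line_re.match(line.strip())
    if not m:
        return None
    enc, uni, comment = m.groups()
    return (int(enc, 16),
            uni and int(uni, 16) or None,
            comment and comment[1:].strip() or '')

print parse_line('0x80  0x20AC  # EURO SIGN')
# (128, 8364, 'EURO SIGN')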
@@ -69,8 +74,15 @@ def readmap(filename):
     enc2uni = {}
     identity = []
     unmapped = range(256)
-    for i in range(256):
-        unmapped[i] = i
+
+    # UTC mapping tables per convention don't include the identity
+    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
+    # explicitly mapped to different characters or undefined
+    for i in range(32) + [127]:
+        identity.append(i)
+        unmapped.remove(i)
+        enc2uni[i] = (i, 'CONTROL CHARACTER')
+
     for line in lines:
         line = line.strip()
         if not line or line[0] == '#':
@@ -82,22 +94,23 @@ def readmap(filename):
         enc,uni,comment = m.groups()
         enc = parsecodes(enc)
         uni = parsecodes(uni)
-        if not comment:
+        if comment is None:
             comment = ''
         else:
-            comment = comment[1:]
+            comment = comment[1:].strip()
         if enc < 256:
-            unmapped.remove(enc)
+            if enc in unmapped:
+                unmapped.remove(enc)
             if enc == uni:
                 identity.append(enc)
-            else:
-                enc2uni[enc] = (uni,comment)
+            enc2uni[enc] = (uni,comment)
         else:
             enc2uni[enc] = (uni,comment)

     # If there are more identity-mapped entries than unmapped entries,
     # it pays to generate an identity dictionary first, and add explicit
     # mappings to None for the rest
-    if len(identity)>=len(unmapped):
+    if len(identity) >= len(unmapped):
         for enc in unmapped:
             enc2uni[enc] = (None, "")
         enc2uni['IDENTITY'] = 256
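The net effect of readmap() after these two hunks is a dictionary of the following shape. This is a hand-built illustration of the conventions visible above (identity runs collapsed into an 'IDENTITY' marker, explicitly unmapped bytes set to None), not output captured from the tool:

toy_map = {
    'IDENTITY': 256,               # bytes 0x00-0xFF map to themselves by default
    0x80: (0x20ac, 'EURO SIGN'),   # explicit override (illustrative)
    0x81: (None, ''),              # explicitly undefined position
}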
@@ -112,44 +125,146 @@ def hexrepr(t):
         len(t)
     except:
         return '0x%04x' % t
-    return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
-
-def unicoderepr(t):
-
-    if t is None:
-        return 'None'
-    if numeric:
-        return hexrepr(t)
-    else:
-        try:
-            len(t)
-        except:
-            return repr(unichr(t))
-        return repr(''.join(map(unichr, t)))
-
-def keyrepr(t):
-
-    if t is None:
-        return 'None'
-    if numeric:
-        return hexrepr(t)
-    else:
-        try:
-            len(t)
-        except:
-            if t < 256:
-                return repr(chr(t))
-            else:
-                return repr(unichr(t))
-        return repr(''.join(map(chr, t)))
-
-def codegen(name,map,comments=1):
+    try:
+        return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
+    except TypeError, why:
+        print '* failed to convert %r: %s' % (t, why)
+        raise
+
+def python_mapdef_code(varname, map, comments=1):
+
+    l = []
+    append = l.append
+    if map.has_key("IDENTITY"):
+        append("%s = codecs.make_identity_dict(range(%d))" %
+               (varname, map["IDENTITY"]))
+        append("%s.update({" % varname)
+        splits = 1
+        del map["IDENTITY"]
+        identity = 1
+    else:
+        append("%s = {" % varname)
+        splits = 0
+        identity = 0
+
+    mappings = map.items()
+    mappings.sort()
+    i = 0
+    for mapkey, mapvalue in mappings:
+        mapcomment = ''
+        if isinstance(mapkey, tuple):
+            (mapkey, mapcomment) = mapkey
+        if isinstance(mapvalue, tuple):
+            (mapvalue, mapcomment) = mapvalue
+        if mapkey is None:
+            continue
+        if (identity and
+            mapkey == mapvalue and
+            mapkey < 256):
+            # No need to include identity mappings, since these
+            # are already set for the first 256 code points.
+            continue
+        key = hexrepr(mapkey)
+        value = hexrepr(mapvalue)
+        if mapcomment and comments:
+            append(' %s: %s,\t# %s' % (key, value, mapcomment))
+        else:
+            append(' %s: %s,' % (key, value))
+        i += 1
+        if i == 4096:
+            # Split the definition into parts to that the Python
+            # parser doesn't dump core
+            if splits == 0:
+                append('}')
+            else:
+                append('})')
+            append('%s.update({' % varname)
+            i = 0
+            splits = splits + 1
+    if splits == 0:
+        append('}')
+    else:
+        append('})')
+
+    return l
+
+def python_tabledef_code(varname, map, comments=1):
+
+    l = []
+    append = l.append
+    append('%s = (' % varname)
+
+    # Analyze map and create table dict
+    mappings = map.items()
+    mappings.sort()
+    table = {}
+    maxkey = 0
+    if map.has_key('IDENTITY'):
+        for key in range(256):
+            table[key] = (key, '')
+        maxkey = 255
+        del map['IDENTITY']
+    for mapkey, mapvalue in mappings:
+        mapcomment = ''
+        if isinstance(mapkey, tuple):
+            (mapkey, mapcomment) = mapkey
+        if isinstance(mapvalue, tuple):
+            (mapvalue, mapcomment) = mapvalue
+        if mapkey is None:
+            continue
+        table[mapkey] = (mapvalue, mapcomment)
+        if mapkey > maxkey:
+            maxkey = mapkey
+    if maxkey > MAX_TABLE_SIZE:
+        # Table too large
+        return None
+
+    # Create table code
+    for key in range(maxkey + 1):
+        if key not in table:
+            mapvalue = None
+            mapcomment = 'UNDEFINED'
+        else:
+            mapvalue, mapcomment = table[key]
+        if mapvalue is None:
+            mapchar = UNI_UNDEFINED
+        else:
+            if isinstance(mapvalue, tuple):
+                # 1-n mappings not supported
+                return None
+            else:
+                mapchar = unichr(mapvalue)
+        if mapcomment and comments:
+            append(' %r\t# %s -> %s' % (mapchar,
+                                        hexrepr(key),
+                                        mapcomment))
+        else:
+            append(' %r' % mapchar)
+
+    append(')')
+    return l
+
+def codegen(name, map, comments=1):

     """ Returns Python source for the given map.

         Comments are included in the source, if comments is true (default).

     """
+    # Generate code
+    decoding_map_code = python_mapdef_code(
+        'decoding_map',
+        map,
+        comments=comments)
+    decoding_table_code = python_tabledef_code(
+        'decoding_table',
+        map,
+        comments=comments)
+    encoding_map_code = python_mapdef_code(
+        'encoding_map',
+        codecs.make_encoding_map(map),
+        comments=comments)
+
     l = [
         '''\
 """ Python Character Mapping Codec generated from '%s' with gencodec.py.
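Note that the table writer above emits one quoted character per code point and no trailing commas, so the parenthesized block collapses into a single Unicode string through implicit literal concatenation, with UNI_UNDEFINED (U+FFFE) filling undefined slots. A hand-written sketch of that output shape (three entries only, values illustrative rather than generated):

decoding_table = (
    u'\x00'     # 0x0000 -> NULL
    u'\u20ac'   # 0x0001 -> EURO SIGN (illustrative)
    u'\ufffe'   # 0x0002 -> UNDEFINED
)

print type(decoding_table), len(decoding_table)
# <type 'unicode'> 3  -- one character per byte value, indexed by the byte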
@@ -167,9 +282,16 @@ class Codec(codecs.Codec):
         return codecs.charmap_encode(input,errors,encoding_map)

     def decode(self,input,errors='strict'):
-
-        return codecs.charmap_decode(input,errors,decoding_map)
+''' % name
+        ]
+    if decoding_table_code:
+        l.append('''\
+        return codecs.charmap_decode(input,errors,decoding_table)''')
+    else:
+        l.append('''\
+        return codecs.charmap_decode(input,errors,decoding_map)''')

+    l.append('''
 class StreamWriter(Codec,codecs.StreamWriter):
     pass

@@ -183,54 +305,21 @@ def getregentry():
     return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

 ### Decoding Map
-''' % name,
-        ]
-
-    if map.has_key("IDENTITY"):
-        l.append("decoding_map = codecs.make_identity_dict(range(%d))"
-                 % map["IDENTITY"])
-        l.append("decoding_map.update({")
-        splits = 1
-        del map["IDENTITY"]
-    else:
-        l.append("decoding_map = {")
-        splits = 0
-
-    mappings = map.items()
-    mappings.sort()
-    append = l.append
-    i = 0
-    for e,value in mappings:
-        try:
-            (u,c) = value
-        except TypeError:
-            u = value
-            c = ''
-        key = keyrepr(e)
-        if c and comments:
-            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
-        else:
-            append('\t%s: %s,' % (key,unicoderepr(u)))
-        i += 1
-        if i == 4096:
-            # Split the definition into parts to that the Python
-            # parser doesn't dump core
-            if splits == 0:
-                append('}')
-            else:
-                append('})')
-            append('decoding_map.update({')
-            i = 0
-            splits = splits + 1
-    if splits == 0:
-        append('}')
-    else:
-        append('})')
-    append('''
-### Encoding Map
-
-encoding_map = codecs.make_encoding_map(decoding_map)
 ''')
+    l.extend(decoding_map_code)
+
+    # Add optional decoding table
+    if decoding_table_code:
+        l.append('''
+### Decoding Table
+''')
+        l.extend(decoding_table_code)
+
+    l.append('''
+### Encoding Map
+''')
+    l.extend(encoding_map_code)

     return '\n'.join(l)

 def pymap(name,map,pyfile,comments=1):
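Put together, a module emitted by the new codegen() has roughly the layout sketched below. This is an abbreviated illustration only: the real output also defines StreamWriter, StreamReader and getregentry(), the mapping contents are generated from the input file rather than hard-coded, and the decoding table section is omitted when python_tabledef_code() returns None. 'MAPPING' is a placeholder name.

""" Python Character Mapping Codec generated from 'MAPPING' with gencodec.py.
"""
import codecs

class Codec(codecs.Codec):
    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_map)
    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_table)

### Decoding Map
decoding_map = codecs.make_identity_dict(range(256))

### Decoding Table
decoding_table = u''.join(map(unichr, range(256)))

### Encoding Map
encoding_map = codecs.make_encoding_map(decoding_map)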
@@ -253,6 +342,7 @@ def convertdir(dir,prefix='',comments=1):

     mapnames = os.listdir(dir)
     for mapname in mapnames:
+        mappathname = os.path.join(dir, mapname)
         name = os.path.split(mapname)[1]
         name = name.replace('-','_')
         name = name.split('.')[0]
@@ -267,10 +357,11 @@ def convertdir(dir,prefix='',comments=1):
             if not map:
                 print '* map is empty; skipping'
             else:
-                pymap(mapname, map, prefix + codefile,comments)
-                marshalmap(mapname, map, prefix + marshalfile)
-        except ValueError:
-            print '* conversion failed'
+                pymap(mappathname, map, prefix + codefile,comments)
+                marshalmap(mappathname, map, prefix + marshalfile)
+        except ValueError, why:
+            print '* conversion failed: %s' % why
+            raise

 def rewritepythondir(dir,prefix='',comments=1):
