290 lines
7.1 KiB
Python
290 lines
7.1 KiB
Python
|
""" Unicode Mapping Parser and Codec Generator.
|
||
|
|
||
|
This script parses Unicode mapping files as available from the Unicode
|
||
|
site (ftp.unicode.org) and creates Python codec modules from them. The
|
||
|
codecs use the standard character mapping codec to actually apply the
|
||
|
mapping.
|
||
|
|
||
|
Synopsis: gencodec.py dir codec_prefix
|
||
|
|
||
|
All files in dir are scanned and those producing non-empty mappings
|
||
|
will be written to <codec_prefix><mapname>.py with <mapname> being the
|
||
|
first part of the map's filename ('a' in a.b.c.txt) converted to
|
||
|
lowercase with hyphens replaced by underscores.
|
||
|
|
||
|
The tool also writes marshalled versions of the mapping tables to the
|
||
|
same location (with .mapping extension).
|
||
|
|
||
|
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
||
|
|
||
|
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
|
||
|
|
||
|
"""#"
|
||
|
|
||
|
import string,re,os,time,marshal
|
||
|
|
||
|
# Create numeric tables or character based ones ?
numeric = 1

# Matches one data line of a Unicode mapping file.  Groups:
#   1. encoded code(s): one or more 0x... hex codes, joined by '+'
#   2. Unicode code(s): zero or more 0x... codes or <meta> tags,
#      joined by '+'
#   3. optional trailing comment, including its leading '#'
# Raw strings are used so that '\+' and '\s' reach the regex engine
# without relying on Python passing unknown escapes through.
# NOTE(review): the class in group 2 ends in 'A-Z' rather than 'A-F';
# this looks like a typo, but tightening it changes which lines match
# (the pattern is not anchored at the end) -- confirm before changing.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
|
||
|
|
||
|
def parsecodes(codes,

               split=lambda s, sep: s.split(sep), atoi=int, len=len,
               filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x41' or '0x0041+0x0301'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) and any
        other parts that are not valid hexadecimal numbers are
        ignored.

        Returns an integer if exactly one valid code remains, a tuple
        of integers if several remain, and None if codes is empty or
        contains no valid code at all.

        The remaining keyword parameters only bind helper functions
        to local names; split is called as split(codes, '+') and atoi
        as atoi(part, 16).  filter and range are accepted for
        backward compatibility and are not used.

    """
    if not codes:
        return None
    values = []
    for part in split(codes, '+'):
        try:
            values.append(atoi(part, 16))
        except ValueError:
            # meta-code, empty part (trailing '+') or malformed number
            continue
    if not values:
        # Nothing usable was found, e.g. a lone meta-code
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
|
||
|
|
||
|
def readmap(filename,

            strip=lambda s: s.strip()):

    """ Reads the mapping file filename and returns a dictionary
        which maps encoded codes to (unicode code, comment) tuples.

        Codes are converted with parsecodes().  comment is the text
        following the '#' on the line, or '' if the line has none.

        Lines that are empty, start with '#' or do not match mapRE
        are skipped.  Entries whose encoded code equals their Unicode
        code are left out of the result.

        strip is called as strip(line) to remove surrounding
        whitespace; callers normally leave it at its default.

    """
    enc2uni = {}
    # The with-block closes the file even if parsing raises
    with open(filename,'r') as f:
        for line in f:
            line = strip(line)
            if not line or line[0] == '#':
                continue
            m = mapRE.match(line)
            if not m:
                continue
            enc,uni,comment = m.groups()
            enc = parsecodes(enc)
            uni = parsecodes(uni)
            if comment:
                # Drop the leading '#'
                comment = comment[1:]
            else:
                comment = ''
            if enc != uni:
                enc2uni[enc] = (uni,comment)
    return enc2uni
|
||
|
|
||
|
def hexrepr(t,

            join=lambda parts, sep: sep.join(parts)):

    """ Returns the source representation of the code t in
        hexadecimal notation.

        t may be None (giving 'None'), a single integer (giving e.g.
        '0x0041') or a sequence of integers (giving e.g.
        '(0x0041, 0x0301)').

        join is called as join(parts, separator); callers normally
        leave it at its default.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code rather than a sequence
        return '0x%04x' % t
    return '(' + join(['0x%04x' % code for code in t],', ') + ')'
|
||
|
|
||
|
def unicoderepr(t,

                join=lambda parts, sep: sep.join(parts)):

    """ Returns the source representation of the Unicode code(s) t.

        None gives 'None'.  If the module level flag numeric is true,
        the result is hexrepr(t).  Otherwise it is the repr() of the
        string built by applying unichr() to the single code or to
        every code of the sequence.

        join is called as join(parts, separator); callers normally
        leave it at its default.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code rather than a sequence
        return repr(unichr(t))
    return repr(join(map(unichr, t),''))
|
||
|
|
||
|
def keyrepr(t,

            join=lambda parts, sep: sep.join(parts)):

    """ Returns the source representation of the encoded code(s) t
        for use as a dictionary key.

        None gives 'None'.  If the module level flag numeric is true,
        the result is hexrepr(t).  Otherwise a single code below 256
        gives the repr() of chr(t), a larger single code the repr()
        of unichr(t), and a sequence the repr() of the string built
        by applying chr() to every code.

        join is called as join(parts, separator); callers normally
        leave it at its default.

    """
    if t is None:
        return 'None'
    if numeric:
        return hexrepr(t)
    try:
        len(t)
    except TypeError:
        # No length, so t is a single code rather than a sequence
        if t < 256:
            return repr(chr(t))
        return repr(unichr(t))
    return repr(join(map(chr, t),''))
|
||
|
|
||
|
def codegen(name,map,comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.
        map is a dictionary whose keys are encoded codes and whose
        values are either (unicode code, comment) tuples or plain
        unicode codes.

        Comments are included in the source, if comments is true (default).

        The generated decoding_map literal is closed after every 4096
        entries and continued with decoding_map.update({...}) calls.

    """
    l = [
        '''\
""" Python Character Mapping Codec generated from '%s'.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):

        return codecs.charmap_decode(input,errors,decoding_map)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

### Decoding Map

decoding_map = {
''' % name,
        ]
    append = l.append
    i = 0
    splits = 0
    for e,value in sorted(map.items()):
        try:
            (u,c) = value
        except TypeError:
            # Plain code without a comment
            u = value
            c = ''
        key = keyrepr(e)
        if c and comments:
            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
        else:
            append('\t%s: %s,' % (key,unicoderepr(u)))
        i = i + 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            # The update must target the generated dictionary; a bare
            # 'map' in the generated module is the builtin function.
            append('decoding_map.update({')
            i = 0
            splits = splits + 1
    # Close the literal or the last update() call
    if splits == 0:
        append('}')
    else:
        append('})')
    append('''
### Encoding Map

encoding_map = {}
for k,v in decoding_map.items():
    encoding_map[v] = k
''')
    return '\n'.join(l)
|
||
|
|
||
|
def pymap(name,map,pyfile,comments=1):

    """ Writes the Python source which codegen() returns for name,
        map and comments to the file pyfile.

        An existing file of that name is overwritten.

    """
    code = codegen(name,map,comments)
    # The with-block closes the file even if writing fails
    with open(pyfile,'w') as f:
        f.write(code)
|
||
|
|
||
|
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        map must be a dictionary whose values are two-element
        (unicode code, comment) sequences; each value is stored as a
        tuple.  name is accepted but not used.

        An existing file of that name is overwritten.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    # The with-block closes the file even if marshalling fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
|
||
|
|
||
|
def convertdir(dir,prefix='',comments=1):

    """ Converts all mapping files found in the directory dir.

        For every regular file in dir the mapping is read with
        readmap().  If it is non-empty, a codec module is written to
        prefix + <mapname> + '.py' and a marshalled table to
        prefix + <mapname> + '.mapping', where <mapname> is the part
        of the filename before the first '.', converted to lowercase
        with hyphens replaced by underscores.

        Entries of dir which are not regular files (e.g.
        subdirectories) are skipped.  comments is passed on to
        pymap().  Progress messages are printed to stdout; a
        ValueError raised during a conversion is reported and the
        next file is tried.

    """
    for mapname in os.listdir(dir):
        mappath = os.path.join(dir,mapname)
        if not os.path.isfile(mappath):
            # Subdirectories cannot be read as mapping files
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_').split('.')[0].lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            mapping = readmap(mappath)
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, prefix + codefile,comments)
                marshalmap(mapname, mapping, prefix + marshalfile)
        except ValueError:
            print('* conversion failed')
|
||
|
|
||
|
def rewritepythondir(dir,prefix='',comments=1):

    """ Regenerates codec modules from the marshalled mapping tables
        found in the directory dir.

        Every file in dir whose name ends in '.mapping' is loaded
        with marshal; if the loaded table is non-empty, pymap()
        writes the codec module to prefix + <name> + '.py', where
        <name> is the filename without the '.mapping' extension.
        comments is passed on to pymap().

        Progress messages are printed to stdout.  Files which marshal
        cannot load and ValueErrors raised by pymap() are reported
        and the next file is tried.

    """
    extension = '.mapping'
    for mapname in os.listdir(dir):
        if not mapname.endswith(extension):
            continue
        codefile = mapname[:-len(extension)] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            # The with-block closes the file even if loading fails
            with open(os.path.join(dir,mapname),'rb') as f:
                mapping = marshal.load(f)
        except (EOFError, ValueError, TypeError) as why:
            # marshal.load() signals unreadable data with any of these
            print('* conversion failed: %s' % why)
            continue
        if not mapping:
            print('* map is empty; skipping')
            continue
        try:
            pymap(mapname, mapping, prefix + codefile,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
|
||
|
|
||
|
if __name__ == '__main__':

    import sys
    # The command line arguments are passed on positionally, see the
    # synopsis in the module docstring.  Change the condition to 0 to
    # regenerate the codec modules from existing .mapping files
    # instead of parsing the mapping files again.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
|