Moved gencodec.py to the Tools/unicode/ directory.

Added new support for decoding tables. Cleaned up the implementation a bit.
2005-10-21 13:45:17 +00:00 · 2005-10-21 13:45:17 +00:00 · c5694c8bf4
commit c5694c8bf4
parent 3144130217
1 changed files with 183 additions and 92 deletions
--- a/Tools/unicode/gencodec.py
+++ b/Tools/unicode/gencodec.py
@ -15,17 +15,22 @@ lowercase with hyphens replaced by underscores.
 The tool also writes marshalled versions of the mapping tables to the
 same location (with .mapping extension).
-Written by Marc-Andre Lemburg (mal@lemburg.com).
+Written by Marc-Andre Lemburg (mal@lemburg.com).  Modified to generate
 Unicode table maps for decoding.
 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
 (c) Copyright Guido van Rossum, 2000.
 (c) Copyright Marc-Andre Lemburg, 2005.
 """#"
-import re,os,time,marshal
+import re, os, time, marshal, codecs
-# Create numeric tables or character based ones ?
+# Maximum allowed size of charmap tables
-numeric = 1
+MAX_TABLE_SIZE = 8192
 # Standard undefined Unicode code point
 UNI_UNDEFINED = unichr(0xFFFE)
 mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
                   '\s+'
@ -69,8 +74,15 @@ def readmap(filename):
    enc2uni = {}
    identity = []
    unmapped = range(256)
-    for i in range(256):
+
-        unmapped[i] = i
+    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in range(32) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')
    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
@ -82,22 +94,23 @@ def readmap(filename):
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
-        if not comment:
+        if comment is None:
            comment = ''
        else:
-            comment = comment[1:]
+            comment = comment[1:].strip()
        if enc < 256:
-            unmapped.remove(enc)
+            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
-            else:
+            enc2uni[enc] = (uni,comment)
                enc2uni[enc] = (uni,comment)
        else:
            enc2uni[enc] = (uni,comment)
    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
-    if len(identity)>=len(unmapped):
+    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256
@ -112,44 +125,146 @@ def hexrepr(t):
        len(t)
    except:
        return '0x%04x' % t
-    return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
+    try:
        return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
    except TypeError, why:
        print '* failed to convert %r: %s' % (t, why)
        raise
-def unicoderepr(t):
+def python_mapdef_code(varname, map, comments=1):
-    if t is None:
+    l = []
-        return 'None'
+    append = l.append
-    if numeric:
+    if map.has_key("IDENTITY"):
-        return hexrepr(t)
+        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
-        try:
+        append("%s = {" % varname)
-            len(t)
+        splits = 0
-        except:
+        identity = 0
            return repr(unichr(t))
        return repr(''.join(map(unichr, t)))
-def keyrepr(t):
+    mappings = map.items()
-
+    mappings.sort()
-    if t is None:
+    i = 0
-        return 'None'
+    for mapkey, mapvalue in mappings:
-    if numeric:
+        mapcomment = ''
-        return hexrepr(t)
+        if isinstance(mapkey, tuple):
-    else:
+            (mapkey, mapcomment) = mapkey
-        try:
+        if isinstance(mapvalue, tuple):
-            len(t)
+            (mapvalue, mapcomment) = mapvalue
-        except:
+        if mapkey is None:
-            if t < 256:
+            continue
-                return repr(chr(t))
+        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey)
        value = hexrepr(mapvalue)
        if mapcomment and comments:
            append('    %s: %s,\t#  %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
-                return repr(unichr(t))
+                append('})')
-        return repr(''.join(map(chr, t)))
+            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')
-def codegen(name,map,comments=1):
+    return l
 def python_tabledef_code(varname, map, comments=1):
    """ Returns a list of Python source lines which define varname as
        a parenthesized run of unicode character literals, one literal
        per code point from 0 up to the largest key found in map.

        The literals are written without separating commas.  Code
        points missing from map, or mapped to None, are emitted as
        UNI_UNDEFINED.

        Returns None instead of a list if the largest key exceeds
        MAX_TABLE_SIZE or if any entry maps to a tuple (1-n mapping).

        Each entry is annotated with "key -> comment" if comments is
        true (default) and a comment is available for that entry.

        Side effect: an 'IDENTITY' entry, if present, is deleted from
        the map that was passed in.
    """
    l = []
    append = l.append
    append('%s = (' % varname)
    # Analyze map and create table dict
    # NOTE(review): mappings is captured before 'IDENTITY' is deleted
    # from map below, so that pair would still be iterated by the loop
    # further down -- confirm callers never reach this function with
    # 'IDENTITY' still present in map.
    mappings = map.items()
    mappings.sort()
    table = {}
    maxkey = 0
    if map.has_key('IDENTITY'):
        # Pre-fill the first 256 code points with identity mappings;
        # explicit entries processed below overwrite these.
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        # Keys and values may each be given as (item, comment) tuples;
        # if both are, the comment taken from the value wins.
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None
    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            # Hole in the mapping: emit the undefined marker
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append('    %r\t#  %s -> %s' % (mapchar,
                                            hexrepr(key),
                                            mapcomment))
        else:
            append('    %r' % mapchar)
    append(')')
    return l
 def codegen(name, map, comments=1):
    """ Returns Python source for the given map.
        Comments are included in the source, if comments is true (default).
    """
    # Generate code
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments)
    l = [
        '''\
 """ Python Character Mapping Codec generated from '%s' with gencodec.py.
@ -167,9 +282,16 @@ class Codec(codecs.Codec):
        return codecs.charmap_encode(input,errors,encoding_map)
    def decode(self,input,errors='strict'):
-
+''' % name
-        return codecs.charmap_decode(input,errors,decoding_map)
+        ]
-
+    if decoding_table_code:
        l.append('''\
        return codecs.charmap_decode(input,errors,decoding_table)''')
    else:
        l.append('''\
        return codecs.charmap_decode(input,errors,decoding_map)''')
    l.append('''    
 class StreamWriter(Codec,codecs.StreamWriter):
    pass
@ -183,54 +305,21 @@ def getregentry():
    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
 ### Decoding Map
 ''' % name,
        ]
    if map.has_key("IDENTITY"):
        l.append("decoding_map = codecs.make_identity_dict(range(%d))"
                 % map["IDENTITY"])
        l.append("decoding_map.update({")
        splits = 1
        del map["IDENTITY"]
    else:
        l.append("decoding_map = {")
        splits = 0
    mappings = map.items()
    mappings.sort()
    append = l.append
    i = 0
    for e,value in mappings:
        try:
            (u,c) = value
        except TypeError:
            u = value
            c = ''
        key = keyrepr(e)
        if c and comments:
            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
        else:
            append('\t%s: %s,' % (key,unicoderepr(u)))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('decoding_map.update({')
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')
    append('''
 ### Encoding Map
 encoding_map = codecs.make_encoding_map(decoding_map)
 ''')
    l.extend(decoding_map_code)
    # Add optional decoding table
    if decoding_table_code:
        l.append('''
 ### Decoding Table
 ''')
        l.extend(decoding_table_code)
    l.append('''
 ### Encoding Map
 ''')
    l.extend(encoding_map_code)
    return '\n'.join(l)
 def pymap(name,map,pyfile,comments=1):
@ -253,6 +342,7 @@ def convertdir(dir,prefix='',comments=1):
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir, mapname)
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
@ -267,10 +357,11 @@ def convertdir(dir,prefix='',comments=1):
            if not map:
                print '* map is empty; skipping'
            else:
-                pymap(mapname, map, prefix + codefile,comments)
+                pymap(mappathname, map, prefix + codefile,comments)
-                marshalmap(mapname, map, prefix + marshalfile)
+                marshalmap(mappathname, map, prefix + marshalfile)
-        except ValueError:
+        except ValueError, why:
-            print '* conversion failed'
+            print '* conversion failed: %s' % why
            raise
 def rewritepythondir(dir,prefix='',comments=1):