# svn+ssh://pythondev@svn.python.org/python/branches/p3yk
#
# ........
#   r55077 | guido.van.rossum | 2007-05-02 11:54:37 -0700 (Wed, 02 May 2007) | 2 lines
#
#   Use the new print syntax, at least.
# ........
#   r55142 | fred.drake | 2007-05-04 21:27:30 -0700 (Fri, 04 May 2007) | 1 line
#
#   remove old cruftiness
# ........
#   r55143 | fred.drake | 2007-05-04 21:52:16 -0700 (Fri, 04 May 2007) | 1 line
#
#   make this work with the new Python
# ........
#   r55162 | neal.norwitz | 2007-05-06 22:29:18 -0700 (Sun, 06 May 2007) | 1 line
#
#   Get asdl code gen working with Python 2.3. Should continue to work with 3.0
# ........
#   r55164 | neal.norwitz | 2007-05-07 00:00:38 -0700 (Mon, 07 May 2007) | 1 line
#
#   Verify checkins to p3yk (sic) branch go to 3000 list.
# ........
#   r55166 | neal.norwitz | 2007-05-07 00:12:35 -0700 (Mon, 07 May 2007) | 1 line
#
#   Fix this test so it runs again by importing warnings_test properly.
# ........
#   r55167 | neal.norwitz | 2007-05-07 01:03:22 -0700 (Mon, 07 May 2007) | 8 lines
#
#   So long xrange. range() now supports values that are outside
#   -sys.maxint to sys.maxint. floats raise a TypeError.
#
#   This has been sitting for a long time. It probably has some problems and
#   needs cleanup. Objects/rangeobject.c now uses 4-space indents since
#   it is almost completely new.
# ........
#   r55171 | guido.van.rossum | 2007-05-07 10:21:26 -0700 (Mon, 07 May 2007) | 4 lines
#
#   Fix two tests that were previously depending on significant spaces
#   at the end of a line (and before that on Python 2.x print behavior
#   that has no exact equivalent in 3.0).
# ........
#
# 432 lines, 9.8 KiB, Python
import re, unicodedata, sys
# Refuse to run on a narrow build: sys.maxunicode == 65535 means code
# points above U+FFFF cannot be represented as one character, while this
# script examines code points up to 0x10FFFF.
if sys.maxunicode == 65535:
    # Call-style raise; the "raise Class, message" statement form is a
    # syntax error on Python 3 and behaves the same as this on Python 2.
    raise RuntimeError("need UCS-4 Python")

def gen_category(cats):
    """Yield each code point whose general category is listed in cats.

    cats is any container usable with ``in`` that holds category
    abbreviations as returned by unicodedata.category() (for example
    ["Cn"]).  Code points 0 through 0x10FFFF are examined in ascending
    order and the matching ones are yielded as integers.
    """
    # unichr() only exists on Python 2; on Python 3 chr() covers the same
    # range, so fall back to it instead of failing with NameError.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    for code_point in range(0x110000):
        if unicodedata.category(to_char(code_point)) in cats:
            yield code_point

def gen_bidirectional(cats):
    """Yield each code point whose bidirectional class is listed in cats.

    cats is any container usable with ``in`` that holds class names as
    returned by unicodedata.bidirectional() (for example ["R", "AL"]).
    Code points 0 through 0x10FFFF are examined in ascending order and
    the matching ones are yielded as integers.
    """
    # unichr() only exists on Python 2; on Python 3 chr() covers the same
    # range, so fall back to it instead of failing with NameError.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    for code_point in range(0x110000):
        if unicodedata.bidirectional(to_char(code_point)) in cats:
            yield code_point

def compact_set(l):
    """Return Python source text for a set containing the integers in l.

    Runs of consecutive integers are written as ``list(range(a,b))``
    terms and all other values as one list literal, which keeps the
    generated expression short.  Only neighbouring elements are compared,
    so runs are detected only when l is in ascending order.

    An empty l yields ``"set()"``.
    """
    singles = []        # values written out one by one
    ranges = []         # half-open (start, stop) pairs, one per run
    run_start = None    # first value of the run being collected
    run_extra = 0       # number of values following run_start in that run
    for value in l:
        if run_start is None:
            run_start = value
            run_extra = 0
            continue
        if run_start + run_extra + 1 == value:
            # value extends the current run
            run_extra += 1
            continue
        # value breaks the run: flush what was collected.  In the middle
        # of the input only runs of four or more values become a range.
        if run_extra > 2:
            ranges.append((run_start, run_start + run_extra + 1))
        else:
            singles.extend(range(run_start, run_start + run_extra + 1))
        run_start = value
        run_extra = 0
    # Flush the last run; here any run of two or more values becomes a
    # range.  With no input there is nothing to flush -- without this
    # guard None would be appended and "set([None])" generated.
    if run_start is not None:
        if run_extra:
            ranges.append((run_start, run_start + run_extra + 1))
        else:
            singles.append(run_start)
    range_terms = " + ".join(["list(range(%d,%d))" % pair for pair in ranges])
    if not singles:
        return "set(%s)" % range_terms
    if not range_terms:
        return "set(%s)" % repr(singles)
    return "set(%s + %s)" % (repr(singles), range_terms)

############## Read the tables in the RFC #######################

# Load all lines of the RFC text from the current directory.  The handle
# is closed explicitly rather than left to the garbage collector;
# try/finally is used because it needs no newer syntax than the rest of
# this script.
rfc_file = open("rfc3454.txt")
try:
    data = rfc_file.readlines()
finally:
    rfc_file.close()

# Collect the tables as (name, mapping) pairs in the order they appear.
# Entries holding a code point or a range map every code point to itself;
# "code; value" entries map the code point to a list of replacement code
# points, or to None when the value field is empty.
tables = []
curname = None      # name of the table being read; None between tables
# The dot in names such as "A.1" or "C.1.1" is escaped so that it matches
# only a literal "." (unescaped it accepted any character).
table_marker = re.compile(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----")
for l in data:
    l = l.strip()
    if not l:
        continue
    # Skip RFC page breaks
    if l.startswith("Hoffman & Blanchet") or l.startswith("RFC 3454"):
        continue
    # Find start/end lines
    m = table_marker.match(l)
    if m:
        # Real exception classes are raised below: string exceptions are
        # rejected by the interpreter from Python 2.6 on.
        if m.group(1) == "Start":
            if curname:
                raise ValueError("Double Start", (curname, l))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
        else:
            if not curname:
                raise ValueError("End without start", l)
            curname = None
        continue
    if not curname:
        continue
    # Now we are in a table
    fields = l.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        # A single code point, or a "start-end" range (both in hex).
        fields = fields[0].split("-")
        if len(fields) > 2:
            raise ValueError("Unpacking problem", l)
        start = int(fields[0], 16)
        end = int(fields[-1], 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value

########### Generate compact Python versions of the tables #############

# Everything below writes the generated module to standard output.
# print is always called with one parenthesised argument: that form is
# accepted by Python 3 and prints exactly the same text on Python 2.
print("""# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"

import unicodedata
""")

# The generated module asserts that it runs against the same Unicode
# database version that this generator saw.
print("assert unicodedata.unidata_version == %s" % repr(unicodedata.unidata_version))

# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name, table = tables.pop(0)
assert name == "A.1"
# table and Cn are only compared by the disabled assertion below.
table = set(table.keys())
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn.difference_update(range(0xFDD0, 0xFDF0))
# not a character
Cn.difference_update(range(0xFFFE, 0x110000, 0x10000))
Cn.difference_update(range(0xFFFF, 0x110000, 0x10000))

# assert table == Cn

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
""")

# B.1 cannot easily be derived
name, table = tables.pop(0)
assert name == "B.1"
# sorted() works whether keys() returns a list (Python 2) or a view
# (Python 3); a view has no sort() method.
table = sorted(table.keys())
# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
""")

# B.2 and B.3 is case folding.
# It takes CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

name, table_b2 = tables.pop(0)
assert name == "B.2"

name, table_b3 = tables.pop(0)
assert name == "B.3"

# unichr() only exists on Python 2; chr() is its Python 3 equivalent.
try:
    unichr
except NameError:
    unichr = chr

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.
# NOTE(review): the exceptions are collected from table_b2, not table_b3,
# and table_b3 is not read again in this part -- confirm this is intended.

b3_exceptions = {}

for k, v in table_b2.items():
    # list() keeps the comparison meaningful where map() returns an
    # iterator, which never compares equal to a list.
    if list(map(ord, unichr(k).lower())) != v:
        b3_exceptions[k] = u"".join(map(unichr, v))

b3 = sorted(b3_exceptions.items())

print("""
b3_exceptions = {""")
# Written through sys.stdout.write to reproduce the layout of the former
# comma-terminated print statements: four entries per row separated by a
# single space, and a space before the closing brace when the last row
# is incomplete.
for i, (k, v) in enumerate(b3):
    if i % 4:
        sys.stdout.write(" ")
    sys.stdout.write("0x%x:%s," % (k, repr(v)))
    if i % 4 == 3:
        sys.stdout.write("\n")
if len(b3) % 4:
    sys.stdout.write(" ")
sys.stdout.write("}\n")

print("""
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
""")

def map_table_b3(code):
    """Return the B.3 mapping of the single character code.

    The module-level b3_exceptions dict is consulted first, keyed by
    ord(code); when it has no entry the result is code.lower().
    """
    override = b3_exceptions.get(ord(code))
    if override is None:
        return code.lower()
    return override

# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    """Return the B.2 mapping of the character a.

    a is folded with map_table_b3() and NFKC-normalized; the characters
    of that result are folded and normalized once more.  If the second
    round changes the text, the second result is returned, otherwise the
    plain map_table_b3() folding of a.
    """
    folded = map_table_b3(a)
    normalized = unicodedata.normalize("NFKC", folded)
    refolded = u"".join(map(map_table_b3, normalized))
    renormalized = unicodedata.normalize("NFKC", refolded)
    if normalized == renormalized:
        return folded
    return renormalized

# unichr() only exists on Python 2; chr() is its Python 3 equivalent.
try:
    unichr
except NameError:
    unichr = chr

# Collect every B.2 entry that map_table_b2() fails to reproduce.
specials = {}
for k, v in table_b2.items():
    # list() keeps the comparison meaningful where map() returns an
    # iterator, which never compares equal to a list.
    if list(map(ord, map_table_b2(unichr(k)))) != v:
        specials[k] = v

# B.3 should not add any additional special cases
assert specials == {}

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = u"".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
""")

# C.1.1 is a table with a single character
name, table = tables.pop(0)
assert name == "C.1.1"
assert table == {0x20:0x20}

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
def in_table_c11(code):
    return code == u" "
""")

# C.1.2 is the rest of all space characters
name, table = tables.pop(0)
assert name == "C.1.2"

# The cross-check against category Zs is disabled:
# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - set([0x20])
# assert Zs == table

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != u" "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
""")

# C.2.1 ASCII control characters
name, table_c21 = tables.pop(0)
assert name == "C.2.1"

# The table must be exactly the category-Cc code points below 128.
Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc & set(range(128))
table_c21 = set(table_c21.keys())
assert Cc_ascii == table_c21

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
""")

# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables.pop(0)
assert name == "C.2.2"

# Every non-ASCII Cc code point must be in the table ...
Cc_nonascii = Cc - Cc_ascii
table_c22 = set(table_c22.keys())
assert Cc_nonascii <= table_c22

# ... and whatever the table holds beyond those is emitted as an
# explicit set.
specials = sorted(table_c22 - Cc_nonascii)

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
""")

# C.3 Private use
name, table = tables.pop(0)
assert name == "C.3"

# The table must be exactly the code points of category Co.
Co = set(gen_category(["Co"]))
assert set(table.keys()) == Co

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
""")

# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables.pop(0)
assert name == "C.4"

# list() around each range: range objects cannot be concatenated with +
# on Python 3, while on Python 2 the extra list() is a plain copy.
nonchar = set(list(range(0xFDD0, 0xFDF0)) +
              list(range(0xFFFE, 0x110000, 0x10000)) +
              list(range(0xFFFF, 0x110000, 0x10000)))
table = set(table.keys())
assert table == nonchar

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
""")

# C.5 Surrogate codes
name, table = tables.pop(0)
assert name == "C.5"

# The table must be exactly the code points of category Cs.
Cs = set(gen_category(["Cs"]))
assert set(table.keys()) == Cs

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
""")

# C.6 Inappropriate for plain text
name, table = tables.pop(0)
assert name == "C.6"

# Emitted verbatim as a set.  sorted() works whether keys() returns a
# list (Python 2) or a view (Python 3).
table = sorted(table.keys())

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
""")

# C.7 Inappropriate for canonical representation
name, table = tables.pop(0)
assert name == "C.7"

# Emitted verbatim as a set.  sorted() works whether keys() returns a
# list (Python 2) or a view (Python 3).
table = sorted(table.keys())

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
""")

# C.8 Change display properties or are deprecated
name, table = tables.pop(0)
assert name == "C.8"

# Emitted verbatim as a set.  sorted() works whether keys() returns a
# list (Python 2) or a view (Python 3).
table = sorted(table.keys())

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
""")

# C.9 Tagging characters
name, table = tables.pop(0)
assert name == "C.9"

# Emitted verbatim as a set.  sorted() works whether keys() returns a
# list (Python 2) or a view (Python 3).
table = sorted(table.keys())

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
""")

# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables.pop(0)
assert name == "D.1"

# The table must be exactly the code points of bidirectional class R or AL.
RandAL = set(gen_bidirectional(["R","AL"]))
assert set(table.keys()) == RandAL

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
""")

# D.2 Characters with bidirectional property "L"
name, table = tables.pop(0)
assert name == "D.2"

# The table must be exactly the code points of bidirectional class L.
L = set(gen_bidirectional(["L"]))
assert set(table.keys()) == L

# Single-argument print(...) is valid on Python 3 and prints the same
# text on Python 2.
print("""
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
""")