7037261: j.l.Character.isLowerCase/isUpperCase need to match the Unicode Standard

Updated j.l.Character.isLowerCase/isUpperCase

Reviewed-by: okutsu
This commit is contained in:
Xueming Shen 2011-04-28 20:18:57 -07:00
parent faa5e45f7a
commit 4463efaf61
14 changed files with 3147 additions and 63 deletions

View File

@ -356,6 +356,7 @@ JAVA_JAVA_java = \
java/util/regex/Matcher.java \
java/util/regex/MatchResult.java \
java/util/regex/ASCII.java \
java/util/regex/UnicodeProp.java \
java/util/regex/PatternSyntaxException.java \
java/util/prefs/Preferences.java \
java/util/prefs/AbstractPreferences.java \

View File

@ -345,30 +345,35 @@ $(GENSRCDIR)/java/lang/CharacterDataLatin1.java \
-template $(CHARACTERDATA)/CharacterDataLatin1.java.template \
-spec $(UNICODEDATA)/UnicodeData.txt \
-specialcasing $(UNICODEDATA)/SpecialCasing.txt \
-proplist $(UNICODEDATA)/PropList.txt \
-o $(GENSRCDIR)/java/lang/CharacterDataLatin1.java -string \
-usecharforbyte -latin1 8
$(BOOT_JAVA_CMD) -jar $(GENERATECHARACTER_JARFILE) -plane 0 \
-template $(CHARACTERDATA)/CharacterData00.java.template \
-spec $(UNICODEDATA)/UnicodeData.txt \
-specialcasing $(UNICODEDATA)/SpecialCasing.txt \
-proplist $(UNICODEDATA)/PropList.txt \
-o $(GENSRCDIR)/java/lang/CharacterData00.java -string \
-usecharforbyte 11 4 1
$(BOOT_JAVA_CMD) -jar $(GENERATECHARACTER_JARFILE) -plane 1 \
-template $(CHARACTERDATA)/CharacterData01.java.template \
-spec $(UNICODEDATA)/UnicodeData.txt \
-specialcasing $(UNICODEDATA)/SpecialCasing.txt \
-proplist $(UNICODEDATA)/PropList.txt \
-o $(GENSRCDIR)/java/lang/CharacterData01.java -string \
-usecharforbyte 11 4 1
$(BOOT_JAVA_CMD) -jar $(GENERATECHARACTER_JARFILE) -plane 2 \
-template $(CHARACTERDATA)/CharacterData02.java.template \
-spec $(UNICODEDATA)/UnicodeData.txt \
-specialcasing $(UNICODEDATA)/SpecialCasing.txt \
-proplist $(UNICODEDATA)/PropList.txt \
-o $(GENSRCDIR)/java/lang/CharacterData02.java -string \
-usecharforbyte 11 4 1
$(BOOT_JAVA_CMD) -jar $(GENERATECHARACTER_JARFILE) -plane 14 \
-template $(CHARACTERDATA)/CharacterData0E.java.template \
-spec $(UNICODEDATA)/UnicodeData.txt \
-specialcasing $(UNICODEDATA)/SpecialCasing.txt \
-proplist $(UNICODEDATA)/PropList.txt \
-o $(GENSRCDIR)/java/lang/CharacterData0E.java -string \
-usecharforbyte 11 4 1

View File

@ -73,11 +73,37 @@ class CharacterData00 extends CharacterData {
return props;
}
// Returns the extended property bits for ch. Only the low 16 bits of ch
// (the char cast) are used as the lookup offset. The LookupEx template
// command is expanded by GenerateCharacter into an access to the
// generated "B" table.
int getPropertiesEx(int ch) {
char offset = (char)ch;
int props = $$LookupEx(offset);
return props;
}
// Returns the character-type field of the primary property word for ch.
int getType(int ch) {
int props = getProperties(ch);
return (props & $$maskType);
}
// True if the Other_Lowercase bit is set in the extended property word of ch.
// The mask placeholder is filled in by GenerateCharacter.
boolean isOtherLowercase(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherLowercase) != 0;
}
// True if the Other_Uppercase bit is set in the extended property word of ch.
boolean isOtherUppercase(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherUppercase) != 0;
}
// True if the Other_Alphabetic bit is set in the extended property word of ch.
boolean isOtherAlphabetic(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherAlphabetic) != 0;
}
// True if the Ideographic bit is set in the extended property word of ch.
boolean isIdeographic(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskIdeographic) != 0;
}
boolean isJavaIdentifierStart(int ch) {
int props = getProperties(ch);
return ((props & $$maskIdentifierInfo) >= $$lowJavaStart);

View File

@ -72,11 +72,37 @@ class CharacterData01 extends CharacterData {
return props;
}
// Returns the extended property bits for ch. Only the low 16 bits of ch
// (the char cast) are used as the lookup offset. The LookupEx template
// command is expanded by GenerateCharacter into an access to the
// generated "B" table.
int getPropertiesEx(int ch) {
char offset = (char)ch;
int props = $$LookupEx(offset);
return props;
}
// Returns the character-type field of the primary property word for ch.
int getType(int ch) {
int props = getProperties(ch);
return (props & $$maskType);
}
// True if the Other_Lowercase bit is set in the extended property word of ch.
// The mask placeholder is filled in by GenerateCharacter.
boolean isOtherLowercase(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherLowercase) != 0;
}
// True if the Other_Uppercase bit is set in the extended property word of ch.
boolean isOtherUppercase(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherUppercase) != 0;
}
// True if the Other_Alphabetic bit is set in the extended property word of ch.
boolean isOtherAlphabetic(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherAlphabetic) != 0;
}
// True if the Ideographic bit is set in the extended property word of ch.
boolean isIdeographic(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskIdeographic) != 0;
}
boolean isJavaIdentifierStart(int ch) {
int props = getProperties(ch);
return ((props & $$maskIdentifierInfo) >= $$lowJavaStart);

View File

@ -66,11 +66,37 @@ class CharacterData02 extends CharacterData {
*/
int getProperties(int ch) {
char offset = (char)ch;
char offset = (char)ch;
int props = $$Lookup(offset);
return props;
}
// Returns the extended property bits for ch. Only the low 16 bits of ch
// (the char cast) are used as the lookup offset. The LookupEx template
// command is expanded by GenerateCharacter into an access to the
// generated "B" table.
int getPropertiesEx(int ch) {
char offset = (char)ch;
int props = $$LookupEx(offset);
return props;
}
// True if the Other_Lowercase bit is set in the extended property word of ch.
// The mask placeholder is filled in by GenerateCharacter.
boolean isOtherLowercase(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherLowercase) != 0;
}
// True if the Other_Uppercase bit is set in the extended property word of ch.
boolean isOtherUppercase(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherUppercase) != 0;
}
// True if the Other_Alphabetic bit is set in the extended property word of ch.
boolean isOtherAlphabetic(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherAlphabetic) != 0;
}
// True if the Ideographic bit is set in the extended property word of ch.
boolean isIdeographic(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskIdeographic) != 0;
}
int getType(int ch) {
int props = getProperties(ch);
return (props & $$maskType);

View File

@ -66,11 +66,37 @@ class CharacterData0E extends CharacterData {
*/
int getProperties(int ch) {
char offset = (char)ch;
char offset = (char)ch;
int props = $$Lookup(offset);
return props;
}
// Returns the extended property bits for ch. Only the low 16 bits of ch
// (the char cast) are used as the lookup offset. The LookupEx template
// command is expanded by GenerateCharacter into an access to the
// generated "B" table.
int getPropertiesEx(int ch) {
char offset = (char)ch;
int props = $$LookupEx(offset);
return props;
}
// True if the Other_Lowercase bit is set in the extended property word of ch.
// The mask placeholder is filled in by GenerateCharacter.
boolean isOtherLowercase(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherLowercase) != 0;
}
// True if the Other_Uppercase bit is set in the extended property word of ch.
boolean isOtherUppercase(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherUppercase) != 0;
}
// True if the Other_Alphabetic bit is set in the extended property word of ch.
boolean isOtherAlphabetic(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherAlphabetic) != 0;
}
// True if the Ideographic bit is set in the extended property word of ch.
boolean isIdeographic(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskIdeographic) != 0;
}
int getType(int ch) {
int props = getProperties(ch);
return (props & $$maskType);

View File

@ -67,11 +67,37 @@ class CharacterDataLatin1 extends CharacterData {
*/
int getProperties(int ch) {
char offset = (char)ch;
char offset = (char)ch;
int props = $$Lookup(offset);
return props;
}
// Returns the extended property bits for ch. Only the low 16 bits of ch
// (the char cast) are used as the lookup offset. The LookupEx template
// command is expanded by GenerateCharacter into an access to the
// generated "B" table.
int getPropertiesEx(int ch) {
char offset = (char)ch;
int props = $$LookupEx(offset);
return props;
}
// True if the Other_Lowercase bit is set in the extended property word of ch.
// The mask placeholder is filled in by GenerateCharacter.
boolean isOtherLowercase(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherLowercase) != 0;
}
// True if the Other_Uppercase bit is set in the extended property word of ch.
boolean isOtherUppercase(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherUppercase) != 0;
}
// True if the Other_Alphabetic bit is set in the extended property word of ch.
boolean isOtherAlphabetic(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskOtherAlphabetic) != 0;
}
// True if the Ideographic bit is set in the extended property word of ch.
boolean isIdeographic(int ch) {
int props = getPropertiesEx(ch);
return (props & $$maskIdeographic) != 0;
}
int getType(int ch) {
int props = getProperties(ch);
return (props & $$maskType);

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,3 @@
/*
* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@ -34,6 +33,7 @@ import java.io.PrintWriter;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.File;
import java.util.List;
import build.tools.generatecharacter.CharacterName;
@ -68,18 +68,17 @@ public class GenerateCharacter {
final static boolean DEBUG = false;
final static int MAX_UNICODE_VALUE = 0xFFFF;
final static String commandMarker = "$$";
static String ROOT = "";
static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt";
static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
static String DefaultPropListFileName = ROOT + "PropList.txt";
static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
static String DefaultJavaOutputFileName = ROOT + "Character.java";
static String DefaultCTemplateFileName = ROOT + "Character.c.template";
static String DefaultCOutputFileName = ROOT + "Character.c";
static String CharacterDataClassName = "CharacterData";
static int plane = 0;
static int plane = 0;
/* The overall idea is that, in the generated Character class source code,
most character property data is stored in a special multi-level table whose
@ -105,7 +104,11 @@ public class GenerateCharacter {
entries are short rather than byte).
*/
/* The character properties are currently encoded into 32 bits in the following manner:
/* The character properties are currently encoded in two parts, A (32 bits)
and B (16 bits).
A: the low 32 bits are defined in the following manner:
1 bit Mirrored property.
4 bits Bidirectional category (see below) (unused if -nobidi switch specified)
9 bits A signed offset used for converting case .
@ -148,6 +151,14 @@ public class GenerateCharacter {
will produce the desired numeric value.
5 bits The digit offset (see description of previous field)
5 bits Character type (see below)
B: the high 16 bits are defined as:
1 bit Other_Lowercase property
1 bit Other_Uppercase property
1 bit Other_Alphabetic property
1 bit Other_Math property
1 bit Ideographic property
1 bit Noncharacter codepoint property
*/
@ -173,9 +184,22 @@ public class GenerateCharacter {
// case offset are 9 bits
maskCase = 0x01FF,
shiftBidi = 27, maskBidi = 0x78000000,
shiftMirrored = 31, maskMirrored = 0x80000000,
shiftMirrored = 31, //maskMirrored = 0x80000000,
shiftPlane = 16, maskPlane = 0xFF0000;
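// maskMirrored must be a long: as an int, 0x80000000 is negative and would
// sign-extend into the upper 16 bits (part B) when combined with a long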
private static final long maskMirrored = 0x80000000L;
// bit masks identifying the 16-bit property field described above, in the
// B table
private static final long
maskOtherLowercase = 0x100000000L,
maskOtherUppercase = 0x200000000L,
maskOtherAlphabetic = 0x400000000L,
maskOtherMath = 0x800000000L,
maskIdeographic = 0x1000000000L,
maskNoncharacterCP = 0x2000000000L;
// Can compare masked values with these to determine
// numeric or lexical types.
public static int
@ -258,7 +282,7 @@ public class GenerateCharacter {
* The specification file is assumed to contain its data in sorted order by
* character code; as a result, the array passed as an argument to this method
* has its components in the same sorted order, with one entry for each defined
* Unicode character or character range. (A range is indicated by two consecutive
* Unicode character or character range. (A range is indicated by two consecutive
* entries, such that the name of the first entry begins with "<" and ends with
* "First>" and the second entry begins with "<" and ends with "Last>".) This is
* therefore a sparse representation of the character property data.
@ -282,7 +306,8 @@ public class GenerateCharacter {
* @see GenerateCharacter#buildOne
*/
static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps) {
static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList)
{
long[] result;
if (bLatin1 == true) {
result = new long[256];
@ -290,13 +315,13 @@ public class GenerateCharacter {
result = new long[1<<16];
}
int k=0;
int codePoint = plane<<16;
int codePoint = plane<<16;
UnicodeSpec nonCharSpec = new UnicodeSpec();
for (int j = 0; j < data.length && k < result.length; j++) {
if (data[j].codePoint == codePoint) {
result[k] = buildOne(codePoint, data[j], specialMaps);
++k;
++codePoint;
++codePoint;
}
else if(data[j].codePoint > codePoint) {
if (data[j].name.endsWith("Last>")) {
@ -304,7 +329,7 @@ public class GenerateCharacter {
while (codePoint < data[j].codePoint && k < result.length) {
result[k] = buildOne(codePoint, data[j], specialMaps);
++k;
++codePoint;
++codePoint;
}
}
else {
@ -312,15 +337,14 @@ public class GenerateCharacter {
while (codePoint < data[j].codePoint && k < result.length) {
result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
++k;
++codePoint;
++codePoint;
}
}
k = data[j].codePoint & 0xFFFF;
codePoint = data[j].codePoint;
codePoint = data[j].codePoint;
result[k] = buildOne(codePoint, data[j], specialMaps);
++k;
++codePoint;
++codePoint;
}
else {
System.out.println("An error has occured during spec mapping.");
@ -333,8 +357,17 @@ public class GenerateCharacter {
while (k < result.length) {
result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
++k;
++codePoint;
++codePoint;
}
// now add all extra supported properties from PropList to the
// upper 16 bits
addExProp(result, propList, "Other_Lowercase", maskOtherLowercase);
addExProp(result, propList, "Other_Uppercase", maskOtherUppercase);
addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic);
addExProp(result, propList, "Ideographic", maskIdeographic);
//addExProp(result, propList, "Other_Math", maskOtherMath);
//addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
return result;
}
@ -381,15 +414,15 @@ public class GenerateCharacter {
// record the general category
resultA |= us.generalCategory;
// record the numeric properties
NUMERIC: {
// record the numeric properties
NUMERIC: {
STRANGE: {
int val = 0;
// c is A-Z
// c is A-Z
if ((c >= 0x0041) && (c <= 0x005A)) {
val = c - 0x0041;
resultA |= valueJavaSupradecimal;
// c is a-z
// c is a-z
} else if ((c >= 0x0061) && (c <= 0x007A)) {
val = c - 0x0061;
resultA |= valueJavaSupradecimal;
@ -428,7 +461,7 @@ public class GenerateCharacter {
resultA |= valueStrangeNumeric;
} // end NUMERIC
// record case mapping
// record case mapping
int offset = 0;
// might have a 1:M mapping
int specialMap = SpecialCaseMap.find(c, specialCaseMaps);
@ -458,12 +491,12 @@ public class GenerateCharacter {
}
}
if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||
(bHasUpper && us.hasLowerMap())) {
(bHasUpper && us.hasLowerMap())) {
resultA |= maskTitleCase;
}
if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {
System.out.println("Warning: Character " + hex4(c) + " has upper but " +
"no title case; Java won't know this");
System.out.println("Warning: Character " + hex4(c) + " has upper but " +
"no title case; Java won't know this");
}
if (offset < minOffsetSeen) minOffsetSeen = offset;
if (offset > maxOffsetSeen) maxOffsetSeen = offset;
@ -475,8 +508,7 @@ public class GenerateCharacter {
}
resultA |= ((offset & maskCase) << shiftCaseOffset);
// record lexical info about this character
// record lexical info about this character
if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER
|| us.generalCategory == UnicodeSpec.UPPERCASE_LETTER
|| us.generalCategory == UnicodeSpec.TITLECASE_LETTER
@ -539,6 +571,16 @@ public class GenerateCharacter {
return resultA;
}
/**
 * ORs {@code mask} into the map entry of every code point that
 * {@code propList} reports for the property named {@code prop}.
 * Code points at or beyond {@code map.length} are ignored, and a
 * property with no code-point list leaves the map untouched.
 *
 * @param map      property words, indexed by code point offset
 * @param propList parsed PropList data to take the code points from
 * @param prop     name of the property to look up
 * @param mask     bit(s) to set on each matching entry
 */
static void addExProp(long[] map, PropList propList, String prop, long mask) {
    List<Integer> codePoints = propList.codepoints(prop);
    if (codePoints == null) {
        return;
    }
    final int limit = map.length;
    for (int i = 0; i < codePoints.size(); i++) {
        int index = codePoints.get(i);
        if (index < limit) {
            map[index] |= mask;
        }
    }
}
/**
* This is the heart of the table compression strategy. The inputs are a map
* and a number of bits (size). The map is simply an array of long integer values;
@ -645,8 +687,8 @@ OUTER: for (int i = 0; i < n; i += m) {
*/
static void generateCharacterClass(String theTemplateFileName,
String theOutputFileName)
throws FileNotFoundException, IOException {
String theOutputFileName)
throws FileNotFoundException, IOException {
BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName));
PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName)));
out.println(commentStart +
@ -719,6 +761,9 @@ OUTER: for (int i = 0; i < n; i += m) {
if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") &&
x.substring(x.length()-1).equals(")") )
return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") &&
x.substring(x.length()-1).equals(")") )
return genAccess("B", x.substring(9, x.length()-1), 16);
if (x.equals("shiftType")) return Long.toString(shiftType);
if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
@ -731,6 +776,10 @@ OUTER: for (int i = 0; i < n; i += m) {
if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);
if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
@ -899,7 +948,7 @@ OUTER: for (int i = 0; i < n; i += m) {
// The character properties need more than 32 bits, so a table "B" holding
// the 16 bits above the low 32 is generated as well.
// genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
result.append(commentStart);
@ -1080,9 +1129,9 @@ OUTER: for (int i = 0; i < n; i += m) {
*/
static void genTable(StringBuffer result, String name,
long[] table, int extract, int bits, int size,
boolean preshifted, int shift, boolean hexFormat,
boolean properties, boolean hexComment) {
long[] table, int extract, int bits, int size,
boolean preshifted, int shift, boolean hexFormat,
boolean properties, boolean hexComment) {
String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
bits == 2 ? (Csyntax ? "unsigned long" : "int") :
@ -1137,7 +1186,12 @@ OUTER: for (int i = 0; i < n; i += m) {
char ch = '\u0000';
int charsPerEntry = -entriesPerChar;
for (int j=0; j<table.length; ++j) {
long entry = table[j] >> extract;
//long entry = table[j] >> extract;
long entry;
if ("A".equals(name))
entry = (table[j] & 0xffffffffL) >> extract;
else
entry = (table[j] >> extract);
if (shiftEntries) entry <<= shift;
if (entry >= (1L << bits)) {
FAIL("Entry too big");
@ -1549,6 +1603,7 @@ OUTER: for (int i = 0; i < n; i += m) {
static String OutputFileName = null;
static String UnicodeSpecFileName = null; // liu
static String SpecialCasingFileName = null;
static String PropListFileName = null;
static boolean useCharForByte = false;
static int[] sizes;
static int bins = 0; // liu; if > 0, then perform search
@ -1668,20 +1723,28 @@ OUTER: for (int i = 0; i < n; i += m) {
SpecialCasingFileName = args[++j];
}
}
else if (args[j].equals("-plane")) {
if (j == args.length -1) {
FAIL("Plane number missing after -plane");
}
else {
plane = Integer.parseInt(args[++j]);
}
if (plane > 0) {
bLatin1 = false;
}
}
else if ("-usecharforbyte".equals(args[j])) {
useCharForByte = true;
}
else if (args[j].equals("-proplist")) {
if (j == args.length -1) {
FAIL("File name missing after -proplist");
}
else {
PropListFileName = args[++j];
}
}
else if (args[j].equals("-plane")) {
if (j == args.length -1) {
FAIL("Plane number missing after -plane");
}
else {
plane = Integer.parseInt(args[++j]);
}
if (plane > 0) {
bLatin1 = false;
}
}
else if ("-usecharforbyte".equals(args[j])) {
useCharForByte = true;
}
else if (args[j].equals("-latin1")) {
bLatin1 = true;
plane = 0;
@ -1728,6 +1791,10 @@ OUTER: for (int i = 0; i < n; i += m) {
SpecialCasingFileName = DefaultSpecialCasingFileName;
desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
}
if (PropListFileName == null) {
PropListFileName = DefaultPropListFileName;
desc.append(" [-proplist " + PropListFileName + ']');
}
if (TemplateFileName == null) {
TemplateFileName = (Csyntax ? DefaultCTemplateFileName
: DefaultJavaTemplateFileName);
@ -1877,12 +1944,13 @@ OUTER: for (int i = 0; i < n; i += m) {
try {
UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
if (verbose) {
System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
}
long[] map = buildMap(data, specialCaseMaps);
long[] map = buildMap(data, specialCaseMaps, propList);
if (verbose) {
System.err.println("Completed building of initial map");
}

View File

@ -0,0 +1,102 @@
/*
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package build.tools.generatecharacter;
import java.util.regex.*;
import java.util.*;
import java.io.*;
/**
 * A PropList object contains the lists of code points that have
 * the same Unicode property defined in PropList.txt, restricted to
 * one Unicode plane. Code points are stored as 16-bit offsets within
 * that plane, in the order they appear in the file.
 *
 * @author Xueming Shen
 */
public class PropList {

    /** One data line: {@code XXXX[..YYYY] ; Property_Name # comment}. */
    private static final Pattern DATA_LINE = Pattern.compile(
        "(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s*;\\s+(\\w+)\\s+#.*");

    /** Property name to in-plane offsets, in order of first appearance. */
    private final Map<String, ArrayList<Integer>> propMap =
        new LinkedHashMap<String, ArrayList<Integer>>();

    /**
     * Parses a PropList.txt style file and keeps the entries of one plane.
     *
     * @param file  the property list file to read
     * @param plane the Unicode plane number whose code points are kept
     * @return the parsed property lists
     * @throws IOException if the file cannot be read
     */
    public static PropList readSpecFile(File file, int plane)
        throws IOException
    {
        return new PropList(file, plane);
    }

    /**
     * Returns the in-plane offsets (code point &amp; 0xffff) recorded for
     * the named property, or {@code null} if the property has no entry
     * in this plane.
     */
    public List<Integer> codepoints(String name) {
        return propMap.get(name);
    }

    /** Returns the names of the properties that have entries in this plane. */
    public Set<String> names() {
        return propMap.keySet();
    }

    private PropList(File file, int plane) throws IOException {
        final int planeFirst = plane << 16;
        final int planeLast = planeFirst | 0xffff;
        Matcher m = DATA_LINE.matcher("");
        BufferedReader sbfr = new BufferedReader(new FileReader(file));
        // try/finally rather than try-with-resources: this build tool is
        // otherwise written without Java 7 language features.
        try {
            String line;
            int lineNo = 0;
            while ((line = sbfr.readLine()) != null) {
                lineNo++;
                if (line.length() <= 1 || line.charAt(0) == '#') {
                    continue;
                }
                m.reset(line);
                if (!m.matches()) {
                    System.out.printf("Warning: Unrecognized line %d <%s>%n", lineNo, line);
                    continue;
                }
                int start = Integer.parseInt(m.group(1), 16);
                int end = (m.group(2) == null) ? start
                                               : Integer.parseInt(m.group(2), 16);
                // Intersect the range with the requested plane BEFORE masking,
                // so a range that crosses a plane boundary contributes exactly
                // its in-plane part instead of being dropped or emptied.
                int lo = Math.max(start, planeFirst);
                int hi = Math.min(end, planeLast);
                if (lo > hi) {
                    continue;
                }
                String name = m.group(3);
                ArrayList<Integer> list = propMap.get(name);
                if (list == null) {
                    list = new ArrayList<Integer>();
                    propMap.put(name, list);
                }
                for (int cp = lo; cp <= hi; cp++) {
                    list.add(cp & 0xffff);
                }
            }
        } finally {
            sbfr.close();
        }
    }

    /**
     * Command-line check: parses the file named by {@code args[0]} for the
     * plane given in {@code args[1]}.
     */
    public static void main(String[] args) throws IOException {
        readSpecFile(new File(args[0]), Integer.decode(args[1]));
    }
}

View File

@ -59,14 +59,14 @@ import java.util.Locale;
* <p>The {@code char} data type (and therefore the value that a
* {@code Character} object encapsulates) are based on the
* original Unicode specification, which defined characters as
* fixed-width 16-bit entities. The Unicode standard has since been
* fixed-width 16-bit entities. The Unicode Standard has since been
* changed to allow for characters whose representation requires more
* than 16 bits. The range of legal <em>code point</em>s is now
* U+0000 to U+10FFFF, known as <em>Unicode scalar value</em>.
* (Refer to the <a
* href="http://www.unicode.org/reports/tr27/#notation"><i>
* definition</i></a> of the U+<i>n</i> notation in the Unicode
* standard.)
* Standard.)
*
* <p><a name="BMP">The set of characters from U+0000 to U+FFFF is
* sometimes referred to as the <em>Basic Multilingual Plane (BMP)</em>.
@ -5200,7 +5200,8 @@ class Character implements java.io.Serializable, Comparable<Character> {
* <p>
* A character is lowercase if its general category type, provided
* by {@code Character.getType(ch)}, is
* {@code LOWERCASE_LETTER}.
* {@code LOWERCASE_LETTER}, or it has contributory property
* Other_Lowercase as defined by the Unicode Standard.
* <p>
* The following are examples of lowercase characters:
* <p><blockquote><pre>
@ -5235,7 +5236,8 @@ class Character implements java.io.Serializable, Comparable<Character> {
* <p>
* A character is lowercase if its general category type, provided
* by {@link Character#getType getType(codePoint)}, is
* {@code LOWERCASE_LETTER}.
* {@code LOWERCASE_LETTER}, or it has contributory property
* Other_Lowercase as defined by the Unicode Standard.
* <p>
* The following are examples of lowercase characters:
* <p><blockquote><pre>
@ -5257,7 +5259,8 @@ class Character implements java.io.Serializable, Comparable<Character> {
* @since 1.5
*/
public static boolean isLowerCase(int codePoint) {
return getType(codePoint) == Character.LOWERCASE_LETTER;
return getType(codePoint) == Character.LOWERCASE_LETTER ||
CharacterData.of(codePoint).isOtherLowercase(codePoint);
}
/**
@ -5265,6 +5268,7 @@ class Character implements java.io.Serializable, Comparable<Character> {
* <p>
* A character is uppercase if its general category type, provided by
* {@code Character.getType(ch)}, is {@code UPPERCASE_LETTER},
* or it has contributory property Other_Uppercase as defined by the Unicode Standard.
* <p>
* The following are examples of uppercase characters:
* <p><blockquote><pre>
@ -5298,7 +5302,8 @@ class Character implements java.io.Serializable, Comparable<Character> {
* Determines if the specified character (Unicode code point) is an uppercase character.
* <p>
* A character is uppercase if its general category type, provided by
* {@link Character#getType(int) getType(codePoint)}, is {@code UPPERCASE_LETTER}.
* {@link Character#getType(int) getType(codePoint)}, is {@code UPPERCASE_LETTER},
* or it has contributory property Other_Uppercase as defined by the Unicode Standard.
* <p>
* The following are examples of uppercase characters:
* <p><blockquote><pre>
@ -5320,7 +5325,8 @@ class Character implements java.io.Serializable, Comparable<Character> {
* @since 1.5
*/
public static boolean isUpperCase(int codePoint) {
return getType(codePoint) == Character.UPPERCASE_LETTER;
return getType(codePoint) == Character.UPPERCASE_LETTER ||
CharacterData.of(codePoint).isOtherUppercase(codePoint);
}
/**
@ -5724,6 +5730,52 @@ class Character implements java.io.Serializable, Comparable<Character> {
return isJavaIdentifierPart(ch);
}
/**
 * Determines if the specified character (Unicode code point) is alphabetic.
 * <p>
 * A character is considered to be alphabetic if its general category type,
 * provided by {@link Character#getType(int) getType(codePoint)}, is any of
 * the following:
 * <ul>
 * <li> <code>UPPERCASE_LETTER</code>
 * <li> <code>LOWERCASE_LETTER</code>
 * <li> <code>TITLECASE_LETTER</code>
 * <li> <code>MODIFIER_LETTER</code>
 * <li> <code>OTHER_LETTER</code>
 * <li> <code>LETTER_NUMBER</code>
 * </ul>
 * or it has contributory property Other_Alphabetic as defined by the
 * Unicode Standard.
 *
 * @param   codePoint the character (Unicode code point) to be tested.
 * @return  <code>true</code> if the character is a Unicode alphabetic
 *          character, <code>false</code> otherwise.
 * @since   1.7
 */
public static boolean isAlphabetic(int codePoint) {
    // One bit per general category that counts as alphabetic on its own.
    final int letterCategories =
        (1 << Character.UPPERCASE_LETTER) |
        (1 << Character.LOWERCASE_LETTER) |
        (1 << Character.TITLECASE_LETTER) |
        (1 << Character.MODIFIER_LETTER) |
        (1 << Character.OTHER_LETTER) |
        (1 << Character.LETTER_NUMBER);
    if (((letterCategories >> getType(codePoint)) & 1) != 0) {
        return true;
    }
    // Otherwise only the Other_Alphabetic contributory property qualifies.
    return CharacterData.of(codePoint).isOtherAlphabetic(codePoint);
}
/**
 * Determines if the specified character (Unicode code point) is a CJKV
 * (Chinese, Japanese, Korean and Vietnamese) ideograph, as defined by
 * the Unicode Standard.
 *
 * @param   codePoint the character (Unicode code point) to be tested.
 * @return  <code>true</code> if the character is a Unicode ideograph
 *          character, <code>false</code> otherwise.
 * @since   1.7
 */
public static boolean isIdeographic(int codePoint) {
    // Delegate to the property data selected for this code point.
    CharacterData data = CharacterData.of(codePoint);
    return data.isIdeographic(codePoint);
}
/**
* Determines if the specified character is
* permissible as the first character in a Java identifier.
@ -6430,7 +6482,7 @@ class Character implements java.io.Serializable, Comparable<Character> {
/**
* Determines if the specified character is a Unicode space character.
* A character is considered to be a space character if and only if
* it is specified to be a space character by the Unicode standard. This
* it is specified to be a space character by the Unicode Standard. This
* method returns true if the character's general category type is any of
* the following:
* <ul>
@ -6458,7 +6510,7 @@ class Character implements java.io.Serializable, Comparable<Character> {
* Determines if the specified character (Unicode code point) is a
* Unicode space character. A character is considered to be a
* space character if and only if it is specified to be a space
* character by the Unicode standard. This method returns true if
* character by the Unicode Standard. This method returns true if
* the character's general category type is any of the following:
*
* <ul>
@ -6908,7 +6960,7 @@ class Character implements java.io.Serializable, Comparable<Character> {
* @since 1.4
*/
static char[] toUpperCaseCharArray(int codePoint) {
// As of Unicode 4.0, 1:M uppercasings only happen in the BMP.
// As of Unicode 6.0, 1:M uppercasings only happen in the BMP.
assert isBmpCodePoint(codePoint);
return CharacterData.of(codePoint).toUpperCaseCharArray(codePoint);
}
@ -6941,7 +6993,7 @@ class Character implements java.io.Serializable, Comparable<Character> {
* Note: if the specified character is not assigned a name by
* the <i>UnicodeData</i> file (part of the Unicode Character
* Database maintained by the Unicode Consortium), the returned
* name is the same as the result of expression
* name is the same as the result of expression.
*
* <blockquote>{@code
* Character.UnicodeBlock.of(codePoint).toString().replace('_', ' ')

View File

@ -46,10 +46,27 @@ abstract class CharacterData {
int toUpperCaseEx(int ch) {
return toUpperCase(ch);
}
char[] toUpperCaseCharArray(int ch) {
return null;
}
// Default: no character has the Other_Lowercase property. The generated
// CharacterData subclasses that carry extended property data override this.
boolean isOtherLowercase(int ch) {
return false;
}
// Default: no character has the Other_Uppercase property.
boolean isOtherUppercase(int ch) {
return false;
}
// Default: no character has the Other_Alphabetic property.
boolean isOtherAlphabetic(int ch) {
return false;
}
// Default: no character has the Ideographic property.
boolean isIdeographic(int ch) {
return false;
}
// Character <= 0xff (basic latin) is handled by internal fast-path
// to avoid initializing large tables.
// Note: performance of this "fast-path" code may be sub-optimal

View File

@ -0,0 +1,117 @@
/*
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 7037261
* @summary Check j.l.Character.isLowerCase/isUpperCase/isAlphabetic/isIdeographic
*/
import java.util.regex.*;
import java.util.*;
import java.io.*;
import static java.lang.Character.*;
/**
 * Cross-checks Character.isLowerCase/isUpperCase/isAlphabetic/isIdeographic
 * for every code point against the general category reported by
 * Character.getType plus the contributory properties listed in the
 * PropList.txt found in the test source directory.
 */
public class CheckProp {

    /** One PropList.txt data line: {@code XXXX[..YYYY] ; Property_Name # comment}. */
    private static final Pattern DATA_LINE = Pattern.compile(
        "(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s*;\\s+(\\w+)\\s+#.*");

    /**
     * Runs the check.
     *
     * @throws IOException      if PropList.txt cannot be read
     * @throws RuntimeException if a required property is absent from the
     *                          file or any code point is misclassified
     */
    public static void main(String[] args) throws IOException {
        File fPropList = new File(System.getProperty("test.src", "."), "PropList.txt");
        Map<String, BitSet> propMap = readPropList(fPropList);

        BitSet otherLowercase = requireProp(propMap, "Other_Lowercase");
        BitSet otherUppercase = requireProp(propMap, "Other_Uppercase");
        BitSet otherAlphabetic = requireProp(propMap, "Other_Alphabetic");
        BitSet ideographic = requireProp(propMap, "Ideographic");

        int fails = 0;
        // Inclusive upper bound: MAX_CODE_POINT (U+10FFFF) is itself a valid
        // code point and must be checked too.
        for (int cp = MIN_CODE_POINT; cp <= MAX_CODE_POINT; cp++) {
            int type = getType(cp);
            if (isLowerCase(cp) !=
                (type == LOWERCASE_LETTER || otherLowercase.get(cp)))
            {
                fails++;
                System.err.printf("Wrong isLowerCase(U+%04x)\n", cp);
            }
            if (isUpperCase(cp) !=
                (type == UPPERCASE_LETTER || otherUppercase.get(cp)))
            {
                fails++;
                System.err.printf("Wrong isUpperCase(U+%04x)\n", cp);
            }
            if (isAlphabetic(cp) !=
                (type == UPPERCASE_LETTER || type == LOWERCASE_LETTER ||
                 type == TITLECASE_LETTER || type == MODIFIER_LETTER ||
                 type == OTHER_LETTER || type == LETTER_NUMBER ||
                 otherAlphabetic.get(cp)))
            {
                fails++;
                System.err.printf("Wrong isAlphabetic(U+%04x)\n", cp);
            }
            if (isIdeographic(cp) != ideographic.get(cp)) {
                fails++;
                System.err.printf("Wrong isIdeographic(U+%04x)\n", cp);
            }
        }
        if (fails != 0)
            throw new RuntimeException("CheckProp failed=" + fails);
    }

    /**
     * Reads a PropList.txt style file into one code-point set per property.
     * A BitSet gives order-independent membership tests without boxing.
     */
    private static Map<String, BitSet> readPropList(File file) throws IOException {
        Map<String, BitSet> propMap = new LinkedHashMap<>();
        Matcher m = DATA_LINE.matcher("");
        // try-with-resources so the reader is closed even if parsing throws
        try (BufferedReader sbfr = new BufferedReader(new FileReader(file))) {
            String line;
            int lineNo = 0;
            while ((line = sbfr.readLine()) != null) {
                lineNo++;
                if (line.length() <= 1 || line.charAt(0) == '#') {
                    continue;
                }
                m.reset(line);
                if (!m.matches()) {
                    System.out.printf("Warning: Unrecognized line %d <%s>%n", lineNo, line);
                    continue;
                }
                int start = Integer.parseInt(m.group(1), 16);
                int end = (m.group(2) == null) ? start
                                               : Integer.parseInt(m.group(2), 16);
                String name = m.group(3);
                BitSet bits = propMap.get(name);
                if (bits == null) {
                    bits = new BitSet();
                    propMap.put(name, bits);
                }
                if (start <= end) {
                    bits.set(start, end + 1);   // toIndex is exclusive
                }
            }
        }
        return propMap;
    }

    /** Returns the set for {@code name}, failing clearly if the file lacks it. */
    private static BitSet requireProp(Map<String, BitSet> propMap, String name) {
        BitSet bits = propMap.get(name);
        if (bits == null) {
            throw new RuntimeException("PropList.txt has no entries for " + name);
        }
        return bits;
    }
}

File diff suppressed because it is too large Load Diff