8301971: Make JDK source code UTF-8

8338973: Document need to have UTF-8 locale available to build the JDK

Reviewed-by: erikj, naoto, mbaesken
This commit is contained in:
Magnus Ihse Bursie 2025-05-09 09:05:10 +00:00
parent 74e981e855
commit 3aa2ea7e67
13 changed files with 72 additions and 128 deletions

View File

@ -1,5 +1,8 @@
root = true
[*]
charset = utf-8
[*.{cpp,hpp,c,h,java,cc,hh,m,mm,S,md,properties,gmk,m4,ac}]
trim_trailing_whitespace = true

1
.gitattributes vendored
View File

@ -1,4 +1,5 @@
* -text
* encoding=utf-8
*.java diff=java
*.c diff=cpp
*.h diff=cpp

View File

@ -305,6 +305,14 @@ using
<li><p>If using <a href="#cygwin">Cygwin</a>, you must make sure the
file permissions and attributes between Windows and Cygwin are
consistent. It is recommended that you follow this procedure:</p>
<li><p>UTF-8 support is needed to compile the JDK. On Unix systems, this
typically means that the <code>C.UTF-8</code> or
<code>en_US.UTF-8</code> locale needs to be available. For Windows
users, please see the section on <a href="#locale-requirements">Locale
Requirements</a> below.</p></li>
<li><p>On Windows, if using <a href="#cygwin">Cygwin</a>, extra care
must be taken to make sure the environment is consistent. It is
recommended that you follow this procedure:</p>
<ul>
<li><p>Create the directory that is going to contain the top directory
of the JDK clone by using the <code>mkdir</code> command in the Cygwin

View File

@ -83,6 +83,11 @@ on where and how to check out the source code.
for the source code, see below for suggestions on how to keep the build
artifacts on a local disk.
* UTF-8 support is needed to compile the JDK. On Unix systems, this typically
means that the `C.UTF-8` or `en_US.UTF-8` locale needs to be available. For
Windows users, please see the section on [Locale
Requirements](#locale-requirements) below.
* On Windows, extra care must be taken to have a smooth building experience:
* Make sure that all relevant paths have short names. Short names are used by

View File

@ -96,14 +96,14 @@ JAVADOC_DISABLED_DOCLINT_PACKAGES := org.w3c.* javax.smartcardio
# The initial set of options for javadoc
JAVADOC_OPTIONS := -use -keywords -notimestamp \
-serialwarn -encoding ISO-8859-1 -docencoding UTF-8 -breakiterator \
-serialwarn -encoding utf-8 -docencoding utf-8 -breakiterator \
-splitIndex --system none -javafx --expand-requires transitive \
--override-methods=summary --syntax-highlight
# The reference options must stay stable to allow for comparisons across the
# development cycle.
REFERENCE_OPTIONS := -XDignore.symbol.file=true -use -keywords -notimestamp \
-serialwarn -encoding ISO-8859-1 -breakiterator -splitIndex --system none \
-serialwarn -encoding utf-8 -breakiterator -splitIndex --system none \
-html5 -javafx --expand-requires transitive
# Should we add DRAFT stamps to the generated javadoc?

View File

@ -134,17 +134,33 @@ AC_DEFUN_ONCE([BASIC_SETUP_BUILD_ENV],
)
AC_SUBST(BUILD_ENV)
AC_MSG_CHECKING([for locale to use])
if test "x$LOCALE" != x; then
# Check if we actually have C.UTF-8; if so, use it
if $LOCALE -a | $GREP -q -E "^C\.(utf8|UTF-8)$"; then
LOCALE_USED=C.UTF-8
AC_MSG_RESULT([C.UTF-8 (recommended)])
elif $LOCALE -a | $GREP -q -E "^en_US\.(utf8|UTF-8)$"; then
LOCALE_USED=en_US.UTF-8
AC_MSG_RESULT([en_US.UTF-8 (acceptable fallback)])
else
AC_MSG_WARN([C.UTF-8 locale not found, using C locale])
LOCALE_USED=C
# As a fallback, check if the user's locale is UTF-8. USER_LOCALE was saved
# by the wrapper configure script before autoconf messed up LC_ALL.
if $ECHO $USER_LOCALE | $GREP -q -E "\.(utf8|UTF-8)$"; then
LOCALE_USED=$USER_LOCALE
AC_MSG_RESULT([$USER_LOCALE (untested fallback)])
AC_MSG_WARN([Could not find C.UTF-8 or en_US.UTF-8 locale. This is not supported, and the build might fail unexpectedly.])
else
AC_MSG_RESULT([no UTF-8 locale found])
AC_MSG_WARN([No UTF-8 locale found. This is not supported. Proceeding with the C locale, but the build might fail unexpectedly.])
LOCALE_USED=C
fi
AC_MSG_NOTICE([The recommended locale is C.UTF-8, but en_US.UTF-8 is also accepted.])
fi
else
AC_MSG_WARN([locale command not not found, using C locale])
LOCALE_USED=C
# Without the locale command we cannot probe which locales are installed, so
# C.UTF-8 is selected unverified and a warning is issued.
LOCALE_USED=C.UTF-8
AC_MSG_RESULT([C.UTF-8 (default)])
AC_MSG_WARN([locale command not found, using C.UTF-8 locale])
fi
export LC_ALL=$LOCALE_USED

View File

@ -49,7 +49,9 @@ fi
export CONFIG_SHELL=$BASH
export _as_can_reexec=no
# Make sure all shell commands are executed with the C locale
# Save the user's effective locale before overriding it. LC_ALL is frequently
# unset, so fall back to LC_CTYPE and then LANG (the POSIX precedence order for
# the character-type locale). After that, make sure all future shell commands
# are executed with the C locale.
export USER_LOCALE=${LC_ALL:-${LC_CTYPE:-${LANG:-}}}
export LC_ALL=C
if test "x$CUSTOM_CONFIG_DIR" != x; then

View File

@ -573,12 +573,20 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_HELPER],
TOOLCHAIN_CFLAGS_JDK="$TOOLCHAIN_CFLAGS_JDK -fvisibility=hidden -fstack-protector"
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
# The -utf-8 option sets source and execution character sets to UTF-8 to enable correct
# compilation of all source files regardless of the active code page on Windows.
TOOLCHAIN_CFLAGS_JVM="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -utf-8 -MP"
TOOLCHAIN_CFLAGS_JDK="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -utf-8 -Zc:wchar_t-"
TOOLCHAIN_CFLAGS_JVM="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -MP"
TOOLCHAIN_CFLAGS_JDK="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -Zc:wchar_t-"
fi
# Set character encoding in source
if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
CHARSET_CFLAGS="-finput-charset=utf-8"
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
# The -utf-8 option sets both source and execution character sets
CHARSET_CFLAGS="-utf-8 -validate-charset"
fi
TOOLCHAIN_CFLAGS_JVM="$TOOLCHAIN_CFLAGS_JVM $CHARSET_CFLAGS"
TOOLCHAIN_CFLAGS_JDK="$TOOLCHAIN_CFLAGS_JDK $CHARSET_CFLAGS"
# CFLAGS C language level for JDK sources (hotspot only uses C++)
if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
LANGSTD_CFLAGS="-std=c11"

View File

@ -80,15 +80,13 @@ endef
#
# The sed expression does this:
# 1. Add a backslash before any :, = or ! that do not have a backslash already.
# 2. Apply the file unicode2x.sed which does a whole bunch of \u00XX to \xXX
# conversions.
# 3. Delete all lines starting with #.
# 4. Delete empty lines.
# 5. Append lines ending with \ with the next line.
# 6. Remove leading and trailing white space. Note that tabs must be explicit
# 2. Delete all lines starting with #.
# 3. Delete empty lines.
# 4. Append lines ending with \ with the next line.
# 5. Remove leading and trailing white space. Note that tabs must be explicit
# as sed on macosx does not understand '\t'.
# 7. Replace the first \= with just =.
# 8. Finally it's all sorted to create a stable output.
# 6. Replace the first \= with just =.
# 7. Finally it's all sorted to create a stable output.
#
# It is assumed that = is the character used for separating names and values.
define add_file_to_clean
@ -108,7 +106,6 @@ define add_file_to_clean
( $(CAT) $$< && $(ECHO) "" ) \
| $(SED) -e 's/\([^\\]\):/\1\\:/g' -e 's/\([^\\]\)=/\1\\=/g' \
-e 's/\([^\\]\)!/\1\\!/g' -e 's/^[ ]*#.*/#/g' \
| $(SED) -f "$$(TOPDIR)/make/common/support/unicode2x.sed" \
| $(SED) -e '/^#/d' -e '/^$$$$/d' \
-e :a -e '/\\$$$$/N; s/\\\n//; ta' \
-e 's/^[ ]*//;s/[ ]*$$$$//' \
@ -265,10 +262,12 @@ define SetupJavaCompilationBody
endif
# Tell javac to do exactly as told and no more
PARANOIA_FLAGS := -implicit:none -Xprefer:source -XDignore.symbol.file=true -encoding ascii
PARANOIA_FLAGS := -implicit:none -Xprefer:source -XDignore.symbol.file=true
$1_FLAGS += -g -Xlint:all $$($1_TARGET_RELEASE) $$(PARANOIA_FLAGS)
$1_FLAGS += $$($1_JAVAC_FLAGS)
# Set character encoding in source
$1_FLAGS += -encoding utf-8
ifeq ($$(JAVA_WARNINGS_AS_ERRORS), true)
$1_FLAGS += -Werror

View File

@ -227,6 +227,8 @@ endef
GLOBAL_VERSION_INFO_RESOURCE := $(TOPDIR)/src/java.base/windows/native/common/version.rc
# \xA9 is the copyright symbol in ANSI encoding (Windows-1252), which rc.exe
# assumes the resource file is in.
JDK_RCFLAGS=$(RCFLAGS) \
-D"JDK_VERSION_STRING=$(VERSION_STRING)" \
-D"JDK_COMPANY=$(JDK_RC_COMPANY_NAME)" \

View File

@ -1,100 +0,0 @@
s/\\u0020/\x20/g
s/\\u003A/\x3A/g
s/\\u006B/\x6B/g
s/\\u0075/\x75/g
s/\\u00A0/\xA0/g
s/\\u00A3/\xA3/g
s/\\u00B0/\xB0/g
s/\\u00B7/\xB7/g
s/\\u00BA/\xBA/g
s/\\u00BF/\xBF/g
s/\\u00C0/\xC0/g
s/\\u00C1/\xC1/g
s/\\u00C2/\xC2/g
s/\\u00C4/\xC4/g
s/\\u00C5/\xC5/g
s/\\u00C8/\xC8/g
s/\\u00C9/\xC9/g
s/\\u00CA/\xCA/g
s/\\u00CD/\xCD/g
s/\\u00CE/\xCE/g
s/\\u00D3/\xD3/g
s/\\u00D4/\xD4/g
s/\\u00D6/\xD6/g
s/\\u00DA/\xDA/g
s/\\u00DC/\xDC/g
s/\\u00DD/\xDD/g
s/\\u00DF/\xDF/g
s/\\u00E0/\xE0/g
s/\\u00E1/\xE1/g
s/\\u00E2/\xE2/g
s/\\u00E3/\xE3/g
s/\\u00E4/\xE4/g
s/\\u00E5/\xE5/g
s/\\u00E6/\xE6/g
s/\\u00E7/\xE7/g
s/\\u00E8/\xE8/g
s/\\u00E9/\xE9/g
s/\\u00EA/\xEA/g
s/\\u00EB/\xEB/g
s/\\u00EC/\xEC/g
s/\\u00ED/\xED/g
s/\\u00EE/\xEE/g
s/\\u00EF/\xEF/g
s/\\u00F1/\xF1/g
s/\\u00F2/\xF2/g
s/\\u00F3/\xF3/g
s/\\u00F4/\xF4/g
s/\\u00F5/\xF5/g
s/\\u00F6/\xF6/g
s/\\u00F9/\xF9/g
s/\\u00FA/\xFA/g
s/\\u00FC/\xFC/g
s/\\u0020/\x20/g
s/\\u003f/\x3f/g
s/\\u006f/\x6f/g
s/\\u0075/\x75/g
s/\\u00a0/\xa0/g
s/\\u00a3/\xa3/g
s/\\u00b0/\xb0/g
s/\\u00ba/\xba/g
s/\\u00bf/\xbf/g
s/\\u00c1/\xc1/g
s/\\u00c4/\xc4/g
s/\\u00c5/\xc5/g
s/\\u00c8/\xc8/g
s/\\u00c9/\xc9/g
s/\\u00ca/\xca/g
s/\\u00cd/\xcd/g
s/\\u00d6/\xd6/g
s/\\u00dc/\xdc/g
s/\\u00dd/\xdd/g
s/\\u00df/\xdf/g
s/\\u00e0/\xe0/g
s/\\u00e1/\xe1/g
s/\\u00e2/\xe2/g
s/\\u00e3/\xe3/g
s/\\u00e4/\xe4/g
s/\\u00e5/\xe5/g
s/\\u00e7/\xe7/g
s/\\u00e8/\xe8/g
s/\\u00e9/\xe9/g
s/\\u00ea/\xea/g
s/\\u00eb/\xeb/g
s/\\u00ec/\xec/g
s/\\u00ed/\xed/g
s/\\u00ee/\xee/g
s/\\u00ef/\xef/g
s/\\u00f0/\xf0/g
s/\\u00f1/\xf1/g
s/\\u00f2/\xf2/g
s/\\u00f3/\xf3/g
s/\\u00f4/\xf4/g
s/\\u00f5/\xf5/g
s/\\u00f6/\xf6/g
s/\\u00f7/\xf7/g
s/\\u00f8/\xf8/g
s/\\u00f9/\xf9/g
s/\\u00fa/\xfa/g
s/\\u00fc/\xfc/g
s/\\u00ff/\xff/g

View File

@ -87,7 +87,7 @@
"zh", "zh_CN",
#ifdef __linux__
"bokmal", "nb_NO",
"bokm\xE5l", "nb_NO",
"bokmål", "nb_NO",
"catalan", "ca_ES",
"croatian", "hr_HR",
"czech", "cs_CZ",
@ -98,7 +98,7 @@
"eesti", "et_EE",
"estonian", "et_EE",
"finnish", "fi_FI",
"fran\xE7\x61is", "fr_FR",
"français", "fr_FR",
"french", "fr_FR",
"galego", "gl_ES",
"galician", "gl_ES",
@ -162,7 +162,7 @@ static char *language_names[] = {
"deutsch", "de",
"dutch", "nl",
"finnish", "fi",
"fran\xE7\x61is", "fr",
"français", "fr",
"french", "fr",
"german", "de",
"greek", "el",

View File

@ -134,7 +134,7 @@ WCHAR * fixes[2][2][3][16] =
L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
},
{ // currency
L"\xA4", L"", L"\xA4 ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
L"¤", L"", L"¤ ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
},
{ // percent
L"", L"", L"%", L"% ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
@ -145,7 +145,7 @@ WCHAR * fixes[2][2][3][16] =
L"(", L"-", L"- ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
},
{ //currency
L"(\xA4", L"-\xA4", L"\xA4-", L"\xA4", L"(", L"-", L"", L"", L"-", L"-\xA4 ", L"", L"\xA4 ", L"\xA4 -", L"", L"(\xA4 ", L"("
// Second entry is L"-¤", the UTF-8 spelling of the former L"-\xA4"; it must not be empty.
L"(¤", L"-¤", L"¤-", L"¤", L"(", L"-", L"", L"", L"-", L"-¤ ", L"", L"¤ ", L"¤ -", L"", L"(¤ ", L"("
},
{ // percent
L"-", L"-", L"-%", L"%-", L"%", L"", L"", L"-% ", L"", L"% ", L"% -", L"", L"", L"", L"", L"",
@ -158,7 +158,7 @@ WCHAR * fixes[2][2][3][16] =
L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L""
},
{ // currency
L"", L"\xA4 ", L"", L" \xA4", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
L"", L"¤ ", L"", L" ¤", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
},
{ // percent
L" %", L"%", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
@ -169,7 +169,7 @@ WCHAR * fixes[2][2][3][16] =
L")", L"", L" ", L"-", L" -", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
},
{ //currency
L")", L"", L"", L"-", L"\xA4)", L"\xA4", L"-\xA4", L"\xA4-", L" \xA4", L"", L" \xA4-", L"-", L"", L"- \xA4", L")", L" \xA4)"
// Seventh entry is L"-¤", the UTF-8 spelling of the former L"-\xA4"; it must not be empty.
L")", L"", L"", L"-", L"¤)", L"¤", L"-¤", L"¤-", L" ¤", L"", L" ¤-", L"-", L"", L"- ¤", L")", L" ¤)"
},
{ // percent
L" %", L"%", L"", L"", L"-", L"-%", L"%-", L"", L" %-", L"-", L"", L"- %", L"", L"", L"", L"",