8301971: Make JDK source code UTF-8

8338973: Document need to have UTF-8 locale available to build the JDK

Reviewed-by: erikj, naoto, mbaesken
This commit is contained in:
Magnus Ihse Bursie 2025-05-09 09:05:10 +00:00
parent 74e981e855
commit 3aa2ea7e67
13 changed files with 72 additions and 128 deletions

View File

@ -1,5 +1,8 @@
root = true root = true
[*]
charset = utf-8
[*.{cpp,hpp,c,h,java,cc,hh,m,mm,S,md,properties,gmk,m4,ac}] [*.{cpp,hpp,c,h,java,cc,hh,m,mm,S,md,properties,gmk,m4,ac}]
trim_trailing_whitespace = true trim_trailing_whitespace = true

1
.gitattributes vendored
View File

@ -1,4 +1,5 @@
* -text * -text
* encoding=utf-8
*.java diff=java *.java diff=java
*.c diff=cpp *.c diff=cpp
*.h diff=cpp *.h diff=cpp

View File

@ -305,6 +305,14 @@ using
<li><p>If using <a href="#cygwin">Cygwin</a>, you must make sure the <li><p>If using <a href="#cygwin">Cygwin</a>, you must make sure the
file permissions and attributes between Windows and Cygwin are file permissions and attributes between Windows and Cygwin are
consistent. It is recommended that you follow this procedure:</p> consistent. It is recommended that you follow this procedure:</p>
<li><p>UTF-8 support is needed to compile the JDK. On Unix systems, this
typically means that the <code>C.UTF-8</code> or
<code>en_US.UTF-8</code> locale needs to be available. For Windows
users, please see the section on <a href="#locale-requirements">Locale
Requirements</a> below.</p></li>
<li><p>On Windows, if using <a href="#cygwin">Cygwin</a>, extra care
must be taken to make sure the environment is consistent. It is
recommended that you follow this procedure:</p>
<ul> <ul>
<li><p>Create the directory that is going to contain the top directory <li><p>Create the directory that is going to contain the top directory
of the JDK clone by using the <code>mkdir</code> command in the Cygwin of the JDK clone by using the <code>mkdir</code> command in the Cygwin

View File

@ -83,6 +83,11 @@ on where and how to check out the source code.
for the source code, see below for suggestions on how to keep the build for the source code, see below for suggestions on how to keep the build
artifacts on a local disk. artifacts on a local disk.
* UTF-8 support is needed to compile the JDK. On Unix systems, this typically
means that the `C.UTF-8` or `en_US.UTF-8` locale needs to be available. For
Windows users, please see the section on [Locale
Requirements](#locale-requirements) below.
* On Windows, extra care must be taken to have a smooth building experience: * On Windows, extra care must be taken to have a smooth building experience:
* Make sure that all relevant paths have short names. Short names are used by * Make sure that all relevant paths have short names. Short names are used by

View File

@ -96,14 +96,14 @@ JAVADOC_DISABLED_DOCLINT_PACKAGES := org.w3c.* javax.smartcardio
# The initial set of options for javadoc # The initial set of options for javadoc
JAVADOC_OPTIONS := -use -keywords -notimestamp \ JAVADOC_OPTIONS := -use -keywords -notimestamp \
-serialwarn -encoding ISO-8859-1 -docencoding UTF-8 -breakiterator \ -serialwarn -encoding utf-8 -docencoding utf-8 -breakiterator \
-splitIndex --system none -javafx --expand-requires transitive \ -splitIndex --system none -javafx --expand-requires transitive \
--override-methods=summary --syntax-highlight --override-methods=summary --syntax-highlight
# The reference options must stay stable to allow for comparisons across the # The reference options must stay stable to allow for comparisons across the
# development cycle. # development cycle.
REFERENCE_OPTIONS := -XDignore.symbol.file=true -use -keywords -notimestamp \ REFERENCE_OPTIONS := -XDignore.symbol.file=true -use -keywords -notimestamp \
-serialwarn -encoding ISO-8859-1 -breakiterator -splitIndex --system none \ -serialwarn -encoding utf-8 -breakiterator -splitIndex --system none \
-html5 -javafx --expand-requires transitive -html5 -javafx --expand-requires transitive
# Should we add DRAFT stamps to the generated javadoc? # Should we add DRAFT stamps to the generated javadoc?

View File

@ -134,17 +134,33 @@ AC_DEFUN_ONCE([BASIC_SETUP_BUILD_ENV],
) )
AC_SUBST(BUILD_ENV) AC_SUBST(BUILD_ENV)
AC_MSG_CHECKING([for locale to use])
if test "x$LOCALE" != x; then if test "x$LOCALE" != x; then
# Check if we actually have C.UTF-8; if so, use it # Check if we actually have C.UTF-8; if so, use it
if $LOCALE -a | $GREP -q -E "^C\.(utf8|UTF-8)$"; then if $LOCALE -a | $GREP -q -E "^C\.(utf8|UTF-8)$"; then
LOCALE_USED=C.UTF-8 LOCALE_USED=C.UTF-8
AC_MSG_RESULT([C.UTF-8 (recommended)])
elif $LOCALE -a | $GREP -q -E "^en_US\.(utf8|UTF-8)$"; then
LOCALE_USED=en_US.UTF-8
AC_MSG_RESULT([en_US.UTF-8 (acceptable fallback)])
else else
AC_MSG_WARN([C.UTF-8 locale not found, using C locale]) # As a fallback, check if user's locale is UTF-8. USER_LOCALE was saved
LOCALE_USED=C # by the wrapper configure script before autoconf messed up LC_ALL.
if $ECHO $USER_LOCALE | $GREP -q -E "\.(utf8|UTF-8)$"; then
LOCALE_USED=$USER_LOCALE
AC_MSG_RESULT([$USER_LOCALE (untested fallback)])
AC_MSG_WARN([Could not find C.UTF-8 or en_US.UTF-8 locale. This is not supported, and the build might fail unexpectedly.])
else
AC_MSG_RESULT([no UTF-8 locale found])
AC_MSG_WARN([No UTF-8 locale found. This is not supported. Proceeding with the C locale, but the build might fail unexpectedly.])
LOCALE_USED=C
fi
AC_MSG_NOTICE([The recommended locale is C.UTF-8, but en_US.UTF-8 is also accepted.])
fi fi
else else
AC_MSG_WARN([locale command not not found, using C locale]) LOCALE_USED=C.UTF-8
LOCALE_USED=C AC_MSG_RESULT([C.UTF-8 (default)])
AC_MSG_WARN([locale command not found, using C.UTF-8 locale])
fi fi
export LC_ALL=$LOCALE_USED export LC_ALL=$LOCALE_USED

View File

@ -49,7 +49,9 @@ fi
export CONFIG_SHELL=$BASH export CONFIG_SHELL=$BASH
export _as_can_reexec=no export _as_can_reexec=no
# Make sure all shell commands are executed with the C locale # Save user's current locale, but make sure all future shell commands are
# executed with the C locale
export USER_LOCALE=$LC_ALL
export LC_ALL=C export LC_ALL=C
if test "x$CUSTOM_CONFIG_DIR" != x; then if test "x$CUSTOM_CONFIG_DIR" != x; then

View File

@ -573,12 +573,20 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_HELPER],
TOOLCHAIN_CFLAGS_JDK="$TOOLCHAIN_CFLAGS_JDK -fvisibility=hidden -fstack-protector" TOOLCHAIN_CFLAGS_JDK="$TOOLCHAIN_CFLAGS_JDK -fvisibility=hidden -fstack-protector"
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
# The -utf-8 option sets source and execution character sets to UTF-8 to enable correct TOOLCHAIN_CFLAGS_JVM="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -MP"
# compilation of all source files regardless of the active code page on Windows. TOOLCHAIN_CFLAGS_JDK="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -Zc:wchar_t-"
TOOLCHAIN_CFLAGS_JVM="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -utf-8 -MP"
TOOLCHAIN_CFLAGS_JDK="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -utf-8 -Zc:wchar_t-"
fi fi
# Set character encoding in source
if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
CHARSET_CFLAGS="-finput-charset=utf-8"
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
# The -utf-8 option sets both source and execution character sets
CHARSET_CFLAGS="-utf-8 -validate-charset"
fi
TOOLCHAIN_CFLAGS_JVM="$TOOLCHAIN_CFLAGS_JVM $CHARSET_CFLAGS"
TOOLCHAIN_CFLAGS_JDK="$TOOLCHAIN_CFLAGS_JDK $CHARSET_CFLAGS"
# CFLAGS C language level for JDK sources (hotspot only uses C++) # CFLAGS C language level for JDK sources (hotspot only uses C++)
if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
LANGSTD_CFLAGS="-std=c11" LANGSTD_CFLAGS="-std=c11"

View File

@ -80,15 +80,13 @@ endef
# #
# The sed expression does this: # The sed expression does this:
# 1. Add a backslash before any :, = or ! that do not have a backslash already. # 1. Add a backslash before any :, = or ! that do not have a backslash already.
# 2. Apply the file unicode2x.sed which does a whole bunch of \u00XX to \xXX # 2. Delete all lines starting with #.
# conversions. # 3. Delete empty lines.
# 3. Delete all lines starting with #. # 4. Append lines ending with \ with the next line.
# 4. Delete empty lines. # 5. Remove leading and trailing white space. Note that tabs must be explicit
# 5. Append lines ending with \ with the next line.
# 6. Remove leading and trailing white space. Note that tabs must be explicit
# as sed on macosx does not understand '\t'. # as sed on macosx does not understand '\t'.
# 7. Replace the first \= with just =. # 6. Replace the first \= with just =.
# 8. Finally it's all sorted to create a stable output. # 7. Finally it's all sorted to create a stable output.
# #
# It is assumed that = is the character used for separating names and values. # It is assumed that = is the character used for separating names and values.
define add_file_to_clean define add_file_to_clean
@ -108,7 +106,6 @@ define add_file_to_clean
( $(CAT) $$< && $(ECHO) "" ) \ ( $(CAT) $$< && $(ECHO) "" ) \
| $(SED) -e 's/\([^\\]\):/\1\\:/g' -e 's/\([^\\]\)=/\1\\=/g' \ | $(SED) -e 's/\([^\\]\):/\1\\:/g' -e 's/\([^\\]\)=/\1\\=/g' \
-e 's/\([^\\]\)!/\1\\!/g' -e 's/^[ ]*#.*/#/g' \ -e 's/\([^\\]\)!/\1\\!/g' -e 's/^[ ]*#.*/#/g' \
| $(SED) -f "$$(TOPDIR)/make/common/support/unicode2x.sed" \
| $(SED) -e '/^#/d' -e '/^$$$$/d' \ | $(SED) -e '/^#/d' -e '/^$$$$/d' \
-e :a -e '/\\$$$$/N; s/\\\n//; ta' \ -e :a -e '/\\$$$$/N; s/\\\n//; ta' \
-e 's/^[ ]*//;s/[ ]*$$$$//' \ -e 's/^[ ]*//;s/[ ]*$$$$//' \
@ -265,10 +262,12 @@ define SetupJavaCompilationBody
endif endif
# Tell javac to do exactly as told and no more # Tell javac to do exactly as told and no more
PARANOIA_FLAGS := -implicit:none -Xprefer:source -XDignore.symbol.file=true -encoding ascii PARANOIA_FLAGS := -implicit:none -Xprefer:source -XDignore.symbol.file=true
$1_FLAGS += -g -Xlint:all $$($1_TARGET_RELEASE) $$(PARANOIA_FLAGS) $1_FLAGS += -g -Xlint:all $$($1_TARGET_RELEASE) $$(PARANOIA_FLAGS)
$1_FLAGS += $$($1_JAVAC_FLAGS) $1_FLAGS += $$($1_JAVAC_FLAGS)
# Set character encoding in source
$1_FLAGS += -encoding utf-8
ifeq ($$(JAVA_WARNINGS_AS_ERRORS), true) ifeq ($$(JAVA_WARNINGS_AS_ERRORS), true)
$1_FLAGS += -Werror $1_FLAGS += -Werror

View File

@ -227,6 +227,8 @@ endef
GLOBAL_VERSION_INFO_RESOURCE := $(TOPDIR)/src/java.base/windows/native/common/version.rc GLOBAL_VERSION_INFO_RESOURCE := $(TOPDIR)/src/java.base/windows/native/common/version.rc
# \xA9 is the copyright symbol in ANSI encoding (Windows-1252), which rc.exe
# assumes the resource file is in.
JDK_RCFLAGS=$(RCFLAGS) \ JDK_RCFLAGS=$(RCFLAGS) \
-D"JDK_VERSION_STRING=$(VERSION_STRING)" \ -D"JDK_VERSION_STRING=$(VERSION_STRING)" \
-D"JDK_COMPANY=$(JDK_RC_COMPANY_NAME)" \ -D"JDK_COMPANY=$(JDK_RC_COMPANY_NAME)" \

View File

@ -1,100 +0,0 @@
s/\\u0020/\x20/g
s/\\u003A/\x3A/g
s/\\u006B/\x6B/g
s/\\u0075/\x75/g
s/\\u00A0/\xA0/g
s/\\u00A3/\xA3/g
s/\\u00B0/\xB0/g
s/\\u00B7/\xB7/g
s/\\u00BA/\xBA/g
s/\\u00BF/\xBF/g
s/\\u00C0/\xC0/g
s/\\u00C1/\xC1/g
s/\\u00C2/\xC2/g
s/\\u00C4/\xC4/g
s/\\u00C5/\xC5/g
s/\\u00C8/\xC8/g
s/\\u00C9/\xC9/g
s/\\u00CA/\xCA/g
s/\\u00CD/\xCD/g
s/\\u00CE/\xCE/g
s/\\u00D3/\xD3/g
s/\\u00D4/\xD4/g
s/\\u00D6/\xD6/g
s/\\u00DA/\xDA/g
s/\\u00DC/\xDC/g
s/\\u00DD/\xDD/g
s/\\u00DF/\xDF/g
s/\\u00E0/\xE0/g
s/\\u00E1/\xE1/g
s/\\u00E2/\xE2/g
s/\\u00E3/\xE3/g
s/\\u00E4/\xE4/g
s/\\u00E5/\xE5/g
s/\\u00E6/\xE6/g
s/\\u00E7/\xE7/g
s/\\u00E8/\xE8/g
s/\\u00E9/\xE9/g
s/\\u00EA/\xEA/g
s/\\u00EB/\xEB/g
s/\\u00EC/\xEC/g
s/\\u00ED/\xED/g
s/\\u00EE/\xEE/g
s/\\u00EF/\xEF/g
s/\\u00F1/\xF1/g
s/\\u00F2/\xF2/g
s/\\u00F3/\xF3/g
s/\\u00F4/\xF4/g
s/\\u00F5/\xF5/g
s/\\u00F6/\xF6/g
s/\\u00F9/\xF9/g
s/\\u00FA/\xFA/g
s/\\u00FC/\xFC/g
s/\\u0020/\x20/g
s/\\u003f/\x3f/g
s/\\u006f/\x6f/g
s/\\u0075/\x75/g
s/\\u00a0/\xa0/g
s/\\u00a3/\xa3/g
s/\\u00b0/\xb0/g
s/\\u00ba/\xba/g
s/\\u00bf/\xbf/g
s/\\u00c1/\xc1/g
s/\\u00c4/\xc4/g
s/\\u00c5/\xc5/g
s/\\u00c8/\xc8/g
s/\\u00c9/\xc9/g
s/\\u00ca/\xca/g
s/\\u00cd/\xcd/g
s/\\u00d6/\xd6/g
s/\\u00dc/\xdc/g
s/\\u00dd/\xdd/g
s/\\u00df/\xdf/g
s/\\u00e0/\xe0/g
s/\\u00e1/\xe1/g
s/\\u00e2/\xe2/g
s/\\u00e3/\xe3/g
s/\\u00e4/\xe4/g
s/\\u00e5/\xe5/g
s/\\u00e7/\xe7/g
s/\\u00e8/\xe8/g
s/\\u00e9/\xe9/g
s/\\u00ea/\xea/g
s/\\u00eb/\xeb/g
s/\\u00ec/\xec/g
s/\\u00ed/\xed/g
s/\\u00ee/\xee/g
s/\\u00ef/\xef/g
s/\\u00f0/\xf0/g
s/\\u00f1/\xf1/g
s/\\u00f2/\xf2/g
s/\\u00f3/\xf3/g
s/\\u00f4/\xf4/g
s/\\u00f5/\xf5/g
s/\\u00f6/\xf6/g
s/\\u00f7/\xf7/g
s/\\u00f8/\xf8/g
s/\\u00f9/\xf9/g
s/\\u00fa/\xfa/g
s/\\u00fc/\xfc/g
s/\\u00ff/\xff/g

View File

@ -87,7 +87,7 @@
"zh", "zh_CN", "zh", "zh_CN",
#ifdef __linux__ #ifdef __linux__
"bokmal", "nb_NO", "bokmal", "nb_NO",
"bokm\xE5l", "nb_NO", "bokmål", "nb_NO",
"catalan", "ca_ES", "catalan", "ca_ES",
"croatian", "hr_HR", "croatian", "hr_HR",
"czech", "cs_CZ", "czech", "cs_CZ",
@ -98,7 +98,7 @@
"eesti", "et_EE", "eesti", "et_EE",
"estonian", "et_EE", "estonian", "et_EE",
"finnish", "fi_FI", "finnish", "fi_FI",
"fran\xE7\x61is", "fr_FR", "français", "fr_FR",
"french", "fr_FR", "french", "fr_FR",
"galego", "gl_ES", "galego", "gl_ES",
"galician", "gl_ES", "galician", "gl_ES",
@ -162,7 +162,7 @@ static char *language_names[] = {
"deutsch", "de", "deutsch", "de",
"dutch", "nl", "dutch", "nl",
"finnish", "fi", "finnish", "fi",
"fran\xE7\x61is", "fr", "français", "fr",
"french", "fr", "french", "fr",
"german", "de", "german", "de",
"greek", "el", "greek", "el",

View File

@ -134,7 +134,7 @@ WCHAR * fixes[2][2][3][16] =
L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
}, },
{ // currency { // currency
L"\xA4", L"", L"\xA4 ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"¤", L"", L"¤ ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
}, },
{ // percent { // percent
L"", L"", L"%", L"% ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"%", L"% ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
@ -145,7 +145,7 @@ WCHAR * fixes[2][2][3][16] =
L"(", L"-", L"- ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"(", L"-", L"- ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
}, },
{ //currency { //currency
L"(\xA4", L"-\xA4", L"\xA4-", L"\xA4", L"(", L"-", L"", L"", L"-", L"-\xA4 ", L"", L"\xA4 ", L"\xA4 -", L"", L"(\xA4 ", L"(" L"(¤", L"-¤", L"¤-", L"¤", L"(", L"-", L"", L"", L"-", L"-¤ ", L"", L"¤ ", L"¤ -", L"", L"(¤ ", L"("
}, },
{ // percent { // percent
L"-", L"-", L"-%", L"%-", L"%", L"", L"", L"-% ", L"", L"% ", L"% -", L"", L"", L"", L"", L"", L"-", L"-", L"-%", L"%-", L"%", L"", L"", L"-% ", L"", L"% ", L"% -", L"", L"", L"", L"", L"",
@ -158,7 +158,7 @@ WCHAR * fixes[2][2][3][16] =
L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"" L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L""
}, },
{ // currency { // currency
L"", L"\xA4 ", L"", L" \xA4", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"¤ ", L"", L" ¤", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
}, },
{ // percent { // percent
L" %", L"%", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L" %", L"%", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
@ -169,7 +169,7 @@ WCHAR * fixes[2][2][3][16] =
L")", L"", L" ", L"-", L" -", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L")", L"", L" ", L"-", L" -", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
}, },
{ //currency { //currency
L")", L"", L"", L"-", L"\xA4)", L"\xA4", L"-\xA4", L"\xA4-", L" \xA4", L"", L" \xA4-", L"-", L"", L"- \xA4", L")", L" \xA4)" L")", L"", L"", L"-", L"¤)", L"¤", L"-¤", L"¤-", L" ¤", L"", L" ¤-", L"-", L"", L"- ¤", L")", L" ¤)"
}, },
{ // percent { // percent
L" %", L"%", L"", L"", L"-", L"-%", L"%-", L"", L" %-", L"-", L"", L"- %", L"", L"", L"", L"", L" %", L"%", L"", L"", L"-", L"-%", L"%-", L"", L" %-", L"-", L"", L"- %", L"", L"", L"", L"",