8301971: Make JDK source code UTF-8

8338973: Document need to have UTF-8 locale available to build the JDK

Reviewed-by: erikj, naoto, mbaesken
2025-05-09 09:05:10 +00:00 · 2025-05-09 09:05:10 +00:00 · 3aa2ea7e67
commit 3aa2ea7e67
parent 74e981e855
13 changed files with 72 additions and 128 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -1,5 +1,8 @@
 root = true
 [*]
 charset = utf-8
 [*.{cpp,hpp,c,h,java,cc,hh,m,mm,S,md,properties,gmk,m4,ac}]
 trim_trailing_whitespace = true
--- a/.gitattributes
+++ b/.gitattributes
@ -1,4 +1,5 @@
 *	-text
 *	encoding=utf-8
 *.java	diff=java
 *.c	diff=cpp
 *.h	diff=cpp
--- a/doc/building.html
+++ b/doc/building.html
@ -305,6 +305,14 @@ using
 <li><p>If using <a href="#cygwin">Cygwin</a>, you must make sure the
 file permissions and attributes between Windows and Cygwin are
 consistent. It is recommended that you follow this procedure:</p>
 <li><p>UTF-8 support is needed to compile the JDK. On Unix systems, this
 typically means that the <code>C.UTF-8</code> or
 <code>en_US.UTF-8</code> locale needs to be available. For Windows
 users, please see the section on <a href="#locale-requirements">Locale
 Requirements</a> below.</p></li>
 <li><p>On Windows, if using <a href="#cygwin">Cygwin</a>, extra care
 must be taken to make sure the environment is consistent. It is
 recommended that you follow this procedure:</p>
 <ul>
 <li><p>Create the directory that is going to contain the top directory
 of the JDK clone by using the <code>mkdir</code> command in the Cygwin
--- a/doc/building.md
+++ b/doc/building.md
@ -83,6 +83,11 @@ on where and how to check out the source code.
  for the source code, see below for suggestions on how to keep the build
  artifacts on a local disk.
 * UTF-8 support is needed to compile the JDK. On Unix systems, this typically
  means that the `C.UTF-8` or `en_US.UTF-8` locale needs to be available. For
  Windows users, please see the section on [Locale
  Requirements](#locale-requirements) below.
 * On Windows, extra care must be taken to have a smooth building experience:
  * Make sure that all relevant paths have short names. Short names are used by
--- a/make/Docs.gmk
+++ b/make/Docs.gmk
@ -96,14 +96,14 @@ JAVADOC_DISABLED_DOCLINT_PACKAGES := org.w3c.* javax.smartcardio
 # The initial set of options for javadoc
 JAVADOC_OPTIONS := -use -keywords -notimestamp \
-    -serialwarn -encoding ISO-8859-1 -docencoding UTF-8 -breakiterator \
+    -serialwarn -encoding utf-8 -docencoding utf-8 -breakiterator \
    -splitIndex --system none -javafx --expand-requires transitive \
    --override-methods=summary --syntax-highlight
 # The reference options must stay stable to allow for comparisons across the
 # development cycle.
 REFERENCE_OPTIONS := -XDignore.symbol.file=true -use -keywords -notimestamp \
-    -serialwarn -encoding ISO-8859-1 -breakiterator -splitIndex --system none \
+    -serialwarn -encoding utf-8 -breakiterator -splitIndex --system none \
    -html5 -javafx --expand-requires transitive
 # Should we add DRAFT stamps to the generated javadoc?
--- a/make/autoconf/basic.m4
+++ b/make/autoconf/basic.m4
@ -134,17 +134,33 @@ AC_DEFUN_ONCE([BASIC_SETUP_BUILD_ENV],
  )
  AC_SUBST(BUILD_ENV)
  AC_MSG_CHECKING([for locale to use])
  if test "x$LOCALE" != x; then
    # Check if we actually have C.UTF-8; if so, use it
    if $LOCALE -a | $GREP -q -E "^C\.(utf8|UTF-8)$"; then
      LOCALE_USED=C.UTF-8
      AC_MSG_RESULT([C.UTF-8 (recommended)])
    elif $LOCALE -a | $GREP -q -E "^en_US\.(utf8|UTF-8)$"; then
      LOCALE_USED=en_US.UTF-8
      AC_MSG_RESULT([en_US.UTF-8 (acceptable fallback)])
    else
-      AC_MSG_WARN([C.UTF-8 locale not found, using C locale])
+      # As a fallback, check if user's locale is UTF-8. USER_LOCALE was saved
-      LOCALE_USED=C
+      # by the wrapper configure script before autoconf messed up LC_ALL.
      if $ECHO $USER_LOCALE | $GREP -q -E "\.(utf8|UTF-8)$"; then
        LOCALE_USED=$USER_LOCALE
        AC_MSG_RESULT([$USER_LOCALE (untested fallback)])
        AC_MSG_WARN([Could not find C.UTF-8 or en_US.UTF-8 locale. This is not supported, and the build might fail unexpectedly.])
      else
        AC_MSG_RESULT([no UTF-8 locale found])
        AC_MSG_WARN([No UTF-8 locale found. This is not supported. Proceeding with the C locale, but the build might fail unexpectedly.])
        LOCALE_USED=C
      fi
      AC_MSG_NOTICE([The recommended locale is C.UTF-8, but en_US.UTF-8 is also accepted.])
    fi
  else
-    AC_MSG_WARN([locale command not not found, using C locale])
+    LOCALE_USED=C.UTF-8
-    LOCALE_USED=C
+    AC_MSG_RESULT([C.UTF-8 (default)])
    AC_MSG_WARN([locale command not found, using C.UTF-8 locale])
  fi
  export LC_ALL=$LOCALE_USED
--- a/make/autoconf/configure
+++ b/make/autoconf/configure
@ -49,7 +49,9 @@ fi
 export CONFIG_SHELL=$BASH
 export _as_can_reexec=no
-# Make sure all shell commands are executed with the C locale
+# Save user's current locale, but make sure all future shell commands are
 # executed with the C locale
 export USER_LOCALE=$LC_ALL
 export LC_ALL=C
 if test "x$CUSTOM_CONFIG_DIR" != x; then
--- a/make/autoconf/flags-cflags.m4
+++ b/make/autoconf/flags-cflags.m4
@ -573,12 +573,20 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_HELPER],
    TOOLCHAIN_CFLAGS_JDK="$TOOLCHAIN_CFLAGS_JDK -fvisibility=hidden -fstack-protector"
  elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
-    # The -utf-8 option sets source and execution character sets to UTF-8 to enable correct
+    TOOLCHAIN_CFLAGS_JVM="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -MP"
-    # compilation of all source files regardless of the active code page on Windows.
+    TOOLCHAIN_CFLAGS_JDK="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -Zc:wchar_t-"
    TOOLCHAIN_CFLAGS_JVM="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -utf-8 -MP"
    TOOLCHAIN_CFLAGS_JDK="-nologo -MD -Zc:preprocessor -Zc:inline -Zc:throwingNew -permissive- -utf-8 -Zc:wchar_t-"
  fi
  # Set character encoding in source
  if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
    CHARSET_CFLAGS="-finput-charset=utf-8"
  elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
    # The -utf-8 option sets both source and execution character sets
    CHARSET_CFLAGS="-utf-8 -validate-charset"
  fi
  TOOLCHAIN_CFLAGS_JVM="$TOOLCHAIN_CFLAGS_JVM $CHARSET_CFLAGS"
  TOOLCHAIN_CFLAGS_JDK="$TOOLCHAIN_CFLAGS_JDK $CHARSET_CFLAGS"
  # CFLAGS C language level for JDK sources (hotspot only uses C++)
  if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
    LANGSTD_CFLAGS="-std=c11"
--- a/make/common/JavaCompilation.gmk
+++ b/make/common/JavaCompilation.gmk
@ -80,15 +80,13 @@ endef
 #
 # The sed expression does this:
 # 1. Add a backslash before any :, = or ! that do not have a backslash already.
-# 2. Apply the file unicode2x.sed which does a whole bunch of \u00XX to \xXX
+# 2. Delete all lines starting with #.
-#    conversions.
+# 3. Delete empty lines.
-# 3. Delete all lines starting with #.
+# 4. Append lines ending with \ with the next line.
-# 4. Delete empty lines.
+# 5. Remove leading and trailing white space. Note that tabs must be explicit
 # 5. Append lines ending with \ with the next line.
 # 6. Remove leading and trailing white space. Note that tabs must be explicit
 #    as sed on macosx does not understand '\t'.
-# 7. Replace the first \= with just =.
+# 6. Replace the first \= with just =.
-# 8. Finally it's all sorted to create a stable output.
+# 7. Finally it's all sorted to create a stable output.
 #
 # It is assumed that = is the character used for separating names and values.
 define add_file_to_clean
@ -108,7 +106,6 @@ define add_file_to_clean
 	( $(CAT) $$< && $(ECHO) "" ) \
 	    | $(SED) -e 's/\([^\\]\):/\1\\:/g' -e 's/\([^\\]\)=/\1\\=/g' \
 	        -e 's/\([^\\]\)!/\1\\!/g' -e 's/^[ 	]*#.*/#/g' \
 	    | $(SED) -f "$$(TOPDIR)/make/common/support/unicode2x.sed" \
 	    | $(SED) -e '/^#/d' -e '/^$$$$/d' \
 	        -e :a -e '/\\$$$$/N; s/\\\n//; ta' \
 	        -e 's/^[ 	]*//;s/[ 	]*$$$$//' \
@ -265,10 +262,12 @@ define SetupJavaCompilationBody
  endif
  # Tell javac to do exactly as told and no more
-  PARANOIA_FLAGS := -implicit:none -Xprefer:source -XDignore.symbol.file=true -encoding ascii
+  PARANOIA_FLAGS := -implicit:none -Xprefer:source -XDignore.symbol.file=true
  $1_FLAGS += -g -Xlint:all $$($1_TARGET_RELEASE) $$(PARANOIA_FLAGS)
  $1_FLAGS += $$($1_JAVAC_FLAGS)
  # Set character encoding in source
  $1_FLAGS += -encoding utf-8
  ifeq ($$(JAVA_WARNINGS_AS_ERRORS), true)
    $1_FLAGS += -Werror
--- a/make/common/JdkNativeCompilation.gmk
+++ b/make/common/JdkNativeCompilation.gmk
@ -227,6 +227,8 @@ endef
 GLOBAL_VERSION_INFO_RESOURCE := $(TOPDIR)/src/java.base/windows/native/common/version.rc
 # \xA9 is the copyright symbol in ANSI encoding (Windows-1252), which rc.exe
 # assumes the resource file is in.
 JDK_RCFLAGS=$(RCFLAGS) \
    -D"JDK_VERSION_STRING=$(VERSION_STRING)" \
    -D"JDK_COMPANY=$(JDK_RC_COMPANY_NAME)" \
--- a/make/common/support/unicode2x.sed
+++ b/make/common/support/unicode2x.sed
@ -1,100 +0,0 @@
 s/\\u0020/\x20/g
 s/\\u003A/\x3A/g
 s/\\u006B/\x6B/g
 s/\\u0075/\x75/g
 s/\\u00A0/\xA0/g
 s/\\u00A3/\xA3/g
 s/\\u00B0/\xB0/g
 s/\\u00B7/\xB7/g
 s/\\u00BA/\xBA/g
 s/\\u00BF/\xBF/g
 s/\\u00C0/\xC0/g
 s/\\u00C1/\xC1/g
 s/\\u00C2/\xC2/g
 s/\\u00C4/\xC4/g
 s/\\u00C5/\xC5/g
 s/\\u00C8/\xC8/g
 s/\\u00C9/\xC9/g
 s/\\u00CA/\xCA/g
 s/\\u00CD/\xCD/g
 s/\\u00CE/\xCE/g
 s/\\u00D3/\xD3/g
 s/\\u00D4/\xD4/g
 s/\\u00D6/\xD6/g
 s/\\u00DA/\xDA/g
 s/\\u00DC/\xDC/g
 s/\\u00DD/\xDD/g
 s/\\u00DF/\xDF/g
 s/\\u00E0/\xE0/g
 s/\\u00E1/\xE1/g
 s/\\u00E2/\xE2/g
 s/\\u00E3/\xE3/g
 s/\\u00E4/\xE4/g
 s/\\u00E5/\xE5/g
 s/\\u00E6/\xE6/g
 s/\\u00E7/\xE7/g
 s/\\u00E8/\xE8/g
 s/\\u00E9/\xE9/g
 s/\\u00EA/\xEA/g
 s/\\u00EB/\xEB/g
 s/\\u00EC/\xEC/g
 s/\\u00ED/\xED/g
 s/\\u00EE/\xEE/g
 s/\\u00EF/\xEF/g
 s/\\u00F1/\xF1/g
 s/\\u00F2/\xF2/g
 s/\\u00F3/\xF3/g
 s/\\u00F4/\xF4/g
 s/\\u00F5/\xF5/g
 s/\\u00F6/\xF6/g
 s/\\u00F9/\xF9/g
 s/\\u00FA/\xFA/g
 s/\\u00FC/\xFC/g
 s/\\u0020/\x20/g
 s/\\u003f/\x3f/g
 s/\\u006f/\x6f/g
 s/\\u0075/\x75/g
 s/\\u00a0/\xa0/g
 s/\\u00a3/\xa3/g
 s/\\u00b0/\xb0/g
 s/\\u00ba/\xba/g
 s/\\u00bf/\xbf/g
 s/\\u00c1/\xc1/g
 s/\\u00c4/\xc4/g
 s/\\u00c5/\xc5/g
 s/\\u00c8/\xc8/g
 s/\\u00c9/\xc9/g
 s/\\u00ca/\xca/g
 s/\\u00cd/\xcd/g
 s/\\u00d6/\xd6/g
 s/\\u00dc/\xdc/g
 s/\\u00dd/\xdd/g
 s/\\u00df/\xdf/g
 s/\\u00e0/\xe0/g
 s/\\u00e1/\xe1/g
 s/\\u00e2/\xe2/g
 s/\\u00e3/\xe3/g
 s/\\u00e4/\xe4/g
 s/\\u00e5/\xe5/g
 s/\\u00e7/\xe7/g
 s/\\u00e8/\xe8/g
 s/\\u00e9/\xe9/g
 s/\\u00ea/\xea/g
 s/\\u00eb/\xeb/g
 s/\\u00ec/\xec/g
 s/\\u00ed/\xed/g
 s/\\u00ee/\xee/g
 s/\\u00ef/\xef/g
 s/\\u00f0/\xf0/g
 s/\\u00f1/\xf1/g
 s/\\u00f2/\xf2/g
 s/\\u00f3/\xf3/g
 s/\\u00f4/\xf4/g
 s/\\u00f5/\xf5/g
 s/\\u00f6/\xf6/g
 s/\\u00f7/\xf7/g
 s/\\u00f8/\xf8/g
 s/\\u00f9/\xf9/g
 s/\\u00fa/\xfa/g
 s/\\u00fc/\xfc/g
 s/\\u00ff/\xff/g
--- a/src/java.base/unix/native/libjava/locale_str.h
+++ b/src/java.base/unix/native/libjava/locale_str.h
@ -87,7 +87,7 @@
    "zh", "zh_CN",
 #ifdef __linux__
    "bokmal", "nb_NO",
-    "bokm\xE5l", "nb_NO",
+    "bokmål", "nb_NO",
    "catalan", "ca_ES",
    "croatian", "hr_HR",
    "czech", "cs_CZ",
@ -98,7 +98,7 @@
    "eesti", "et_EE",
    "estonian", "et_EE",
    "finnish", "fi_FI",
-    "fran\xE7\x61is", "fr_FR",
+    "français", "fr_FR",
    "french", "fr_FR",
    "galego", "gl_ES",
    "galician", "gl_ES",
@ -162,7 +162,7 @@ static char *language_names[] = {
    "deutsch", "de",
    "dutch", "nl",
    "finnish", "fi",
-    "fran\xE7\x61is", "fr",
+    "français", "fr",
    "french", "fr",
    "german", "de",
    "greek", "el",
--- a/src/java.base/windows/native/libjava/HostLocaleProviderAdapter_md.c
+++ b/src/java.base/windows/native/libjava/HostLocaleProviderAdapter_md.c
@ -134,7 +134,7 @@ WCHAR * fixes[2][2][3][16] =
                L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
            },
            { // currency
-                L"\xA4", L"", L"\xA4 ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
+                L"¤", L"", L"¤ ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
            },
            { // percent
                L"", L"", L"%", L"% ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
@ -145,7 +145,7 @@ WCHAR * fixes[2][2][3][16] =
                L"(", L"-", L"- ", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
            },
            { //currency
-                L"(\xA4", L"-\xA4", L"\xA4-", L"\xA4", L"(", L"-", L"", L"", L"-", L"-\xA4 ", L"", L"\xA4 ", L"\xA4 -", L"", L"(\xA4 ", L"("
+                L"(¤", L"-¤", L"¤-", L"¤", L"(", L"-", L"", L"", L"-", L"-¤ ", L"", L"¤ ", L"¤ -", L"", L"(¤ ", L"("
            },
            { // percent
                L"-", L"-", L"-%", L"%-", L"%", L"", L"", L"-% ", L"", L"% ", L"% -", L"", L"", L"", L"", L"",
@ -158,7 +158,7 @@ WCHAR * fixes[2][2][3][16] =
                L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L""
            },
            { // currency
-                L"", L"\xA4 ", L"", L" \xA4", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
+                L"", L"¤ ", L"", L" ¤", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
            },
            { // percent
                L" %", L"%", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
@ -169,7 +169,7 @@ WCHAR * fixes[2][2][3][16] =
                L")", L"", L" ", L"-", L" -", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"", L"",
            },
            { //currency
-                L")", L"", L"", L"-", L"\xA4)", L"\xA4", L"-\xA4", L"\xA4-", L" \xA4", L"", L" \xA4-", L"-", L"", L"- \xA4", L")", L" \xA4)"
+                L")", L"", L"", L"-", L"¤)", L"¤", L"-¤", L"¤-", L" ¤", L"", L" ¤-", L"-", L"", L"- ¤", L")", L" ¤)"
            },
            { // percent
                L" %", L"%", L"", L"", L"-", L"-%", L"%-", L"", L" %-", L"-", L"", L"- %", L"", L"", L"", L"",