8261027: AArch64: Support for LSE atomics C++ HotSpot code

Reviewed-by: adinn, simonis
Andrew Haley 2021-02-12 13:12:02 +00:00
parent 9ffabf30c3
commit 40ae9937a0
6 changed files with 407 additions and 36 deletions

src/hotspot/cpu/aarch64/atomic_aarch64.hpp

@@ -0,0 +1,46 @@
/* Copyright (c) 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef CPU_AARCH64_ATOMIC_AARCH64_HPP
#define CPU_AARCH64_ATOMIC_AARCH64_HPP
// Atomic stub implementation.
// Default implementations are in atomic_linux_aarch64.S
//
// All stubs pass arguments the same way
// x0: src/dest address
// x1: arg1
// x2: arg2 (optional)
// x3, x8, x9: scratch
typedef uint64_t (*aarch64_atomic_stub_t)(volatile void *ptr, uint64_t arg1, uint64_t arg2);
// Pointers to stubs
extern aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl;
extern aarch64_atomic_stub_t aarch64_atomic_fetch_add_8_impl;
extern aarch64_atomic_stub_t aarch64_atomic_xchg_4_impl;
extern aarch64_atomic_stub_t aarch64_atomic_xchg_8_impl;
extern aarch64_atomic_stub_t aarch64_atomic_cmpxchg_1_impl;
extern aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_impl;
extern aarch64_atomic_stub_t aarch64_atomic_cmpxchg_8_impl;
#endif // CPU_AARCH64_ATOMIC_AARCH64_HPP
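Because aarch64_atomic_stub_t is an ordinary C function-pointer type, a stub can in principle be called like any function; the minimal sketch below (illustrative only, not part of the commit) shows the intended argument mapping, with the address in x0 and the operand in x1. The real callers instead go through atomic_fastcall() in atomic_linux_aarch64.hpp so that only the registers the stubs actually clobber need to be saved.

#include <cstdint>
#include "atomic_aarch64.hpp"

// Hypothetical helper, for illustration only: bump a 64-bit counter and
// return its previous value through the fetch_add_8 stub pointer.
static uint64_t example_fetch_add(volatile uint64_t *counter, uint64_t n) {
  // counter -> x0, n -> x1; arg2 is unused by the fetch_add stubs.
  return aarch64_atomic_fetch_add_8_impl(counter, n, /*arg2*/ 0);
}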

src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp

@@ -2567,6 +2567,8 @@ void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {
ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgl, swpl, ldxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchglw, swpl, ldxrw, stlxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp

@@ -1039,6 +1039,8 @@ public:
void atomic_xchg(Register prev, Register newv, Register addr);
void atomic_xchgw(Register prev, Register newv, Register addr);
void atomic_xchgl(Register prev, Register newv, Register addr);
void atomic_xchglw(Register prev, Register newv, Register addr);
void atomic_xchgal(Register prev, Register newv, Register addr);
void atomic_xchgalw(Register prev, Register newv, Register addr);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

@@ -26,6 +26,7 @@
#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "atomic_aarch64.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
@@ -38,6 +39,7 @@
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/atomic.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
@@ -1361,7 +1363,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// disjoint_int_copy_entry is set to the no-overlap entry point
@@ -1431,7 +1433,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
address *entry, const char *name,
@@ -1596,7 +1598,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// disjoint_int_copy_entry is set to the no-overlap entry point
@@ -1620,7 +1622,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
address *entry, const char *name,
@@ -5571,6 +5573,91 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
#ifdef LINUX
// ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
//
// If LSE is in use, generate LSE versions of all the stubs. The
// non-LSE versions are in atomic_aarch64.S.
void generate_atomic_entry_points() {
if (! UseLSE) {
return;
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "atomic entry points");
__ align(32);
aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, incr = c_rarg1;
__ atomic_addal(prev, incr, addr);
__ mov(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, incr = c_rarg1;
__ atomic_addalw(prev, incr, addr);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, newv = c_rarg1;
__ atomic_xchglw(prev, newv, addr);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, newv = c_rarg1;
__ atomic_xchgl(prev, newv, addr);
__ mov(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
exchange_val = c_rarg2;
__ cmpxchg(ptr, compare_val, exchange_val,
MacroAssembler::byte,
/*acquire*/false, /*release*/false, /*weak*/false,
prev);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
exchange_val = c_rarg2;
__ cmpxchg(ptr, compare_val, exchange_val,
MacroAssembler::word,
/*acquire*/false, /*release*/false, /*weak*/false,
prev);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
exchange_val = c_rarg2;
__ cmpxchg(ptr, compare_val, exchange_val,
MacroAssembler::xword,
/*acquire*/false, /*release*/false, /*weak*/false,
prev);
__ mov(r0, prev);
__ ret(lr);
}
}
#endif // LINUX
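With UseLSE set, the fetch_add_8 entry generated above is produced by MacroAssembler::atomic_addal, which should reduce to a single ldaddal plus the move and return. As a rough C++ analogue (an illustration, not code from this commit), the stub behaves like an acquire/release fetch-and-add that returns the previous value; the GCC builtin below compiles to ldaddal when targeting ARMv8.1-A or later.

#include <cstdint>

// Behavioural sketch of the LSE fetch_add_8 stub: an acquire/release
// fetch-and-add that returns the old value. The stronger ordering HotSpot
// needs (memory_order_conservative) is added by the caller in
// atomic_linux_aarch64.hpp with FULL_MEM_BARRIER after the call.
static inline uint64_t lse_fetch_add_8_sketch(volatile uint64_t *ptr,
                                              uint64_t incr) {
  return __atomic_fetch_add(ptr, incr, __ATOMIC_ACQ_REL);
}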
// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
@@ -6683,6 +6770,12 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
}
#ifdef LINUX
generate_atomic_entry_points();
#endif // LINUX
StubRoutines::aarch64::set_completed();
}
@@ -6703,3 +6796,27 @@ void StubGenerator_generate(CodeBuffer* code, bool all) {
}
StubGenerator g(code, all);
}
#ifdef LINUX
// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.
#define DEFAULT_ATOMIC_OP(OPNAME, SIZE) \
extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## _default_impl \
(volatile void *ptr, uint64_t arg1, uint64_t arg2); \
aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## _impl \
= aarch64_atomic_ ## OPNAME ## _ ## SIZE ## _default_impl;
DEFAULT_ATOMIC_OP(fetch_add, 4)
DEFAULT_ATOMIC_OP(fetch_add, 8)
DEFAULT_ATOMIC_OP(xchg, 4)
DEFAULT_ATOMIC_OP(xchg, 8)
DEFAULT_ATOMIC_OP(cmpxchg, 1)
DEFAULT_ATOMIC_OP(cmpxchg, 4)
DEFAULT_ATOMIC_OP(cmpxchg, 8)
#undef DEFAULT_ATOMIC_OP
#endif // LINUX
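Expanded for a single operation, DEFAULT_ATOMIC_OP(fetch_add, 4) above produces (whitespace adjusted) a declaration of the assembly fallback plus a pointer that generate_atomic_entry_points() later overwrites when UseLSE is enabled:

extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
  (volatile void *ptr, uint64_t arg1, uint64_t arg2);
aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
  = aarch64_atomic_fetch_add_4_default_impl;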

src/hotspot/os_cpu/linux_aarch64/atomic_linux_aarch64.S

@@ -0,0 +1,96 @@
// Copyright (c) 2021, Red Hat Inc. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
.text
.globl aarch64_atomic_fetch_add_8_default_impl
.align 5
aarch64_atomic_fetch_add_8_default_impl:
0: ldaxr x2, [x0]
add x8, x2, x1
stlxr w9, x8, [x0]
cbnz w9, 0b
mov x0, x2
ret
.globl aarch64_atomic_fetch_add_4_default_impl
.align 5
aarch64_atomic_fetch_add_4_default_impl:
0: ldaxr w2, [x0]
add w8, w2, w1
stlxr w9, w8, [x0]
cbnz w9, 0b
mov w0, w2
ret
.globl aarch64_atomic_xchg_4_default_impl
.align 5
aarch64_atomic_xchg_4_default_impl:
0: ldaxr w2, [x0]
stlxr w8, w1, [x0]
cbnz w8, 0b
mov w0, w2
ret
.globl aarch64_atomic_xchg_8_default_impl
.align 5
aarch64_atomic_xchg_8_default_impl:
0: ldaxr x2, [x0]
stlxr w8, x1, [x0]
cbnz w8, 0b
mov x0, x2
ret
.globl aarch64_atomic_cmpxchg_1_default_impl
.align 5
aarch64_atomic_cmpxchg_1_default_impl:
0: ldxrb w3, [x0]
eor w8, w3, w1
tst x8, #0xff
b.ne 1f
stxrb w8, w2, [x0]
cbnz w8, 0b
1: mov w0, w3
ret
.globl aarch64_atomic_cmpxchg_4_default_impl
.align 5
aarch64_atomic_cmpxchg_4_default_impl:
0: ldxr w3, [x0]
cmp w3, w1
b.ne 1f
stxr w8, w2, [x0]
cbnz w8, 0b
1: mov w0, w3
ret
.globl aarch64_atomic_cmpxchg_8_default_impl
.align 5
aarch64_atomic_cmpxchg_8_default_impl:
0: ldxr x3, [x0]
cmp x3, x1
b.ne 1f
stxr w8, x2, [x0]
cbnz w8, 0b
1: mov x0, x3
ret
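The cmpxchg defaults above are plain ldxr/stxr loops with no acquire or release semantics; any ordering is supplied by the C++ callers in atomic_linux_aarch64.hpp. As a behavioural sketch (illustration only, using the GCC builtin rather than the hand-written loop), aarch64_atomic_cmpxchg_8_default_impl acts like a strong, relaxed compare-exchange that always returns the value it observed in memory:

#include <cstdint>

// Sketch of the semantics of aarch64_atomic_cmpxchg_8_default_impl:
// returns the previous contents of *ptr; the store happens only if that
// value equalled compare_value.
static inline uint64_t cmpxchg_8_sketch(volatile uint64_t *ptr,
                                        uint64_t compare_value,
                                        uint64_t exchange_value) {
  uint64_t observed = compare_value;
  __atomic_compare_exchange(ptr, &observed, &exchange_value, /*weak*/ false,
                            __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  return observed;  // equals compare_value when the exchange succeeded
}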

src/hotspot/os_cpu/linux_aarch64/atomic_linux_aarch64.hpp

@@ -1,6 +1,6 @@
/*
* Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -26,58 +26,166 @@
#ifndef OS_CPU_LINUX_AARCH64_ATOMIC_LINUX_AARCH64_HPP
#define OS_CPU_LINUX_AARCH64_ATOMIC_LINUX_AARCH64_HPP
#include "atomic_aarch64.hpp"
#include "runtime/vm_version.hpp"
// Implementation of class atomic
// Note that memory_order_conservative requires a full barrier after atomic stores.
// See https://patchwork.kernel.org/patch/3575821/
// Call one of the stubs from C++. This uses the C calling convention,
// but this asm definition is used only in order to declare the
// registers we clobber. If we called the stubs via an ABI call we'd
// have to save X0 - X18 and most of the vectors.
//
// This really ought to be a template definition, but see GCC Bug
// 33661: template methods forget explicit local register asm
// vars. The problem is that register specifiers attached to local
// variables are ignored in any template function.
inline uint64_t bare_atomic_fastcall(address stub, volatile void *ptr, uint64_t arg1, uint64_t arg2 = 0) {
register uint64_t reg0 __asm__("x0") = (uint64_t)ptr;
register uint64_t reg1 __asm__("x1") = arg1;
register uint64_t reg2 __asm__("x2") = arg2;
register uint64_t reg3 __asm__("x3") = (uint64_t)stub;
register uint64_t result __asm__("x0");
asm volatile(// "stp x29, x30, [sp, #-16]!;"
" blr %1;"
// " ldp x29, x30, [sp], #16 // regs %0, %1, %2, %3, %4"
: "=r"(result), "+r"(reg3), "+r"(reg2)
: "r"(reg1), "0"(reg0) : "x8", "x9", "x30", "cc", "memory");
return result;
}
template <typename F, typename D, typename T1>
inline D atomic_fastcall(F stub, volatile D *dest, T1 arg1) {
return (D)bare_atomic_fastcall(CAST_FROM_FN_PTR(address, stub),
dest, (uint64_t)arg1);
}
template <typename F, typename D, typename T1, typename T2>
inline D atomic_fastcall(F stub, volatile D *dest, T1 arg1, T2 arg2) {
return (D)bare_atomic_fastcall(CAST_FROM_FN_PTR(address, stub),
dest, (uint64_t)arg1, (uint64_t)arg2);
}
template<size_t byte_size>
struct Atomic::PlatformAdd {
template<typename D, typename I>
D add_and_fetch(D volatile* dest, I add_value, atomic_memory_order order) const {
D res = __atomic_add_fetch(dest, add_value, __ATOMIC_RELEASE);
FULL_MEM_BARRIER;
return res;
}
D fetch_and_add(D volatile* dest, I add_value, atomic_memory_order order) const;
template<typename D, typename I>
D fetch_and_add(D volatile* dest, I add_value, atomic_memory_order order) const {
return add_and_fetch(dest, add_value, order) - add_value;
D add_and_fetch(D volatile* dest, I add_value, atomic_memory_order order) const {
D value = fetch_and_add(dest, add_value, order) + add_value;
return value;
}
};
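Reading only the added lines, the resulting PlatformAdd shim declares fetch_and_add as the primitive (implemented per size below via the stubs) and derives add_and_fetch from it; reassembled here for readability (a reconstruction, so whitespace may differ from the committed file):

template<size_t byte_size>
struct Atomic::PlatformAdd {
  template<typename D, typename I>
  D fetch_and_add(D volatile* dest, I add_value, atomic_memory_order order) const;

  template<typename D, typename I>
  D add_and_fetch(D volatile* dest, I add_value, atomic_memory_order order) const {
    D value = fetch_and_add(dest, add_value, order) + add_value;
    return value;
  }
};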
template<size_t byte_size>
template<typename T>
inline T Atomic::PlatformXchg<byte_size>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(byte_size == sizeof(T));
T res = __atomic_exchange_n(dest, exchange_value, __ATOMIC_RELEASE);
FULL_MEM_BARRIER;
return res;
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::fetch_and_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
D old_value
= atomic_fastcall(aarch64_atomic_fetch_add_4_impl, dest, add_value);
FULL_MEM_BARRIER;
return old_value;
}
// __attribute__((unused)) on dest is to get rid of spurious GCC warnings.
template<size_t byte_size>
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<8>::fetch_and_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(I));
STATIC_ASSERT(8 == sizeof(D));
D old_value
= atomic_fastcall(aarch64_atomic_fetch_add_8_impl, dest, add_value);
FULL_MEM_BARRIER;
return old_value;
}
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<byte_size>::operator()(T volatile* dest __attribute__((unused)),
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(byte_size == sizeof(T));
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
T old_value = atomic_fastcall(aarch64_atomic_xchg_4_impl, dest, exchange_value);
FULL_MEM_BARRIER;
return old_value;
}
template<>
template<typename T>
inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest, T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
T old_value = atomic_fastcall(aarch64_atomic_xchg_8_impl, dest, exchange_value);
FULL_MEM_BARRIER;
return old_value;
}
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(1 == sizeof(T));
aarch64_atomic_stub_t stub = aarch64_atomic_cmpxchg_1_impl;
if (order == memory_order_relaxed) {
T value = compare_value;
__atomic_compare_exchange(dest, &value, &exchange_value, /*weak*/false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
return value;
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
return old_value;
} else {
T value = compare_value;
FULL_MEM_BARRIER;
__atomic_compare_exchange(dest, &value, &exchange_value, /*weak*/false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
FULL_MEM_BARRIER;
return value;
return old_value;
}
}
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
aarch64_atomic_stub_t stub = aarch64_atomic_cmpxchg_4_impl;
if (order == memory_order_relaxed) {
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
return old_value;
} else {
FULL_MEM_BARRIER;
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
FULL_MEM_BARRIER;
return old_value;
}
}
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
aarch64_atomic_stub_t stub = aarch64_atomic_cmpxchg_8_impl;
if (order == memory_order_relaxed) {
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
return old_value;
} else {
FULL_MEM_BARRIER;
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
FULL_MEM_BARRIER;
return old_value;
}
}
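These size-keyed specializations are not called directly; shared code reaches them through the Atomic API, which dispatches on operand size and defaults to conservative ordering. A minimal usage sketch follows (names and call sites are illustrative, assuming the usual Atomic::add and Atomic::cmpxchg entry points):

#include "runtime/atomic.hpp"

static volatile uint64_t _counter = 0;
static volatile uint32_t _state   = 0;

void atomic_usage_sketch() {
  // Resolves to PlatformAdd<8>, i.e. the aarch64_atomic_fetch_add_8 stub on
  // Linux/AArch64, followed by FULL_MEM_BARRIER as shown above.
  Atomic::add(&_counter, (uint64_t)1);

  // Resolves to PlatformCmpxchg<4>; with the default conservative order the
  // stub call is wrapped in full barriers, with memory_order_relaxed it is not.
  uint32_t prev = Atomic::cmpxchg(&_state, 0u, 1u);
  (void)prev;
}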