8261027: AArch64: Support for LSE atomics C++ HotSpot code

Reviewed-by: adinn, simonis
Andrew Haley 2021-02-12 13:12:02 +00:00
parent 9ffabf30c3
commit 40ae9937a0
6 changed files with 407 additions and 36 deletions

src/hotspot/cpu/aarch64/atomic_aarch64.hpp

@@ -0,0 +1,46 @@
/* Copyright (c) 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef CPU_AARCH64_ATOMIC_AARCH64_HPP
#define CPU_AARCH64_ATOMIC_AARCH64_HPP
// Atomic stub implementation.
// Default implementations are in atomic_linux_aarch64.S
//
// All stubs pass arguments the same way
// x0: src/dest address
// x1: arg1
// x2: arg2 (optional)
// x3, x8, x9: scratch
typedef uint64_t (*aarch64_atomic_stub_t)(volatile void *ptr, uint64_t arg1, uint64_t arg2);
// Pointers to stubs
extern aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl;
extern aarch64_atomic_stub_t aarch64_atomic_fetch_add_8_impl;
extern aarch64_atomic_stub_t aarch64_atomic_xchg_4_impl;
extern aarch64_atomic_stub_t aarch64_atomic_xchg_8_impl;
extern aarch64_atomic_stub_t aarch64_atomic_cmpxchg_1_impl;
extern aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_impl;
extern aarch64_atomic_stub_t aarch64_atomic_cmpxchg_8_impl;
#endif // CPU_AARCH64_ATOMIC_AARCH64_HPP
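Because aarch64_atomic_stub_t is an ordinary C function-pointer type, a stub can in principle be called like any function; the minimal sketch below (illustrative only, not part of the commit) shows the intended argument mapping, with the address in x0 and the operand in x1. The real callers instead go through atomic_fastcall() in atomic_linux_aarch64.hpp so that only the registers the stubs actually clobber need to be saved.

#include <cstdint>
#include "atomic_aarch64.hpp"

// Hypothetical helper, for illustration only: bump a 64-bit counter and
// return its previous value through the fetch_add_8 stub pointer.
static uint64_t example_fetch_add(volatile uint64_t *counter, uint64_t n) {
  // counter -> x0, n -> x1; arg2 is unused by the fetch_add stubs.
  return aarch64_atomic_fetch_add_8_impl(counter, n, /*arg2*/ 0);
}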

src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp

@@ -2567,6 +2567,8 @@ void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {
ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgl, swpl, ldxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchglw, swpl, ldxrw, stlxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp

@@ -1039,6 +1039,8 @@ public:
void atomic_xchg(Register prev, Register newv, Register addr);
void atomic_xchgw(Register prev, Register newv, Register addr);
void atomic_xchgl(Register prev, Register newv, Register addr);
void atomic_xchglw(Register prev, Register newv, Register addr);
void atomic_xchgal(Register prev, Register newv, Register addr);
void atomic_xchgalw(Register prev, Register newv, Register addr);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

@@ -26,6 +26,7 @@
#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "atomic_aarch64.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
@@ -38,6 +39,7 @@
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/atomic.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
@@ -1361,7 +1363,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// disjoint_int_copy_entry is set to the no-overlap entry point
@@ -1431,7 +1433,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
address *entry, const char *name,
@@ -1596,7 +1598,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// disjoint_int_copy_entry is set to the no-overlap entry point
@@ -1620,7 +1622,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
address *entry, const char *name,
@@ -5571,6 +5573,91 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
#ifdef LINUX
// ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
//
// If LSE is in use, generate LSE versions of all the stubs. The
// non-LSE versions are in atomic_aarch64.S.
void generate_atomic_entry_points() {
if (! UseLSE) {
return;
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "atomic entry points");
__ align(32);
aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, incr = c_rarg1;
__ atomic_addal(prev, incr, addr);
__ mov(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, incr = c_rarg1;
__ atomic_addalw(prev, incr, addr);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, newv = c_rarg1;
__ atomic_xchglw(prev, newv, addr);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, newv = c_rarg1;
__ atomic_xchgl(prev, newv, addr);
__ mov(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
exchange_val = c_rarg2;
__ cmpxchg(ptr, compare_val, exchange_val,
MacroAssembler::byte,
/*acquire*/false, /*release*/false, /*weak*/false,
prev);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
exchange_val = c_rarg2;
__ cmpxchg(ptr, compare_val, exchange_val,
MacroAssembler::word,
/*acquire*/false, /*release*/false, /*weak*/false,
prev);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
exchange_val = c_rarg2;
__ cmpxchg(ptr, compare_val, exchange_val,
MacroAssembler::xword,
/*acquire*/false, /*release*/false, /*weak*/false,
prev);
__ mov(r0, prev);
__ ret(lr);
}
}
#endif // LINUX
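With UseLSE set, the fetch_add_8 entry generated above is produced by MacroAssembler::atomic_addal, which should reduce to a single ldaddal plus the move and return. As a rough C++ analogue (an illustration, not code from this commit), the stub behaves like an acquire/release fetch-and-add that returns the previous value; the GCC builtin below compiles to ldaddal when targeting ARMv8.1-A or later.

#include <cstdint>

// Behavioural sketch of the LSE fetch_add_8 stub: an acquire/release
// fetch-and-add that returns the old value. The stronger ordering HotSpot
// needs (memory_order_conservative) is added by the caller in
// atomic_linux_aarch64.hpp with FULL_MEM_BARRIER after the call.
static inline uint64_t lse_fetch_add_8_sketch(volatile uint64_t *ptr,
                                              uint64_t incr) {
  return __atomic_fetch_add(ptr, incr, __ATOMIC_ACQ_REL);
}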
// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
@@ -6683,6 +6770,12 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
}
#ifdef LINUX
generate_atomic_entry_points();
#endif // LINUX
StubRoutines::aarch64::set_completed();
}
@@ -6703,3 +6796,27 @@ void StubGenerator_generate(CodeBuffer* code, bool all) {
}
StubGenerator g(code, all);
}
#ifdef LINUX
// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.
#define DEFAULT_ATOMIC_OP(OPNAME, SIZE) \
extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## _default_impl \
(volatile void *ptr, uint64_t arg1, uint64_t arg2); \
aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## _impl \
= aarch64_atomic_ ## OPNAME ## _ ## SIZE ## _default_impl;
DEFAULT_ATOMIC_OP(fetch_add, 4)
DEFAULT_ATOMIC_OP(fetch_add, 8)
DEFAULT_ATOMIC_OP(xchg, 4)
DEFAULT_ATOMIC_OP(xchg, 8)
DEFAULT_ATOMIC_OP(cmpxchg, 1)
DEFAULT_ATOMIC_OP(cmpxchg, 4)
DEFAULT_ATOMIC_OP(cmpxchg, 8)
#undef DEFAULT_ATOMIC_OP
#endif // LINUX
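Expanded for a single operation, DEFAULT_ATOMIC_OP(fetch_add, 4) above produces (whitespace adjusted) a declaration of the assembly fallback plus a pointer that generate_atomic_entry_points() later overwrites when UseLSE is enabled:

extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
  (volatile void *ptr, uint64_t arg1, uint64_t arg2);
aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
  = aarch64_atomic_fetch_add_4_default_impl;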

src/hotspot/os_cpu/linux_aarch64/atomic_linux_aarch64.S

@@ -0,0 +1,96 @@
// Copyright (c) 2021, Red Hat Inc. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
.text
.globl aarch64_atomic_fetch_add_8_default_impl
.align 5
aarch64_atomic_fetch_add_8_default_impl:
0: ldaxr x2, [x0]
add x8, x2, x1
stlxr w9, x8, [x0]
cbnz w9, 0b
mov x0, x2
ret
.globl aarch64_atomic_fetch_add_4_default_impl
.align 5
aarch64_atomic_fetch_add_4_default_impl:
0: ldaxr w2, [x0]
add w8, w2, w1
stlxr w9, w8, [x0]
cbnz w9, 0b
mov w0, w2
ret
.globl aarch64_atomic_xchg_4_default_impl
.align 5
aarch64_atomic_xchg_4_default_impl:
0: ldaxr w2, [x0]
stlxr w8, w1, [x0]
cbnz w8, 0b
mov w0, w2
ret
.globl aarch64_atomic_xchg_8_default_impl
.align 5
aarch64_atomic_xchg_8_default_impl:
0: ldaxr x2, [x0]
stlxr w8, x1, [x0]
cbnz w8, 0b
mov x0, x2
ret
.globl aarch64_atomic_cmpxchg_1_default_impl
.align 5
aarch64_atomic_cmpxchg_1_default_impl:
0: ldxrb w3, [x0]
eor w8, w3, w1
tst x8, #0xff
b.ne 1f
stxrb w8, w2, [x0]
cbnz w8, 0b
1: mov w0, w3
ret
.globl aarch64_atomic_cmpxchg_4_default_impl
.align 5
aarch64_atomic_cmpxchg_4_default_impl:
0: ldxr w3, [x0]
cmp w3, w1
b.ne 1f
stxr w8, w2, [x0]
cbnz w8, 0b
1: mov w0, w3
ret
.globl aarch64_atomic_cmpxchg_8_default_impl
.align 5
aarch64_atomic_cmpxchg_8_default_impl:
0: ldxr x3, [x0]
cmp x3, x1
b.ne 1f
stxr w8, x2, [x0]
cbnz w8, 0b
1: mov x0, x3
ret
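The cmpxchg defaults above are plain ldxr/stxr loops with no acquire or release semantics; any ordering is supplied by the C++ callers in atomic_linux_aarch64.hpp. As a behavioural sketch (illustration only, using the GCC builtin rather than the hand-written loop), aarch64_atomic_cmpxchg_8_default_impl acts like a strong, relaxed compare-exchange that always returns the value it observed in memory:

#include <cstdint>

// Sketch of the semantics of aarch64_atomic_cmpxchg_8_default_impl:
// returns the previous contents of *ptr; the store happens only if that
// value equalled compare_value.
static inline uint64_t cmpxchg_8_sketch(volatile uint64_t *ptr,
                                        uint64_t compare_value,
                                        uint64_t exchange_value) {
  uint64_t observed = compare_value;
  __atomic_compare_exchange(ptr, &observed, &exchange_value, /*weak*/ false,
                            __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  return observed;  // equals compare_value when the exchange succeeded
}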

src/hotspot/os_cpu/linux_aarch64/atomic_linux_aarch64.hpp

@@ -1,6 +1,6 @@
/*
* Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -26,58 +26,166 @@
#ifndef OS_CPU_LINUX_AARCH64_ATOMIC_LINUX_AARCH64_HPP
#define OS_CPU_LINUX_AARCH64_ATOMIC_LINUX_AARCH64_HPP
#include "atomic_aarch64.hpp"
#include "runtime/vm_version.hpp"
// Implementation of class atomic
// Note that memory_order_conservative requires a full barrier after atomic stores.
// See https://patchwork.kernel.org/patch/3575821/
// Call one of the stubs from C++. This uses the C calling convention,
// but this asm definition is used only in order to declare the
// registers we clobber. If we called the stubs via an ABI call we'd
// have to save X0 - X18 and most of the vectors.
//
// This really ought to be a template definition, but see GCC Bug
// 33661: template methods forget explicit local register asm
// vars. The problem is that register specifiers attached to local
// variables are ignored in any template function.
inline uint64_t bare_atomic_fastcall(address stub, volatile void *ptr, uint64_t arg1, uint64_t arg2 = 0) {
register uint64_t reg0 __asm__("x0") = (uint64_t)ptr;
register uint64_t reg1 __asm__("x1") = arg1;
register uint64_t reg2 __asm__("x2") = arg2;
register uint64_t reg3 __asm__("x3") = (uint64_t)stub;
register uint64_t result __asm__("x0");
asm volatile(// "stp x29, x30, [sp, #-16]!;"
" blr %1;"
// " ldp x29, x30, [sp], #16 // regs %0, %1, %2, %3, %4"
: "=r"(result), "+r"(reg3), "+r"(reg2)
: "r"(reg1), "0"(reg0) : "x8", "x9", "x30", "cc", "memory");
return result;
}
template <typename F, typename D, typename T1>
inline D atomic_fastcall(F stub, volatile D *dest, T1 arg1) {
return (D)bare_atomic_fastcall(CAST_FROM_FN_PTR(address, stub),
dest, (uint64_t)arg1);
}
template <typename F, typename D, typename T1, typename T2>
inline D atomic_fastcall(F stub, volatile D *dest, T1 arg1, T2 arg2) {
return (D)bare_atomic_fastcall(CAST_FROM_FN_PTR(address, stub),
dest, (uint64_t)arg1, (uint64_t)arg2);
}
template<size_t byte_size>
struct Atomic::PlatformAdd {
template<typename D, typename I>
D add_and_fetch(D volatile* dest, I add_value, atomic_memory_order order) const {
D res = __atomic_add_fetch(dest, add_value, __ATOMIC_RELEASE);
FULL_MEM_BARRIER;
return res;
}
D fetch_and_add(D volatile* dest, I add_value, atomic_memory_order order) const;
template<typename D, typename I>
D fetch_and_add(D volatile* dest, I add_value, atomic_memory_order order) const {
return add_and_fetch(dest, add_value, order) - add_value;
D add_and_fetch(D volatile* dest, I add_value, atomic_memory_order order) const {
D value = fetch_and_add(dest, add_value, order) + add_value;
return value;
}
};
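Reading only the added lines, the resulting PlatformAdd shim declares fetch_and_add as the primitive (implemented per size below via the stubs) and derives add_and_fetch from it; reassembled here for readability (a reconstruction, so whitespace may differ from the committed file):

template<size_t byte_size>
struct Atomic::PlatformAdd {
  template<typename D, typename I>
  D fetch_and_add(D volatile* dest, I add_value, atomic_memory_order order) const;

  template<typename D, typename I>
  D add_and_fetch(D volatile* dest, I add_value, atomic_memory_order order) const {
    D value = fetch_and_add(dest, add_value, order) + add_value;
    return value;
  }
};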
template<size_t byte_size>
template<typename T>
inline T Atomic::PlatformXchg<byte_size>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(byte_size == sizeof(T));
T res = __atomic_exchange_n(dest, exchange_value, __ATOMIC_RELEASE);
FULL_MEM_BARRIER;
return res;
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::fetch_and_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
D old_value
= atomic_fastcall(aarch64_atomic_fetch_add_4_impl, dest, add_value);
FULL_MEM_BARRIER;
return old_value;
}
// __attribute__((unused)) on dest is to get rid of spurious GCC warnings.
template<size_t byte_size>
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<8>::fetch_and_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(I));
STATIC_ASSERT(8 == sizeof(D));
D old_value
= atomic_fastcall(aarch64_atomic_fetch_add_8_impl, dest, add_value);
FULL_MEM_BARRIER;
return old_value;
}
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<byte_size>::operator()(T volatile* dest __attribute__((unused)),
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(byte_size == sizeof(T));
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
T old_value = atomic_fastcall(aarch64_atomic_xchg_4_impl, dest, exchange_value);
FULL_MEM_BARRIER;
return old_value;
}
template<>
template<typename T>
inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest, T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
T old_value = atomic_fastcall(aarch64_atomic_xchg_8_impl, dest, exchange_value);
FULL_MEM_BARRIER;
return old_value;
}
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(1 == sizeof(T));
aarch64_atomic_stub_t stub = aarch64_atomic_cmpxchg_1_impl;
if (order == memory_order_relaxed) {
T value = compare_value;
__atomic_compare_exchange(dest, &value, &exchange_value, /*weak*/false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
return value;
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
return old_value;
} else {
T value = compare_value;
FULL_MEM_BARRIER;
__atomic_compare_exchange(dest, &value, &exchange_value, /*weak*/false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
FULL_MEM_BARRIER;
return value;
return old_value;
}
}
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
aarch64_atomic_stub_t stub = aarch64_atomic_cmpxchg_4_impl;
if (order == memory_order_relaxed) {
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
return old_value;
} else {
FULL_MEM_BARRIER;
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
FULL_MEM_BARRIER;
return old_value;
}
}
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
aarch64_atomic_stub_t stub = aarch64_atomic_cmpxchg_8_impl;
if (order == memory_order_relaxed) {
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
return old_value;
} else {
FULL_MEM_BARRIER;
T old_value = atomic_fastcall(stub, dest,
compare_value, exchange_value);
FULL_MEM_BARRIER;
return old_value;
}
}
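These size-keyed specializations are not called directly; shared code reaches them through the Atomic API, which dispatches on operand size and defaults to conservative ordering. A minimal usage sketch follows (names and call sites are illustrative, assuming the usual Atomic::add and Atomic::cmpxchg entry points):

#include "runtime/atomic.hpp"

static volatile uint64_t _counter = 0;
static volatile uint32_t _state   = 0;

void atomic_usage_sketch() {
  // Resolves to PlatformAdd<8>, i.e. the aarch64_atomic_fetch_add_8 stub on
  // Linux/AArch64, followed by FULL_MEM_BARRIER as shown above.
  Atomic::add(&_counter, (uint64_t)1);

  // Resolves to PlatformCmpxchg<4>; with the default conservative order the
  // stub call is wrapped in full barriers, with memory_order_relaxed it is not.
  uint32_t prev = Atomic::cmpxchg(&_state, 0u, 1u);
  (void)prev;
}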