8282204: Use lea instructions for arithmetic operations on x86_64
Reviewed-by: jiefu, sviswanathan, thartmann
This commit is contained in:
parent
026b85303c
commit
d8c55725e0
@ -2410,10 +2410,7 @@ void Assembler::ldmxcsr( Address src) {
|
||||
|
||||
// Emits a LEA (opcode 0x8D) that loads the effective address described by
// `src` into `dst`. Judging by the name this is the 32-bit ("l") form --
// NOTE(review): confirm against the leaq counterpart.
void Assembler::leal(Register dst, Address src) {
|
||||
// Scoped marker covering the bytes emitted below.
InstructionMark im(this);
|
||||
// 64-bit builds only: an address-size override byte followed by whatever
// prefix bytes prefix() decides the operand pair needs.
#ifdef _LP64
|
||||
// 0x67 is the x86 address-size override prefix.
emit_int8(0x67); // addr32
|
||||
prefix(src, dst);
|
||||
#endif // LP64
|
||||
// LEA opcode byte.
emit_int8((unsigned char)0x8D);
|
||||
// Operand bytes encoding dst together with the memory operand src.
emit_operand(dst, src);
|
||||
}
|
||||
|
@ -1044,6 +1044,25 @@ public:
|
||||
static bool supports_clflushopt() { return ((_features & CPU_FLUSHOPT) != 0); }
|
||||
static bool supports_clwb() { return ((_features & CPU_CLWB) != 0); }
|
||||
|
||||
// Old CPUs perform lea on AGU which causes additional latency transferring the
|
||||
// value from/to ALU for other operations
|
||||
static bool supports_fast_2op_lea() {
|
||||
return (is_intel() && supports_avx()) || // Sandy Bridge and above
|
||||
(is_amd() && supports_avx()); // Jaguar and Bulldozer and above
|
||||
}
|
||||
|
||||
// Pre Icelake Intels suffer inefficiency regarding 3-operand lea, which contains
|
||||
// all of base register, index register and displacement immediate, with 3 latency.
|
||||
// Note that when the address contains no displacement but the base register is
|
||||
// rbp or r13, the machine code must contain a zero displacement immediate,
|
||||
// effectively transforming a 2-operand lea into a 3-operand lea. This can be
|
||||
// replaced by add-add or lea-add
|
||||
static bool supports_fast_3op_lea() {
|
||||
return supports_fast_2op_lea() &&
|
||||
((is_intel() && supports_clwb() && !is_intel_skylake()) || // Icelake and above
|
||||
is_amd());
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
// Is the CPU running emulated (for example macOS Rosetta running x86_64 code on M1 ARM (aarch64))
|
||||
static bool is_cpu_emulated();
|
||||
|
@ -1,5 +1,5 @@
|
||||
//
|
||||
// Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
// Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
//
|
||||
// This code is free software; you can redistribute it and/or modify it
|
||||
@ -241,6 +241,11 @@ reg_class long_no_rcx_reg %{
|
||||
return _LONG_NO_RCX_REG_mask;
|
||||
%}
|
||||
|
||||
// Class for all long registers (excluding RBP and R13)
|
||||
// The member set is not listed here; it is whatever _LONG_NO_RBP_R13_REG_mask
// holds, which reg_mask_init() derives from _LONG_REG_mask by removing both
// halves of rbp and r13.
reg_class long_no_rbp_r13_reg %{
|
||||
return _LONG_NO_RBP_R13_REG_mask;
|
||||
%}
|
||||
|
||||
// Class for all int registers (excluding RSP)
|
||||
reg_class int_reg %{
|
||||
return _INT_REG_mask;
|
||||
@ -256,6 +261,11 @@ reg_class int_no_rcx_reg %{
|
||||
return _INT_NO_RCX_REG_mask;
|
||||
%}
|
||||
|
||||
// Class for all int registers (excluding RBP and R13)
|
||||
// The member set is not listed here; it is whatever _INT_NO_RBP_R13_REG_mask
// holds, which reg_mask_init() derives from _INT_REG_mask by removing rbp
// and r13.
reg_class int_no_rbp_r13_reg %{
|
||||
return _INT_NO_RBP_R13_REG_mask;
|
||||
%}
|
||||
|
||||
// Singleton class for RAX pointer register
|
||||
reg_class ptr_rax_reg(RAX, RAX_H);
|
||||
|
||||
@ -319,9 +329,11 @@ extern RegMask _PTR_NO_RAX_RBX_REG_mask;
|
||||
extern RegMask _LONG_REG_mask;
|
||||
extern RegMask _LONG_NO_RAX_RDX_REG_mask;
|
||||
extern RegMask _LONG_NO_RCX_REG_mask;
|
||||
extern RegMask _LONG_NO_RBP_R13_REG_mask;
|
||||
extern RegMask _INT_REG_mask;
|
||||
extern RegMask _INT_NO_RAX_RDX_REG_mask;
|
||||
extern RegMask _INT_NO_RCX_REG_mask;
|
||||
extern RegMask _INT_NO_RBP_R13_REG_mask;
|
||||
extern RegMask _FLOAT_REG_mask;
|
||||
|
||||
extern RegMask _STACK_OR_PTR_REG_mask;
|
||||
@ -348,9 +360,11 @@ RegMask _PTR_NO_RAX_RBX_REG_mask;
|
||||
RegMask _LONG_REG_mask;
|
||||
RegMask _LONG_NO_RAX_RDX_REG_mask;
|
||||
RegMask _LONG_NO_RCX_REG_mask;
|
||||
RegMask _LONG_NO_RBP_R13_REG_mask;
|
||||
RegMask _INT_REG_mask;
|
||||
RegMask _INT_NO_RAX_RDX_REG_mask;
|
||||
RegMask _INT_NO_RCX_REG_mask;
|
||||
RegMask _INT_NO_RBP_R13_REG_mask;
|
||||
RegMask _FLOAT_REG_mask;
|
||||
RegMask _STACK_OR_PTR_REG_mask;
|
||||
RegMask _STACK_OR_LONG_REG_mask;
|
||||
@ -409,6 +423,12 @@ void reg_mask_init() {
|
||||
_LONG_NO_RCX_REG_mask.Remove(OptoReg::as_OptoReg(rcx->as_VMReg()));
|
||||
_LONG_NO_RCX_REG_mask.Remove(OptoReg::as_OptoReg(rcx->as_VMReg()->next()));
|
||||
|
||||
_LONG_NO_RBP_R13_REG_mask = _LONG_REG_mask;
|
||||
_LONG_NO_RBP_R13_REG_mask.Remove(OptoReg::as_OptoReg(rbp->as_VMReg()));
|
||||
_LONG_NO_RBP_R13_REG_mask.Remove(OptoReg::as_OptoReg(rbp->as_VMReg()->next()));
|
||||
_LONG_NO_RBP_R13_REG_mask.Remove(OptoReg::as_OptoReg(r13->as_VMReg()));
|
||||
_LONG_NO_RBP_R13_REG_mask.Remove(OptoReg::as_OptoReg(r13->as_VMReg()->next()));
|
||||
|
||||
_INT_REG_mask = _ALL_INT_REG_mask;
|
||||
if (PreserveFramePointer) {
|
||||
_INT_REG_mask.Remove(OptoReg::as_OptoReg(rbp->as_VMReg()));
|
||||
@ -427,6 +447,10 @@ void reg_mask_init() {
|
||||
_INT_NO_RCX_REG_mask = _INT_REG_mask;
|
||||
_INT_NO_RCX_REG_mask.Remove(OptoReg::as_OptoReg(rcx->as_VMReg()));
|
||||
|
||||
_INT_NO_RBP_R13_REG_mask = _INT_REG_mask;
|
||||
_INT_NO_RBP_R13_REG_mask.Remove(OptoReg::as_OptoReg(rbp->as_VMReg()));
|
||||
_INT_NO_RBP_R13_REG_mask.Remove(OptoReg::as_OptoReg(r13->as_VMReg()));
|
||||
|
||||
// _FLOAT_REG_LEGACY_mask/_FLOAT_REG_EVEX_mask is generated by adlc
|
||||
// from the float_reg_legacy/float_reg_evex register class.
|
||||
_FLOAT_REG_mask = VM_Version::supports_evex() ? _FLOAT_REG_EVEX_mask : _FLOAT_REG_LEGACY_mask;
|
||||
@ -3491,6 +3515,21 @@ operand no_rax_rdx_RegI()
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
// Int register operand that is allocated from int_no_rbp_r13_reg, i.e. never
// rbp or r13. Used as the base of the displacement-free leal form, where an
// rbp/r13 base would force a zero displacement into the encoding.
operand no_rbp_r13_RegI()
|
||||
%{
|
||||
// Register allocation is confined to this class.
constraint(ALLOC_IN_RC(int_no_rbp_r13_reg));
|
||||
// Operand kinds accepted in this position.
match(RegI);
|
||||
match(rRegI);
|
||||
match(rax_RegI);
|
||||
match(rbx_RegI);
|
||||
match(rcx_RegI);
|
||||
match(rdx_RegI);
|
||||
match(rdi_RegI);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
// Pointer Register
|
||||
operand any_RegP()
|
||||
%{
|
||||
@ -3718,6 +3757,19 @@ operand rdx_RegL()
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
// Long register operand that is allocated from long_no_rbp_r13_reg, i.e.
// never rbp or r13. Used as the base of the displacement-free leaq form, where
// an rbp/r13 base would force a zero displacement into the encoding.
operand no_rbp_r13_RegL()
|
||||
%{
|
||||
// Register allocation is confined to this class.
constraint(ALLOC_IN_RC(long_no_rbp_r13_reg));
|
||||
// Operand kinds accepted in this position.
match(RegL);
|
||||
match(rRegL);
|
||||
match(rax_RegL);
|
||||
match(rcx_RegL);
|
||||
match(rdx_RegL);
|
||||
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
// Flags register, used as output of compare instructions
|
||||
operand rFlagsReg()
|
||||
%{
|
||||
@ -7443,14 +7495,53 @@ instruct decI_mem(memory dst, immI_M1 src, rFlagsReg cr)
|
||||
ins_pipe(ialu_mem_imm);
|
||||
%}
|
||||
|
||||
instruct leaI_rReg_immI(rRegI dst, rRegI src0, immI src1)
|
||||
instruct leaI_rReg_immI2_immI(rRegI dst, rRegI index, immI2 scale, immI disp)
|
||||
%{
|
||||
match(Set dst (AddI src0 src1));
|
||||
predicate(VM_Version::supports_fast_2op_lea());
|
||||
match(Set dst (AddI (LShiftI index scale) disp));
|
||||
|
||||
ins_cost(110);
|
||||
format %{ "addr32 leal $dst, [$src0 + $src1]\t# int" %}
|
||||
format %{ "leal $dst, [$index << $scale + $disp]\t# int" %}
|
||||
ins_encode %{
|
||||
__ leal($dst$$Register, Address($src0$$Register, $src1$$constant));
|
||||
Address::ScaleFactor scale = static_cast<Address::ScaleFactor>($scale$$constant);
|
||||
__ leal($dst$$Register, Address(noreg, $index$$Register, scale, $disp$$constant));
|
||||
%}
|
||||
ins_pipe(ialu_reg_reg);
|
||||
%}
|
||||
|
||||
// int: dst = base + index + disp, folded into a single leal.
// Only selected when VM_Version::supports_fast_3op_lea() holds.
instruct leaI_rReg_rReg_immI(rRegI dst, rRegI base, rRegI index, immI disp)
%{
  predicate(VM_Version::supports_fast_3op_lea());
  match(Set dst (AddI (AddI base index) disp));

  format %{ "leal $dst, [$base + $index + $disp]\t# int" %}
  ins_encode %{
    // Unscaled index: [base + index*1 + disp]
    Address ea($base$$Register, $index$$Register, Address::times_1, $disp$$constant);
    __ leal($dst$$Register, ea);
  %}
  ins_pipe(ialu_reg_reg);
%}
|
||||
|
||||
// int: dst = base + (index << scale), folded into a single leal.
// The base is taken from no_rbp_r13_RegI so that no displacement has to be
// encoded. Only selected when VM_Version::supports_fast_2op_lea() holds.
instruct leaI_rReg_rReg_immI2(rRegI dst, no_rbp_r13_RegI base, rRegI index, immI2 scale)
%{
  predicate(VM_Version::supports_fast_2op_lea());
  match(Set dst (AddI base (LShiftI index scale)));

  format %{ "leal $dst, [$base + $index << $scale]\t# int" %}
  ins_encode %{
    // The shift amount doubles as the addressing-mode scale factor.
    Address ea($base$$Register, $index$$Register,
               static_cast<Address::ScaleFactor>($scale$$constant));
    __ leal($dst$$Register, ea);
  %}
  ins_pipe(ialu_reg_reg);
%}
|
||||
|
||||
// int: dst = base + (index << scale) + disp, folded into a single leal.
// Only selected when VM_Version::supports_fast_3op_lea() holds.
instruct leaI_rReg_rReg_immI2_immI(rRegI dst, rRegI base, rRegI index, immI2 scale, immI disp)
%{
  predicate(VM_Version::supports_fast_3op_lea());
  match(Set dst (AddI (AddI base (LShiftI index scale)) disp));

  format %{ "leal $dst, [$base + $index << $scale + $disp]\t# int" %}
  ins_encode %{
    // The shift amount doubles as the addressing-mode scale factor.
    Address ea($base$$Register, $index$$Register,
               static_cast<Address::ScaleFactor>($scale$$constant),
               $disp$$constant);
    __ leal($dst$$Register, ea);
  %}
  ins_pipe(ialu_reg_reg);
%}
|
||||
@ -7574,14 +7665,53 @@ instruct decL_mem(memory dst, immL_M1 src, rFlagsReg cr)
|
||||
ins_pipe(ialu_mem_imm);
|
||||
%}
|
||||
|
||||
instruct leaL_rReg_immL(rRegL dst, rRegL src0, immL32 src1)
|
||||
instruct leaL_rReg_immI2_immL32(rRegL dst, rRegL index, immI2 scale, immL32 disp)
|
||||
%{
|
||||
match(Set dst (AddL src0 src1));
|
||||
predicate(VM_Version::supports_fast_2op_lea());
|
||||
match(Set dst (AddL (LShiftL index scale) disp));
|
||||
|
||||
ins_cost(110);
|
||||
format %{ "leaq $dst, [$src0 + $src1]\t# long" %}
|
||||
format %{ "leaq $dst, [$index << $scale + $disp]\t# long" %}
|
||||
ins_encode %{
|
||||
__ leaq($dst$$Register, Address($src0$$Register, $src1$$constant));
|
||||
Address::ScaleFactor scale = static_cast<Address::ScaleFactor>($scale$$constant);
|
||||
__ leaq($dst$$Register, Address(noreg, $index$$Register, scale, $disp$$constant));
|
||||
%}
|
||||
ins_pipe(ialu_reg_reg);
|
||||
%}
|
||||
|
||||
// long: dst = base + index + disp, folded into a single leaq.
// Only selected when VM_Version::supports_fast_3op_lea() holds.
instruct leaL_rReg_rReg_immL32(rRegL dst, rRegL base, rRegL index, immL32 disp)
%{
  predicate(VM_Version::supports_fast_3op_lea());
  match(Set dst (AddL (AddL base index) disp));

  format %{ "leaq $dst, [$base + $index + $disp]\t# long" %}
  ins_encode %{
    // Unscaled index: [base + index*1 + disp]
    Address ea($base$$Register, $index$$Register, Address::times_1, $disp$$constant);
    __ leaq($dst$$Register, ea);
  %}
  ins_pipe(ialu_reg_reg);
%}
|
||||
|
||||
// long: dst = base + (index << scale), folded into a single leaq.
// The base is taken from no_rbp_r13_RegL so that no displacement has to be
// encoded. Only selected when VM_Version::supports_fast_2op_lea() holds.
instruct leaL_rReg_rReg_immI2(rRegL dst, no_rbp_r13_RegL base, rRegL index, immI2 scale)
%{
  predicate(VM_Version::supports_fast_2op_lea());
  match(Set dst (AddL base (LShiftL index scale)));

  format %{ "leaq $dst, [$base + $index << $scale]\t# long" %}
  ins_encode %{
    // The shift amount doubles as the addressing-mode scale factor.
    Address ea($base$$Register, $index$$Register,
               static_cast<Address::ScaleFactor>($scale$$constant));
    __ leaq($dst$$Register, ea);
  %}
  ins_pipe(ialu_reg_reg);
%}
|
||||
|
||||
// long: dst = base + (index << scale) + disp, folded into a single leaq.
// Only selected when VM_Version::supports_fast_3op_lea() holds.
instruct leaL_rReg_rReg_immI2_immL32(rRegL dst, rRegL base, rRegL index, immI2 scale, immL32 disp)
%{
  predicate(VM_Version::supports_fast_3op_lea());
  match(Set dst (AddL (AddL base (LShiftL index scale)) disp));

  format %{ "leaq $dst, [$base + $index << $scale + $disp]\t# long" %}
  ins_encode %{
    // The shift amount doubles as the addressing-mode scale factor.
    Address ea($base$$Register, $index$$Register,
               static_cast<Address::ScaleFactor>($scale$$constant),
               $disp$$constant);
    __ leaq($dst$$Register, ea);
  %}
  ins_pipe(ialu_reg_reg);
%}
|
||||
@ -7612,18 +7742,6 @@ instruct addP_rReg_imm(rRegP dst, immL32 src, rFlagsReg cr)
|
||||
|
||||
// XXX addP mem ops ????
|
||||
|
||||
// ptr: dst = src0 + src1 (32-bit immediate) via leaq, so the sum can land in
// a register other than src0.
instruct leaP_rReg_imm(rRegP dst, rRegP src0, immL32 src1)
%{
  match(Set dst (AddP src0 src1));

  ins_cost(110);
  format %{ "leaq $dst, [$src0 + $src1]\t# ptr" %}
  ins_encode %{
    // Base plus displacement, no index register.
    Address ea($src0$$Register, $src1$$constant);
    __ leaq($dst$$Register, ea);
  %}
  ins_pipe(ialu_reg_reg);
%}
|
||||
|
||||
instruct checkCastPP(rRegP dst)
|
||||
%{
|
||||
match(Set dst (CheckCastPP dst));
|
||||
|
122
test/micro/org/openjdk/bench/vm/compiler/LeaInstruction.java
Normal file
122
test/micro/org/openjdk/bench/vm/compiler/LeaInstruction.java
Normal file
@ -0,0 +1,122 @@
|
||||
/*
|
||||
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
package org.openjdk.bench.vm.compiler;
|
||||
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
import org.openjdk.jmh.infra.Blackhole;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
 * JMH microbenchmarks built around loop-carried integer arithmetic of the
 * shapes base + index * scale + displacement, in int and long variants.
 *
 * Method names spell out which terms are present: B = base, I = index,
 * IS = index times a power-of-two scale, D = constant displacement.
 * Each benchmark runs ITERATION dependent updates and hands the final
 * values to the Blackhole so the computation cannot be discarded.
 *
 * Results are reported as average time in nanoseconds, from a single fork
 * started with -XX:LoopUnrollLimit=1 (presumably to keep the loop bodies
 * from being unrolled -- confirm against the flag's documentation).
 */
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||
@Fork(value = 1, jvmArgsAppend = {"-XX:LoopUnrollLimit=1"})
|
||||
@State(Scope.Thread)
|
||||
public class LeaInstruction {
|
||||
// Number of dependent updates performed per benchmark invocation.
static final int ITERATION = 1000;
|
||||
|
||||
// Seed values read from state; never assigned in this class, so they keep
// the default value 0.
int x, y;
|
||||
|
||||
/** int: x = x * 4 + 10 (scaled index plus displacement). */
@Benchmark
|
||||
public void IS_D_int(Blackhole bh) {
|
||||
int x = this.x;
|
||||
for (int i = 0; i < ITERATION; i++) {
|
||||
x = x * 4 + 10;
|
||||
}
|
||||
bh.consume(x);
|
||||
}
|
||||
|
||||
/** int: base + index + displacement, with x and y feeding each other. */
@Benchmark
|
||||
public void B_I_D_int(Blackhole bh) {
|
||||
int x = this.x, y = this.y;
|
||||
for (int i = 0; i < ITERATION; i++) {
|
||||
x = x + y + 10;
|
||||
y = x + y + 20;
|
||||
}
|
||||
bh.consume(x);
|
||||
bh.consume(y);
|
||||
}
|
||||
|
||||
/** int: base + scaled index (scales 4 and 8), no displacement. */
@Benchmark
|
||||
public void B_IS_int(Blackhole bh) {
|
||||
int x = this.x, y = this.y;
|
||||
for (int i = 0; i < ITERATION; i++) {
|
||||
x = x + y * 4;
|
||||
y = x + y * 8;
|
||||
}
|
||||
bh.consume(x);
|
||||
bh.consume(y);
|
||||
}
|
||||
|
||||
/** int: base + scaled index (scales 4 and 8) + displacement. */
@Benchmark
|
||||
public void B_IS_D_int(Blackhole bh) {
|
||||
int x = this.x, y = this.y;
|
||||
for (int i = 0; i < ITERATION; i++) {
|
||||
x = x + y * 4 + 10;
|
||||
y = x + y * 8 + 20;
|
||||
}
|
||||
bh.consume(x);
|
||||
bh.consume(y);
|
||||
}
|
||||
|
||||
/** long: x = x * 4 + 10; the int field is widened to long before the loop. */
@Benchmark
|
||||
public void IS_D_long(Blackhole bh) {
|
||||
long x = this.x;
|
||||
for (int i = 0; i < ITERATION; i++) {
|
||||
x = x * 4 + 10;
|
||||
}
|
||||
bh.consume(x);
|
||||
}
|
||||
|
||||
/** long: base + index + displacement, with x and y feeding each other. */
@Benchmark
|
||||
public void B_I_D_long(Blackhole bh) {
|
||||
long x = this.x, y = this.y;
|
||||
for (int i = 0; i < ITERATION; i++) {
|
||||
x = x + y + 10;
|
||||
y = x + y + 20;
|
||||
}
|
||||
bh.consume(x);
|
||||
bh.consume(y);
|
||||
}
|
||||
|
||||
/** long: base + scaled index (scales 4 and 8), no displacement. */
@Benchmark
|
||||
public void B_IS_long(Blackhole bh) {
|
||||
long x = this.x, y = this.y;
|
||||
for (int i = 0; i < ITERATION; i++) {
|
||||
x = x + y * 4;
|
||||
y = x + y * 8;
|
||||
}
|
||||
bh.consume(x);
|
||||
bh.consume(y);
|
||||
}
|
||||
|
||||
/** long: base + scaled index (scales 4 and 8) + displacement. */
@Benchmark
|
||||
public void B_IS_D_long(Blackhole bh) {
|
||||
long x = this.x, y = this.y;
|
||||
for (int i = 0; i < ITERATION; i++) {
|
||||
x = x + y * 4 + 10;
|
||||
y = x + y * 8 + 20;
|
||||
}
|
||||
bh.consume(x);
|
||||
bh.consume(y);
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user