8189113: AARCH64: StringLatin1 inflate intrinsic doesn't use prefetch instruction
Reviewed-by: aph
This commit is contained in:
parent
51d3abfc8c
commit
3d7d35c321
@ -16168,7 +16168,7 @@ instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
|
||||
|
||||
// fast byte[] to char[] inflation
|
||||
instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len,
|
||||
vRegD tmp1, vRegD tmp2, vRegD tmp3, iRegP_R3 tmp4, rFlagsReg cr)
|
||||
vRegD_V0 tmp1, vRegD_V1 tmp2, vRegD_V2 tmp3, iRegP_R3 tmp4, rFlagsReg cr)
|
||||
%{
|
||||
match(Set dummy (StrInflatedCopy src (Binary dst len)));
|
||||
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr);
|
||||
|
@ -5681,26 +5681,24 @@ void MacroAssembler::encode_iso_array(Register src, Register dst,
|
||||
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
|
||||
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
|
||||
Register tmp4) {
|
||||
Label big, done;
|
||||
Label big, done, after_init, to_stub;
|
||||
|
||||
assert_different_registers(src, dst, len, tmp4, rscratch1);
|
||||
|
||||
fmovd(vtmp1 , zr);
|
||||
lsrw(rscratch1, len, 3);
|
||||
|
||||
cbnzw(rscratch1, big);
|
||||
|
||||
fmovd(vtmp1, zr);
|
||||
lsrw(tmp4, len, 3);
|
||||
bind(after_init);
|
||||
cbnzw(tmp4, big);
|
||||
// Short string: less than 8 bytes.
|
||||
{
|
||||
Label loop, around, tiny;
|
||||
|
||||
subsw(len, len, 4);
|
||||
andw(len, len, 3);
|
||||
br(LO, tiny);
|
||||
Label loop, tiny;
|
||||
|
||||
cmpw(len, 4);
|
||||
br(LT, tiny);
|
||||
// Use SIMD to do 4 bytes.
|
||||
ldrs(vtmp2, post(src, 4));
|
||||
zip1(vtmp3, T8B, vtmp2, vtmp1);
|
||||
subw(len, len, 4);
|
||||
strd(vtmp3, post(dst, 8));
|
||||
|
||||
cbzw(len, done);
|
||||
@ -5714,35 +5712,65 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
|
||||
bind(tiny);
|
||||
cbnz(len, loop);
|
||||
|
||||
bind(around);
|
||||
b(done);
|
||||
}
|
||||
|
||||
if (SoftwarePrefetchHintDistance >= 0) {
|
||||
bind(to_stub);
|
||||
RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
|
||||
assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
|
||||
trampoline_call(stub);
|
||||
b(after_init);
|
||||
}
|
||||
|
||||
// Unpack the bytes 8 at a time.
|
||||
bind(big);
|
||||
andw(len, len, 7);
|
||||
|
||||
{
|
||||
Label loop, around;
|
||||
Label loop, around, loop_last, loop_start;
|
||||
|
||||
bind(loop);
|
||||
ldrd(vtmp2, post(src, 8));
|
||||
sub(rscratch1, rscratch1, 1);
|
||||
zip1(vtmp3, T16B, vtmp2, vtmp1);
|
||||
st1(vtmp3, T8H, post(dst, 16));
|
||||
cbnz(rscratch1, loop);
|
||||
if (SoftwarePrefetchHintDistance >= 0) {
|
||||
const int large_loop_threshold = (64 + 16)/8;
|
||||
ldrd(vtmp2, post(src, 8));
|
||||
andw(len, len, 7);
|
||||
cmp(tmp4, large_loop_threshold);
|
||||
br(GE, to_stub);
|
||||
b(loop_start);
|
||||
|
||||
bind(around);
|
||||
bind(loop);
|
||||
ldrd(vtmp2, post(src, 8));
|
||||
bind(loop_start);
|
||||
subs(tmp4, tmp4, 1);
|
||||
br(EQ, loop_last);
|
||||
zip1(vtmp2, T16B, vtmp2, vtmp1);
|
||||
ldrd(vtmp3, post(src, 8));
|
||||
st1(vtmp2, T8H, post(dst, 16));
|
||||
subs(tmp4, tmp4, 1);
|
||||
zip1(vtmp3, T16B, vtmp3, vtmp1);
|
||||
st1(vtmp3, T8H, post(dst, 16));
|
||||
br(NE, loop);
|
||||
b(around);
|
||||
bind(loop_last);
|
||||
zip1(vtmp2, T16B, vtmp2, vtmp1);
|
||||
st1(vtmp2, T8H, post(dst, 16));
|
||||
bind(around);
|
||||
cbz(len, done);
|
||||
} else {
|
||||
andw(len, len, 7);
|
||||
bind(loop);
|
||||
ldrd(vtmp2, post(src, 8));
|
||||
sub(tmp4, tmp4, 1);
|
||||
zip1(vtmp3, T16B, vtmp2, vtmp1);
|
||||
st1(vtmp3, T8H, post(dst, 16));
|
||||
cbnz(tmp4, loop);
|
||||
}
|
||||
}
|
||||
|
||||
// Do the tail of up to 8 bytes.
|
||||
sub(src, src, 8);
|
||||
add(src, src, len, ext::uxtw, 0);
|
||||
ldrd(vtmp2, Address(src));
|
||||
sub(dst, dst, 16);
|
||||
add(src, src, len);
|
||||
ldrd(vtmp3, Address(src, -8));
|
||||
add(dst, dst, len, ext::uxtw, 1);
|
||||
zip1(vtmp3, T16B, vtmp2, vtmp1);
|
||||
st1(vtmp3, T8H, Address(dst));
|
||||
zip1(vtmp3, T16B, vtmp3, vtmp1);
|
||||
strq(vtmp3, Address(dst, -16));
|
||||
|
||||
bind(done);
|
||||
}
|
||||
|
@ -4624,6 +4624,68 @@ class StubGenerator: public StubCodeGenerator {
|
||||
StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
|
||||
}
|
||||
|
||||
// Emits code that interleaves the bytes of src1 and src2 with the bytes of v0
// (zip1 takes the low halves, zip2 the high halves of its two sources) into
// v1..v4 and stores those four registers, 64 bytes in total, at the address in
// r1, post-incrementing r1 by 64.
//
// generatePrfm - when true, also emits a PSTL1STRM prefetch for the address
//                r1 + SoftwarePrefetchHintDistance between the two zip pairs.
// src1, src2   - registers holding the source bytes to interleave.
//
// Clobbers v1, v2, v3 and v4. Because destinations are written while the
// sources are still being read, src1 must not be v1 and src2 must not be
// v1, v2 or v3.
// NOTE(review): v0 is presumably all-zero here (the register contract above
// generate_large_byte_array_inflate says "V0 = 0"), which would make each
// source byte become one zero-extended 16-bit element - confirm.
void inflate_and_store_2_fp_registers(bool generatePrfm,
|
||||
FloatRegister src1, FloatRegister src2) {
|
||||
// Destination pointer is hard-wired to r1 rather than passed in.
Register dst = r1;
|
||||
__ zip1(v1, __ T16B, src1, v0);
|
||||
__ zip2(v2, __ T16B, src1, v0);
|
||||
if (generatePrfm) {
|
||||
__ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
|
||||
}
|
||||
// v3 is overwritten here, after src1 has been fully consumed.
__ zip1(v3, __ T16B, src2, v0);
|
||||
__ zip2(v4, __ T16B, src2, v0);
|
||||
// Single store of all four result registers; advances dst (r1) by 64.
__ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
|
||||
}
|
||||
|
||||
// Generates the "large_byte_array_inflate" stub and returns its entry address.
//
// Register contract on entry:
// R0 = src
|
||||
// R1 = dst
|
||||
// R2 = len
|
||||
// R3 = len >> 3 (number of 8-byte groups still to process; decremented here)
|
||||
// V0 = 0
|
||||
// V1 = 8 bytes already loaded from src (src has been advanced past them)
//
// The stub advances R0/R1 with post-indexed loads and stores, subtracts the
// groups it handles from R3, and returns with ret(lr). It writes v1..v6.
// It unconditionally consumes 2 + 8 groups before the first loop test; the
// caller visible in this change only branches here when
// R3 >= (64 + 16) / 8.
// NOTE(review): v3..v6 are written here, while the string_inflate instruct
// visible in this change declares only V0..V2 as TEMP - confirm the
// remaining vector registers are treated as clobbered by callers.
address generate_large_byte_array_inflate() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
|
||||
address entry = __ pc();
|
||||
Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
|
||||
Register src = r0, dst = r1, len = r2, octetCounter = r3;
|
||||
// Threshold, in 8-byte groups, above which the prefetching loop is used.
const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
|
||||
|
||||
// Do one more 8-byte read so that the address is 16-byte aligned in most cases;
|
||||
// this also lets both pending 8-byte groups be written with a single store instruction.
|
||||
__ ldrd(v2, __ post(src, 8));
|
||||
// Account for the group already in v1 and the one just loaded into v2.
__ sub(octetCounter, octetCounter, 2);
|
||||
__ zip1(v1, __ T16B, v1, v0);
|
||||
__ zip1(v2, __ T16B, v2, v0);
|
||||
__ st1(v1, v2, __ T16B, __ post(dst, 32));
|
||||
// Preload the first 64-byte chunk (8 groups) for whichever loop is entered.
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
|
||||
__ cmp(octetCounter, large_loop_threshold);
|
||||
// Not enough data left to make prefetching worthwhile: use the plain loop.
__ br(__ LE, LOOP_START);
|
||||
__ b(LOOP_PRFM_START);
|
||||
// Prefetching loop: 64 source bytes per iteration.
__ bind(LOOP_PRFM);
|
||||
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
|
||||
__ bind(LOOP_PRFM_START);
|
||||
__ prfm(Address(src, SoftwarePrefetchHintDistance));
|
||||
__ sub(octetCounter, octetCounter, 8);
|
||||
// The flags set here are consumed by br(GT) below, after the stores are emitted.
__ cmp(octetCounter, large_loop_threshold);
|
||||
inflate_and_store_2_fp_registers(true, v3, v4);
|
||||
inflate_and_store_2_fp_registers(true, v5, v6);
|
||||
__ br(__ GT, LOOP_PRFM);
|
||||
// Fewer than 8 groups left: nothing more for the 64-byte loops to do.
__ cmp(octetCounter, 8);
|
||||
__ br(__ LT, DONE);
|
||||
// Plain loop without prefetch: 64 source bytes per iteration.
__ bind(LOOP);
|
||||
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
|
||||
__ bind(LOOP_START);
|
||||
__ sub(octetCounter, octetCounter, 8);
|
||||
// The flags set here are consumed by br(GE) below, after the stores are emitted.
__ cmp(octetCounter, 8);
|
||||
inflate_and_store_2_fp_registers(false, v3, v4);
|
||||
inflate_and_store_2_fp_registers(false, v5, v6);
|
||||
__ br(__ GE, LOOP);
|
||||
__ bind(DONE);
|
||||
// Return to the caller; octetCounter (R3) holds the groups not handled here.
__ ret(lr);
|
||||
return entry;
|
||||
}
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
@ -5727,6 +5789,9 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
generate_string_indexof_stubs();
|
||||
|
||||
// byte_array_inflate stub for large arrays.
|
||||
StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
|
||||
|
||||
if (UseMultiplyToLenIntrinsic) {
|
||||
StubRoutines::_multiplyToLen = generate_multiplyToLen();
|
||||
}
|
||||
|
@ -55,6 +55,7 @@ address StubRoutines::aarch64::_compare_long_string_UL = NULL;
|
||||
address StubRoutines::aarch64::_string_indexof_linear_ll = NULL;
|
||||
address StubRoutines::aarch64::_string_indexof_linear_uu = NULL;
|
||||
address StubRoutines::aarch64::_string_indexof_linear_ul = NULL;
|
||||
// Entry address of the large_byte_array_inflate stub; NULL until the stub
// generator assigns the result of generate_large_byte_array_inflate().
address StubRoutines::aarch64::_large_byte_array_inflate = NULL;
|
||||
bool StubRoutines::aarch64::_completed = false;
|
||||
|
||||
/**
|
||||
|
@ -73,6 +73,7 @@ class aarch64 {
|
||||
static address _string_indexof_linear_ll;
|
||||
static address _string_indexof_linear_uu;
|
||||
static address _string_indexof_linear_ul;
|
||||
static address _large_byte_array_inflate;
|
||||
static bool _completed;
|
||||
|
||||
public:
|
||||
@ -171,6 +172,10 @@ class aarch64 {
|
||||
return _string_indexof_linear_uu;
|
||||
}
|
||||
|
||||
// Returns the stored entry address of the large_byte_array_inflate stub
// (the current value of _large_byte_array_inflate).
static address large_byte_array_inflate() {
|
||||
return _large_byte_array_inflate;
|
||||
}
|
||||
|
||||
// Returns the current value of the _completed flag.
static bool complete() {
|
||||
return _completed;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user