8189113: AARCH64: StringLatin1 inflate intrinsic doesn't use prefetch instruction

Reviewed-by: aph
This commit is contained in:
Dmitrij Pochepko 2018-06-25 16:32:23 +03:00
parent 51d3abfc8c
commit 3d7d35c321
5 changed files with 128 additions and 29 deletions

View File

@ -16168,7 +16168,7 @@ instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
// fast byte[] to char[] inflation
instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len,
vRegD tmp1, vRegD tmp2, vRegD tmp3, iRegP_R3 tmp4, rFlagsReg cr)
vRegD_V0 tmp1, vRegD_V1 tmp2, vRegD_V2 tmp3, iRegP_R3 tmp4, rFlagsReg cr)
%{
match(Set dummy (StrInflatedCopy src (Binary dst len)));
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr);

View File

@ -5681,26 +5681,24 @@ void MacroAssembler::encode_iso_array(Register src, Register dst,
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
Register tmp4) {
Label big, done;
Label big, done, after_init, to_stub;
assert_different_registers(src, dst, len, tmp4, rscratch1);
fmovd(vtmp1 , zr);
lsrw(rscratch1, len, 3);
cbnzw(rscratch1, big);
fmovd(vtmp1, zr);
lsrw(tmp4, len, 3);
bind(after_init);
cbnzw(tmp4, big);
// Short string: less than 8 bytes.
{
Label loop, around, tiny;
subsw(len, len, 4);
andw(len, len, 3);
br(LO, tiny);
Label loop, tiny;
cmpw(len, 4);
br(LT, tiny);
// Use SIMD to do 4 bytes.
ldrs(vtmp2, post(src, 4));
zip1(vtmp3, T8B, vtmp2, vtmp1);
subw(len, len, 4);
strd(vtmp3, post(dst, 8));
cbzw(len, done);
@ -5714,35 +5712,65 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
bind(tiny);
cbnz(len, loop);
bind(around);
b(done);
}
if (SoftwarePrefetchHintDistance >= 0) {
bind(to_stub);
RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
trampoline_call(stub);
b(after_init);
}
// Unpack the bytes 8 at a time.
bind(big);
andw(len, len, 7);
{
Label loop, around;
Label loop, around, loop_last, loop_start;
bind(loop);
ldrd(vtmp2, post(src, 8));
sub(rscratch1, rscratch1, 1);
zip1(vtmp3, T16B, vtmp2, vtmp1);
st1(vtmp3, T8H, post(dst, 16));
cbnz(rscratch1, loop);
if (SoftwarePrefetchHintDistance >= 0) {
const int large_loop_threshold = (64 + 16)/8;
ldrd(vtmp2, post(src, 8));
andw(len, len, 7);
cmp(tmp4, large_loop_threshold);
br(GE, to_stub);
b(loop_start);
bind(around);
bind(loop);
ldrd(vtmp2, post(src, 8));
bind(loop_start);
subs(tmp4, tmp4, 1);
br(EQ, loop_last);
zip1(vtmp2, T16B, vtmp2, vtmp1);
ldrd(vtmp3, post(src, 8));
st1(vtmp2, T8H, post(dst, 16));
subs(tmp4, tmp4, 1);
zip1(vtmp3, T16B, vtmp3, vtmp1);
st1(vtmp3, T8H, post(dst, 16));
br(NE, loop);
b(around);
bind(loop_last);
zip1(vtmp2, T16B, vtmp2, vtmp1);
st1(vtmp2, T8H, post(dst, 16));
bind(around);
cbz(len, done);
} else {
andw(len, len, 7);
bind(loop);
ldrd(vtmp2, post(src, 8));
sub(tmp4, tmp4, 1);
zip1(vtmp3, T16B, vtmp2, vtmp1);
st1(vtmp3, T8H, post(dst, 16));
cbnz(tmp4, loop);
}
}
// Do the tail of up to 8 bytes.
sub(src, src, 8);
add(src, src, len, ext::uxtw, 0);
ldrd(vtmp2, Address(src));
sub(dst, dst, 16);
add(src, src, len);
ldrd(vtmp3, Address(src, -8));
add(dst, dst, len, ext::uxtw, 1);
zip1(vtmp3, T16B, vtmp2, vtmp1);
st1(vtmp3, T8H, Address(dst));
zip1(vtmp3, T16B, vtmp3, vtmp1);
strq(vtmp3, Address(dst, -16));
bind(done);
}

View File

@ -4624,6 +4624,68 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
}
// Emits code that widens the 16 bytes held in each of src1 and src2 to
// 16-bit elements by interleaving them with the bytes of v0, then writes the
// resulting 64 bytes to the address in r1 and advances r1 by 64.
//
// generatePrfm - when true, also emit a prefetch of the destination at
//                SoftwarePrefetchHintDistance bytes past r1.
// src1, src2   - vector registers holding the source bytes.
//
// v1-v4 are overwritten. v0 is only read; the large_byte_array_inflate stub
// documents it as holding zero, which makes every source byte a
// zero-extended 16-bit value.
void inflate_and_store_2_fp_registers(bool generatePrfm,
FloatRegister src1, FloatRegister src2) {
// The destination pointer is fixed to r1 rather than passed in.
Register dst = r1;
// Low 8 bytes of src1 interleaved with v0 -> v1, high 8 bytes -> v2.
__ zip1(v1, __ T16B, src1, v0);
__ zip2(v2, __ T16B, src1, v0);
if (generatePrfm) {
// Prefetch-for-store hint ahead of the current write position.
__ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
}
// src1 is not read again, so v3 may be overwritten here even when src1 is
// v3 (the stub passes v3/v4 as src1/src2).
__ zip1(v3, __ T16B, src2, v0);
// Last read of src2; it is an input of the same instruction that writes v4,
// so src2 may be v4.
__ zip2(v4, __ T16B, src2, v0);
// One 64-byte store of v1..v4 with post-increment of dst.
__ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
}
// Generates the out-of-line stub that inflates large byte arrays to 16-bit
// elements, 64 source bytes per loop iteration, and returns its entry address.
//
// Register contract on entry:
// R0 = src
// R1 = dst
// R2 = len
// R3 = len >> 3 (number of 8-byte groups still to process, including the
//      group already held in V1)
// V0 = 0
// V1 = 8 source bytes already loaded by the caller
//
// The stub overwrites v1-v6, r0, r1, r3 and the condition flags.
// On return r0/r1 have been advanced past the processed data and r3 holds the
// number of 8-byte groups that were left unprocessed.
//
// NOTE(review): the prologue accounts for 2 groups and then unconditionally
// loads 64 more bytes (8 groups), so r3 is presumably required to be at least
// 10 on entry - confirm against the call site.
// NOTE(review): v3-v6 are clobbered here; confirm that every call site
// treats them as killed.
address generate_large_byte_array_inflate() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
address entry = __ pc();
Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
// Names for the registers of the contract above; len is not referenced by
// the code below.
Register src = r0, dst = r1, len = r2, octetCounter = r3;
// Remaining-group count above which the prefetching loop is used.
const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
// Load one more 8-byte group so that, together with the group already in
// v1, the first 16 source bytes are inflated and written with a single
// 32-byte store. Per the original note this also leaves the address
// 16-byte aligned in most cases.
__ ldrd(v2, __ post(src, 8));
// Account for the two groups held in v1 and v2.
__ sub(octetCounter, octetCounter, 2);
__ zip1(v1, __ T16B, v1, v0);
__ zip1(v2, __ T16B, v2, v0);
__ st1(v1, v2, __ T16B, __ post(dst, 32));
// Load the first 64-byte batch; it is consumed at whichever *_START label
// is entered below.
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
__ cmp(octetCounter, large_loop_threshold);
// Not enough data left to make prefetching worthwhile: use the plain loop.
__ br(__ LE, LOOP_START);
__ b(LOOP_PRFM_START);
// Main loop with software prefetch of both source and destination.
__ bind(LOOP_PRFM);
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
__ bind(LOOP_PRFM_START);
__ prfm(Address(src, SoftwarePrefetchHintDistance));
__ sub(octetCounter, octetCounter, 8);
// Flags are set here and consumed by the branch after the stores; the
// instructions emitted in between do not modify them.
__ cmp(octetCounter, large_loop_threshold);
inflate_and_store_2_fp_registers(true, v3, v4);
inflate_and_store_2_fp_registers(true, v5, v6);
__ br(__ GT, LOOP_PRFM);
// Fewer than 8 groups left: not enough for another 64-byte batch.
__ cmp(octetCounter, 8);
__ br(__ LT, DONE);
// Same loop without prefetch instructions, used for the remaining batches.
__ bind(LOOP);
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
__ bind(LOOP_START);
__ sub(octetCounter, octetCounter, 8);
// As above, the compare result is used by the branch after the stores.
__ cmp(octetCounter, 8);
inflate_and_store_2_fp_registers(false, v3, v4);
inflate_and_store_2_fp_registers(false, v5, v6);
__ br(__ GE, LOOP);
__ bind(DONE);
__ ret(lr);
return entry;
}
/**
* Arguments:
*
@ -5727,6 +5789,9 @@ class StubGenerator: public StubCodeGenerator {
generate_string_indexof_stubs();
// byte_array_inflate stub for large arrays.
StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}

View File

@ -55,6 +55,7 @@ address StubRoutines::aarch64::_compare_long_string_UL = NULL;
address StubRoutines::aarch64::_string_indexof_linear_ll = NULL;
address StubRoutines::aarch64::_string_indexof_linear_uu = NULL;
address StubRoutines::aarch64::_string_indexof_linear_ul = NULL;
address StubRoutines::aarch64::_large_byte_array_inflate = NULL;
bool StubRoutines::aarch64::_completed = false;
/**

View File

@ -73,6 +73,7 @@ class aarch64 {
static address _string_indexof_linear_ll;
static address _string_indexof_linear_uu;
static address _string_indexof_linear_ul;
static address _large_byte_array_inflate;
static bool _completed;
public:
@ -171,6 +172,10 @@ class aarch64 {
return _string_indexof_linear_uu;
}
// Returns the recorded entry address of the large_byte_array_inflate stub.
// The backing field is initialized to NULL, so NULL is returned until the
// stub generator has stored the generated entry point.
static address large_byte_array_inflate() {
return _large_byte_array_inflate;
}
static bool complete() {
return _completed;
}