8189113: AARCH64: StringLatin1 inflate intrinsic doesn't use prefetch instruction

Reviewed-by: aph
This commit is contained in:
Dmitrij Pochepko 2018-06-25 16:32:23 +03:00
parent 51d3abfc8c
commit 3d7d35c321
5 changed files with 128 additions and 29 deletions

View File

@ -16168,7 +16168,7 @@ instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
// fast byte[] to char[] inflation // fast byte[] to char[] inflation
instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len, instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len,
vRegD tmp1, vRegD tmp2, vRegD tmp3, iRegP_R3 tmp4, rFlagsReg cr) vRegD_V0 tmp1, vRegD_V1 tmp2, vRegD_V2 tmp3, iRegP_R3 tmp4, rFlagsReg cr)
%{ %{
match(Set dummy (StrInflatedCopy src (Binary dst len))); match(Set dummy (StrInflatedCopy src (Binary dst len)));
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr);

View File

@ -5681,26 +5681,24 @@ void MacroAssembler::encode_iso_array(Register src, Register dst,
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
Register tmp4) { Register tmp4) {
Label big, done; Label big, done, after_init, to_stub;
assert_different_registers(src, dst, len, tmp4, rscratch1); assert_different_registers(src, dst, len, tmp4, rscratch1);
fmovd(vtmp1 , zr); fmovd(vtmp1, zr);
lsrw(rscratch1, len, 3); lsrw(tmp4, len, 3);
bind(after_init);
cbnzw(rscratch1, big); cbnzw(tmp4, big);
// Short string: less than 8 bytes. // Short string: less than 8 bytes.
{ {
Label loop, around, tiny; Label loop, tiny;
subsw(len, len, 4);
andw(len, len, 3);
br(LO, tiny);
cmpw(len, 4);
br(LT, tiny);
// Use SIMD to do 4 bytes. // Use SIMD to do 4 bytes.
ldrs(vtmp2, post(src, 4)); ldrs(vtmp2, post(src, 4));
zip1(vtmp3, T8B, vtmp2, vtmp1); zip1(vtmp3, T8B, vtmp2, vtmp1);
subw(len, len, 4);
strd(vtmp3, post(dst, 8)); strd(vtmp3, post(dst, 8));
cbzw(len, done); cbzw(len, done);
@ -5714,35 +5712,65 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
bind(tiny); bind(tiny);
cbnz(len, loop); cbnz(len, loop);
bind(around);
b(done); b(done);
} }
if (SoftwarePrefetchHintDistance >= 0) {
bind(to_stub);
RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
trampoline_call(stub);
b(after_init);
}
// Unpack the bytes 8 at a time. // Unpack the bytes 8 at a time.
bind(big); bind(big);
andw(len, len, 7);
{ {
Label loop, around; Label loop, around, loop_last, loop_start;
bind(loop); if (SoftwarePrefetchHintDistance >= 0) {
ldrd(vtmp2, post(src, 8)); const int large_loop_threshold = (64 + 16)/8;
sub(rscratch1, rscratch1, 1); ldrd(vtmp2, post(src, 8));
zip1(vtmp3, T16B, vtmp2, vtmp1); andw(len, len, 7);
st1(vtmp3, T8H, post(dst, 16)); cmp(tmp4, large_loop_threshold);
cbnz(rscratch1, loop); br(GE, to_stub);
b(loop_start);
bind(around); bind(loop);
ldrd(vtmp2, post(src, 8));
bind(loop_start);
subs(tmp4, tmp4, 1);
br(EQ, loop_last);
zip1(vtmp2, T16B, vtmp2, vtmp1);
ldrd(vtmp3, post(src, 8));
st1(vtmp2, T8H, post(dst, 16));
subs(tmp4, tmp4, 1);
zip1(vtmp3, T16B, vtmp3, vtmp1);
st1(vtmp3, T8H, post(dst, 16));
br(NE, loop);
b(around);
bind(loop_last);
zip1(vtmp2, T16B, vtmp2, vtmp1);
st1(vtmp2, T8H, post(dst, 16));
bind(around);
cbz(len, done);
} else {
andw(len, len, 7);
bind(loop);
ldrd(vtmp2, post(src, 8));
sub(tmp4, tmp4, 1);
zip1(vtmp3, T16B, vtmp2, vtmp1);
st1(vtmp3, T8H, post(dst, 16));
cbnz(tmp4, loop);
}
} }
// Do the tail of up to 8 bytes. // Do the tail of up to 8 bytes.
sub(src, src, 8); add(src, src, len);
add(src, src, len, ext::uxtw, 0); ldrd(vtmp3, Address(src, -8));
ldrd(vtmp2, Address(src));
sub(dst, dst, 16);
add(dst, dst, len, ext::uxtw, 1); add(dst, dst, len, ext::uxtw, 1);
zip1(vtmp3, T16B, vtmp2, vtmp1); zip1(vtmp3, T16B, vtmp3, vtmp1);
st1(vtmp3, T8H, Address(dst)); strq(vtmp3, Address(dst, -16));
bind(done); bind(done);
} }

View File

@ -4624,6 +4624,68 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
} }
// Emits code that inflates the 2 x 16 bytes held in src1 and src2 and stores
// the resulting 64 bytes at dst (r1), post-incrementing dst by 64.
//
// Each source byte is interleaved with the corresponding byte of v0. The
// callers' register contract states V0 = 0, so every input byte is followed
// by a zero byte in the output.
//
// generatePrfm - when true, also emit a PSTL1STRM prefetch hint for the
//                address SoftwarePrefetchHintDistance bytes past dst.
// src1, src2   - vector registers holding the bytes to inflate.
//
// v1..v4 are overwritten with the inflated data. src1 is fully consumed
// before v3/v4 are written, and the last zip2 reads src2 before writing v4.
void inflate_and_store_2_fp_registers(bool generatePrfm,
FloatRegister src1, FloatRegister src2) {
Register dst = r1;
// Low and high 8 bytes of src1, interleaved with v0 -> v1, v2.
__ zip1(v1, __ T16B, src1, v0);
__ zip2(v2, __ T16B, src1, v0);
if (generatePrfm) {
// Hint the upcoming store target before the store itself is issued.
__ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
}
// Low and high 8 bytes of src2, interleaved with v0 -> v3, v4.
__ zip1(v3, __ T16B, src2, v0);
__ zip2(v4, __ T16B, src2, v0);
// One multi-register store of all four result vectors (4 x 16 = 64 bytes).
__ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
}
// Generates the "large_byte_array_inflate" stub and returns its entry address.
//
// Register contract on entry (set up by the caller of the stub):
// R0 = src
// R1 = dst
// R2 = len
// R3 = len >> 3
// V0 = 0
// v1 = loaded 8 bytes
//
// The stub works in units of 8 source bytes ("octets") counted in R3. It
// returns once fewer than 8 octets remain in the counter; src, dst and the
// counter are left advanced so that the caller can process the rest.
address generate_large_byte_array_inflate() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
address entry = __ pc();
Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
// Note: len (r2) is named here to mirror the register contract; it is not
// referenced by the code emitted below.
Register src = r0, dst = r1, len = r2, octetCounter = r3;
// Threshold expressed in octets (8-byte groups), the same unit as octetCounter.
const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
// do one more 8-byte read to have address 16-byte aligned in most cases
// also use single store instruction
__ ldrd(v2, __ post(src, 8));
// Two octets are accounted for here: the one already in v1 and the one just
// loaded into v2.
__ sub(octetCounter, octetCounter, 2);
__ zip1(v1, __ T16B, v1, v0);
__ zip1(v2, __ T16B, v2, v0);
__ st1(v1, v2, __ T16B, __ post(dst, 32));
// Preload the first 64-byte batch so both loops can be entered at their
// *_START labels, i.e. after their own load instruction.
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
__ cmp(octetCounter, large_loop_threshold);
// Not enough data left to benefit from prefetching: use the plain loop.
__ br(__ LE, LOOP_START);
__ b(LOOP_PRFM_START);
// Main loop with prefetch hints: 64 source bytes (8 octets) per iteration.
__ bind(LOOP_PRFM);
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
__ bind(LOOP_PRFM_START);
// Prefetch hint for the source; no prefetch operation is passed explicitly,
// so the assembler's default is used (not visible in this code).
__ prfm(Address(src, SoftwarePrefetchHintDistance));
__ sub(octetCounter, octetCounter, 8);
// The comparison is issued before the inflate/store sequence; the branch
// below consumes its flags. The helper emits only zip/prfm/st1
// instructions, which are expected to leave the flags untouched.
__ cmp(octetCounter, large_loop_threshold);
inflate_and_store_2_fp_registers(true, v3, v4);
inflate_and_store_2_fp_registers(true, v5, v6);
__ br(__ GT, LOOP_PRFM);
// Fewer than 8 octets left: nothing more for the 64-byte loops to do.
__ cmp(octetCounter, 8);
__ br(__ LT, DONE);
// Trailing loop without prefetch hints, same 64-bytes-per-iteration shape.
__ bind(LOOP);
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
__ bind(LOOP_START);
__ sub(octetCounter, octetCounter, 8);
// As above, the flags set here are tested after the inflate/store sequence.
__ cmp(octetCounter, 8);
inflate_and_store_2_fp_registers(false, v3, v4);
inflate_and_store_2_fp_registers(false, v5, v6);
__ br(__ GE, LOOP);
__ bind(DONE);
__ ret(lr);
return entry;
}
/** /**
* Arguments: * Arguments:
* *
@ -5727,6 +5789,9 @@ class StubGenerator: public StubCodeGenerator {
generate_string_indexof_stubs(); generate_string_indexof_stubs();
// byte_array_inflate stub for large arrays.
StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
if (UseMultiplyToLenIntrinsic) { if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen(); StubRoutines::_multiplyToLen = generate_multiplyToLen();
} }

View File

@ -55,6 +55,7 @@ address StubRoutines::aarch64::_compare_long_string_UL = NULL;
address StubRoutines::aarch64::_string_indexof_linear_ll = NULL; address StubRoutines::aarch64::_string_indexof_linear_ll = NULL;
address StubRoutines::aarch64::_string_indexof_linear_uu = NULL; address StubRoutines::aarch64::_string_indexof_linear_uu = NULL;
address StubRoutines::aarch64::_string_indexof_linear_ul = NULL; address StubRoutines::aarch64::_string_indexof_linear_ul = NULL;
address StubRoutines::aarch64::_large_byte_array_inflate = NULL;
bool StubRoutines::aarch64::_completed = false; bool StubRoutines::aarch64::_completed = false;
/** /**

View File

@ -73,6 +73,7 @@ class aarch64 {
static address _string_indexof_linear_ll; static address _string_indexof_linear_ll;
static address _string_indexof_linear_uu; static address _string_indexof_linear_uu;
static address _string_indexof_linear_ul; static address _string_indexof_linear_ul;
static address _large_byte_array_inflate;
static bool _completed; static bool _completed;
public: public:
@ -171,6 +172,10 @@ class aarch64 {
return _string_indexof_linear_uu; return _string_indexof_linear_uu;
} }
// Returns the address stored in _large_byte_array_inflate, i.e. the entry
// point of the large_byte_array_inflate stub once it has been assigned.
static address large_byte_array_inflate() {
return _large_byte_array_inflate;
}
static bool complete() { static bool complete() {
return _completed; return _completed;
} }