MDEV-25870 followup : pmull support on Windows ARM64

Casting the result of vmull_p64 is possible on MSVC, although it requires
much more verbose code. The reason is that some NEON types are missing there
(the compiler has no support for 128-bit integers).
This commit is contained in:
Vladislav Vaintroub 2025-05-28 01:55:39 +02:00
parent fe10645eb7
commit 8c6cbb3360
2 changed files with 34 additions and 5 deletions

View File

@ -68,12 +68,12 @@ IF(MSVC_INTEL)
ENDIF()
ELSEIF(MSVC_ARM64)
SET (MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_arm64.c)
ADD_DEFINITIONS(-DHAVE_ARMV8_CRC -DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
ADD_DEFINITIONS(-DHAVE_ARMV8_CRC -DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS -DHAVE_ARMV8_CRYPTO)
IF(CLANG_CL)
SET_SOURCE_FILES_PROPERTIES(
crc32/crc32_arm64.c
PROPERTIES
COMPILE_FLAGS "-march=armv8-a+crc"
COMPILE_FLAGS "-march=armv8-a+crc+crypto"
)
ENDIF()
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686")

View File

@ -29,7 +29,9 @@ my_crc32_t crc32c_aarch64_available(void)
{
/* No implementation is offered when crc32_aarch64_available() reports 0. */
if (crc32_aarch64_available() == 0)
return NULL;
/*
  Select the PMULL-based routine when the OS reports that the ARMv8
  crypto instructions are available.
*/
if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE))
return crc32c_aarch64_pmull;
/* Otherwise fall back to the routine that does not use PMULL. */
return crc32c_aarch64;
}
@ -181,11 +183,19 @@ asm(".arch_extension crypto");
CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
} while(0)
/*
  PREF4X64L1(buffer, PREF_OFFSET, ITR)

  Issues four prefetch hints for the addresses
    buffer + PREF_OFFSET + (ITR + k)*64,  k = 0..3
  where the offset is counted in bytes.

  The non-MSVC branch emits "PRFM PLDL1KEEP" with an immediate byte offset.
  The MSVC branch (no inline asm) uses the __prefetch intrinsic instead; the
  pointer is cast to a byte pointer there so the offset stays a byte offset
  whatever the pointee type of buffer is, exactly as in the PRFM branch.

  The expansion intentionally ends with ';' because callers chain several
  invocations without separators.
*/
#if defined _MSC_VER && !defined __clang__
#define PREF4X64L1(buffer, PREF_OFFSET, ITR)\
  __prefetch((const char *)(buffer) + (PREF_OFFSET) + ((ITR) + 0)*64);\
  __prefetch((const char *)(buffer) + (PREF_OFFSET) + ((ITR) + 1)*64);\
  __prefetch((const char *)(buffer) + (PREF_OFFSET) + ((ITR) + 2)*64);\
  __prefetch((const char *)(buffer) + (PREF_OFFSET) + ((ITR) + 3)*64);
#else
#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
#endif
#define PREF1KL1(buffer, PREF_OFFSET) \
PREF4X64L1(buffer,(PREF_OFFSET), 0) \
@ -193,11 +203,20 @@ asm(".arch_extension crypto");
PREF4X64L1(buffer,(PREF_OFFSET), 8) \
PREF4X64L1(buffer,(PREF_OFFSET), 12)
/*
  PREF4X64L2(buffer, PREF_OFFSET, ITR)

  Issues four prefetch hints for the addresses
    buffer + PREF_OFFSET + (ITR + k)*64,  k = 0..3
  where the offset is counted in bytes.

  The non-MSVC branch emits "PRFM PLDL2KEEP" with an immediate byte offset.
  The MSVC branch (no inline asm) calls __prefetch2 with the PLDL2KEEP
  operation code. Every macro argument is parenthesized and the pointer is
  cast to a byte pointer, so an expression argument or a non-byte pointee
  type cannot change the computed address relative to the PRFM branch.

  The expansion intentionally ends with ';' because callers chain several
  invocations without separators.
*/
#if defined _MSC_VER && !defined __clang__
#define MY_PLDL2KEEP 2 /* PLDL2KEEP is 2 in ARMv8 */
#define PREF4X64L2(buffer, PREF_OFFSET, ITR)\
  __prefetch2((const char *)(buffer) + (PREF_OFFSET) + ((ITR) + 0) * 64, MY_PLDL2KEEP);\
  __prefetch2((const char *)(buffer) + (PREF_OFFSET) + ((ITR) + 1) * 64, MY_PLDL2KEEP);\
  __prefetch2((const char *)(buffer) + (PREF_OFFSET) + ((ITR) + 2) * 64, MY_PLDL2KEEP);\
  __prefetch2((const char *)(buffer) + (PREF_OFFSET) + ((ITR) + 3) * 64, MY_PLDL2KEEP);
#else
#define PREF4X64L2(buffer, PREF_OFFSET, ITR) \
  __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
  __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
  __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
  __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
#endif
#define PREF1KL2(buffer, PREF_OFFSET) \
PREF4X64L2(buffer,(PREF_OFFSET), 0) \
@ -240,6 +259,16 @@ static unsigned crc32c_aarch64(unsigned crc, const void *buf, size_t len)
#endif
#ifdef HAVE_ARMV8_CRYPTO
/*
  Multiply two 64-bit operands with the PMULL intrinsic and return a
  64-bit part of the result.

  @param a  first operand
  @param b  second operand
  @return   64 bits of the 128-bit product: lane 0 of the result on MSVC,
            the value of the (uint64_t) cast elsewhere.
            NOTE(review): both are presumably the low half of the product
            - confirm against the intrinsic documentation.

  The compiler-specific wrapper exists so callers do not have to repeat the
  verbose MSVC spelling.
*/
static inline uint64_t poly_mul(uint64_t a, uint64_t b)
{
#if defined _MSC_VER && !defined __clang__
/*
  MSVC path: operands are wrapped with vcreate_p64, multiplied with
  neon_pmull_64, and the result is reinterpreted as 64-bit lanes so that
  lane 0 can be extracted instead of casting a 128-bit value.
*/
return vgetq_lane_u64(vreinterpretq_u64_p128(neon_pmull_64(vcreate_p64(a), vcreate_p64(b))),0);
#else
/* Other compilers: the vmull_p64 result can be cast to uint64_t directly. */
return (uint64_t) vmull_p64(a, b);
#endif
}
static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
{
int64_t length= (int64_t)len;
@ -286,8 +315,8 @@ static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
* crc1 multiply by K2
* crc0 multiply by K1
*/
t1= (uint64_t)vmull_p64(crc1, k2);
t0= (uint64_t)vmull_p64(crc0, k1);
t1= poly_mul(crc1, k2);
t0= poly_mul(crc0, k1);
crc= __crc32cd(crc2, *(const uint64_t *)buffer);
crc1= __crc32cd(0, t1);
crc^= crc1;