Add AVX512BW variant of fletcher
It is much faster than AVX512F when byteswapping on Skylake-SP and newer, as we can do the byteswap in a single vpshufb instead of many instructions. Reviewed-by: Gvozden Neskovic <neskovic@gmail.com> Reviewed-by: Chunwei Chen <tuxoko@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #9517
This commit is contained in:
parent
bae11ba8dc
commit
0b2a642351
|
@ -143,6 +143,10 @@ extern const fletcher_4_ops_t fletcher_4_avx2_ops;
|
|||
extern const fletcher_4_ops_t fletcher_4_avx512f_ops;
|
||||
#endif
|
||||
|
||||
/*
 * Operations table for the AVX512BW fletcher 4 implementation.  It is only
 * declared when building for x86_64 with HAVE_AVX512BW defined.
 */
#if defined(__x86_64) && defined(HAVE_AVX512BW)
|
||||
extern const fletcher_4_ops_t fletcher_4_avx512bw_ops;
|
||||
#endif
|
||||
|
||||
/*
 * Operations table for the aarch64 NEON fletcher 4 implementation.  It is
 * only declared when building for aarch64.
 */
#if defined(__aarch64__)
|
||||
extern const fletcher_4_ops_t fletcher_4_aarch64_neon_ops;
|
||||
#endif
|
||||
|
|
|
@ -1507,7 +1507,7 @@ Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
|
|||
Select a fletcher 4 implementation.
|
||||
.sp
|
||||
Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR,
|
||||
\fBavx2\fR, \fBavx512f\fR, and \fBaarch64_neon\fR.
|
||||
\fBavx2\fR, \fBavx512f\fR, \fBavx512bw\fR, and \fBaarch64_neon\fR.
|
||||
All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction
|
||||
set extensions to be available and will only appear if ZFS detects that they are
|
||||
present at runtime. If multiple implementations of fletcher 4 are available,
|
||||
|
|
|
@ -184,6 +184,9 @@ static const fletcher_4_ops_t *fletcher_4_impls[] = {
|
|||
#if defined(__x86_64) && defined(HAVE_AVX512F)
|
||||
&fletcher_4_avx512f_ops,
|
||||
#endif
|
||||
#if defined(__x86_64) && defined(HAVE_AVX512BW)
|
||||
&fletcher_4_avx512bw_ops,
|
||||
#endif
|
||||
#if defined(__aarch64__)
|
||||
&fletcher_4_aarch64_neon_ops,
|
||||
#endif
|
||||
|
|
|
@ -171,4 +171,53 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
|
|||
.name = "avx512f"
|
||||
};
|
||||
|
||||
#if defined(HAVE_AVX512BW)
|
||||
/*
 * Byteswapping fletcher 4 compute routine for AVX512BW.
 *
 * Walks buf in steps of eight 32-bit words.  Each step loads the words
 * zero-extended into the eight 64-bit lanes of zmm4, reverses the byte
 * order of every word with a single vpshufb, and folds the result into the
 * four chained accumulators held in zmm0..zmm3.
 *
 * ctx:  running checksum state, handed to the RESTORE/SAVE context macros.
 * buf:  start of the input data.
 * size: length of the input in bytes.
 *
 * NOTE(review): the loop consumes 32 bytes per iteration with no tail
 * handling, so size is presumably always a multiple of 32 -- confirm
 * against the callers.
 */
static void
|
||||
fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
|
||||
uint64_t size)
|
||||
{
|
||||
/*
 * vpshufb control mask, later loaded into zmm5.  Each byte is a source
 * index within its 128-bit lane: the low four bytes of every 64-bit
 * element pick source bytes 3,2,1,0 (or 11,10,9,8 for the upper element
 * of the lane), which reverses the 32-bit word.  The 0xFF index bytes
 * have the top bit set, so vpshufb writes zero to the upper half of each
 * element.
 */
static const zfs_fletcher_avx512_t mask = {
|
||||
.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
|
||||
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
|
||||
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
|
||||
0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
|
||||
};
|
||||
const uint32_t *ip = buf;
|
||||
/* One past the last input byte, expressed as a word pointer. */
const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
|
||||
|
||||
/*
 * Vector registers are used from here until kfpu_end(); presumably
 * kfpu_begin() makes that safe in kernel context -- see its definition.
 */
kfpu_begin();
|
||||
|
||||
/*
 * Presumably loads the running sums from ctx into zmm0..zmm3; the macro
 * is defined outside this view.
 */
FLETCHER_4_AVX512_RESTORE_CTX(ctx);
|
||||
|
||||
/* Load the shuffle mask once; zmm5 stays constant across the loop. */
__asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));
|
||||
|
||||
for (; ip < ipend; ip += 8) {
|
||||
/* Load eight 32-bit words, zero-extended to 64-bit lanes of zmm4. */
__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
|
||||
|
||||
/* Byteswap every word in place using the mask in zmm5. */
__asm("vpshufb %zmm5, %zmm4, %zmm4");
|
||||
|
||||
/*
 * Chained accumulation (AT&T operand order, destination last):
 * zmm0 += zmm4, zmm1 += zmm0, zmm2 += zmm1, zmm3 += zmm2.
 */
__asm("vpaddq %zmm4, %zmm0, %zmm0");
|
||||
__asm("vpaddq %zmm0, %zmm1, %zmm1");
|
||||
__asm("vpaddq %zmm1, %zmm2, %zmm2");
|
||||
__asm("vpaddq %zmm2, %zmm3, %zmm3");
|
||||
}
|
||||
|
||||
/*
 * Presumably stores zmm0..zmm3 back into ctx.  No trailing semicolon is
 * used here, so the macro must expand to a complete statement or block.
 */
FLETCHER_4_AVX512_SAVE_CTX(ctx)
|
||||
|
||||
kfpu_end();
|
||||
}
|
||||
/*
 * NOTE(review): presumably exempts this function from objtool stack-frame
 * validation -- confirm against the macro definition.
 */
STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
|
||||
|
||||
const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
|
||||
.init_native = fletcher_4_avx512f_init,
|
||||
.fini_native = fletcher_4_avx512f_fini,
|
||||
.compute_native = fletcher_4_avx512f_native,
|
||||
.init_byteswap = fletcher_4_avx512f_init,
|
||||
.fini_byteswap = fletcher_4_avx512f_fini,
|
||||
.compute_byteswap = fletcher_4_avx512bw_byteswap,
|
||||
.valid = fletcher_4_avx512f_valid,
|
||||
.name = "avx512bw"
|
||||
};
|
||||
#endif
|
||||
|
||||
#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */
|
||||
|
|
Loading…
Reference in New Issue