zcommon: Refactor FPU state handling in fletcher4
Currently calls to kfpu_begin() and kfpu_end() are split between the init() and fini() functions of the particular SIMD implementation. This was done in #14247 as an optimization measure for the ABD adapter. Unfortunately the split complicates FPU handling on platforms that use a local FPU state buffer, like Windows and macOS.

To ease porting, we introduce a boolean struct member in fletcher_4_ops_t, indicating use of the FPU, and move the FPU state handling from the SIMD implementations to the call sites.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Attila Fülöp <attila@fueloep.org>
Closes #14600
This commit is contained in:
parent
b15ab50c4d
commit
78289b8458
|
@ -126,8 +126,9 @@ typedef struct fletcher_4_func {
|
|||
fletcher_4_fini_f fini_byteswap;
|
||||
fletcher_4_compute_f compute_byteswap;
|
||||
boolean_t (*valid)(void);
|
||||
boolean_t uses_fpu;
|
||||
const char *name;
|
||||
} fletcher_4_ops_t;
|
||||
} __attribute__((aligned(64))) fletcher_4_ops_t;
|
||||
|
||||
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar_ops;
|
||||
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar4_ops;
|
||||
|
|
|
@ -578,13 +578,13 @@
|
|||
<elf-variable-symbols>
|
||||
<elf-symbol name='efi_debug' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_abd_ops' size='24' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_avx2_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_avx512bw_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_avx512f_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_sse2_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_ssse3_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_superscalar4_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_superscalar_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_avx2_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_avx512bw_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_avx512f_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_sse2_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_ssse3_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_superscalar4_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
<elf-symbol name='spa_feature_table' size='2128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
|
||||
|
@ -9053,7 +9053,7 @@
|
|||
<typedef-decl name='fletcher_4_init_f' type-id='173aa527' id='b9ae1656'/>
|
||||
<typedef-decl name='fletcher_4_fini_f' type-id='0ad5b8a8' id='c4c1f4fc'/>
|
||||
<typedef-decl name='fletcher_4_compute_f' type-id='38147eff' id='ad1dc4cb'/>
|
||||
<class-decl name='fletcher_4_func' size-in-bits='512' is-struct='yes' visibility='default' id='57f479a0'>
|
||||
<class-decl name='fletcher_4_func' size-in-bits='1024' is-struct='yes' visibility='default' id='57f479a0'>
|
||||
<data-member access='public' layout-offset-in-bits='0'>
|
||||
<var-decl name='init_native' type-id='b9ae1656' visibility='default'/>
|
||||
</data-member>
|
||||
|
@ -9076,6 +9076,9 @@
|
|||
<var-decl name='valid' type-id='297d38bc' visibility='default'/>
|
||||
</data-member>
|
||||
<data-member access='public' layout-offset-in-bits='448'>
|
||||
<var-decl name='uses_fpu' type-id='c19b74c3' visibility='default'/>
|
||||
</data-member>
|
||||
<data-member access='public' layout-offset-in-bits='512'>
|
||||
<var-decl name='name' type-id='80f4b756' visibility='default'/>
|
||||
</data-member>
|
||||
</class-decl>
|
||||
|
|
|
@ -160,6 +160,7 @@ static const fletcher_4_ops_t fletcher_4_scalar_ops = {
|
|||
.fini_byteswap = fletcher_4_scalar_fini,
|
||||
.compute_byteswap = fletcher_4_scalar_byteswap,
|
||||
.valid = fletcher_4_scalar_valid,
|
||||
.uses_fpu = B_FALSE,
|
||||
.name = "scalar"
|
||||
};
|
||||
|
||||
|
@ -458,9 +459,15 @@ fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
|||
fletcher_4_ctx_t ctx;
|
||||
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
|
||||
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_begin();
|
||||
}
|
||||
ops->init_native(&ctx);
|
||||
ops->compute_native(&ctx, buf, size);
|
||||
ops->fini_native(&ctx, zcp);
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_end();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -500,9 +507,15 @@ fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
|
|||
fletcher_4_ctx_t ctx;
|
||||
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
|
||||
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_begin();
|
||||
}
|
||||
ops->init_byteswap(&ctx);
|
||||
ops->compute_byteswap(&ctx, buf, size);
|
||||
ops->fini_byteswap(&ctx, zcp);
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_end();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -661,6 +674,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
|
|||
fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \
|
||||
fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \
|
||||
fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
|
||||
fletcher_4_fastest_impl.uses_fpu = src->uses_fpu; \
|
||||
}
|
||||
|
||||
#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */
|
||||
|
@ -816,10 +830,14 @@ abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
|
|||
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
|
||||
cdp->acd_private = (void *) ops;
|
||||
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_begin();
|
||||
}
|
||||
if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
|
||||
ops->init_native(cdp->acd_ctx);
|
||||
else
|
||||
ops->init_byteswap(cdp->acd_ctx);
|
||||
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -833,8 +851,13 @@ abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
|
|||
ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
|
||||
else
|
||||
ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
|
||||
|
||||
if (ops->uses_fpu == B_TRUE) {
|
||||
kfpu_end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
|
||||
zio_abd_checksum_data_t *cdp)
|
||||
|
|
|
@ -52,7 +52,6 @@ ZFS_NO_SANITIZE_UNDEFINED
|
|||
static void
|
||||
fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
kfpu_begin();
|
||||
memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
|
||||
}
|
||||
|
||||
|
@ -70,7 +69,6 @@ fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
|||
8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
|
||||
ctx->aarch64_neon[1].v[1];
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
#define NEON_INIT_LOOP() \
|
||||
|
@ -205,6 +203,7 @@ const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
|
|||
.compute_byteswap = fletcher_4_aarch64_neon_byteswap,
|
||||
.fini_byteswap = fletcher_4_aarch64_neon_fini,
|
||||
.valid = fletcher_4_aarch64_neon_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "aarch64_neon"
|
||||
};
|
||||
|
||||
|
|
|
@ -39,7 +39,6 @@ ZFS_NO_SANITIZE_UNDEFINED
|
|||
static void
|
||||
fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
kfpu_begin();
|
||||
memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t));
|
||||
}
|
||||
|
||||
|
@ -73,7 +72,6 @@ fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
|||
}
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \
|
||||
|
@ -166,6 +164,7 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
|
|||
.fini_byteswap = fletcher_4_avx512f_fini,
|
||||
.compute_byteswap = fletcher_4_avx512f_byteswap,
|
||||
.valid = fletcher_4_avx512f_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "avx512f"
|
||||
};
|
||||
|
||||
|
@ -216,6 +215,7 @@ const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
|
|||
.fini_byteswap = fletcher_4_avx512f_fini,
|
||||
.compute_byteswap = fletcher_4_avx512bw_byteswap,
|
||||
.valid = fletcher_4_avx512bw_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "avx512bw"
|
||||
};
|
||||
#endif
|
||||
|
|
|
@ -51,7 +51,6 @@ ZFS_NO_SANITIZE_UNDEFINED
|
|||
static void
|
||||
fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
kfpu_begin();
|
||||
memset(ctx->avx, 0, 4 * sizeof (zfs_fletcher_avx_t));
|
||||
}
|
||||
|
||||
|
@ -82,7 +81,6 @@ fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
|||
64 * ctx->avx[3].v[3];
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \
|
||||
|
@ -163,6 +161,7 @@ const fletcher_4_ops_t fletcher_4_avx2_ops = {
|
|||
.fini_byteswap = fletcher_4_avx2_fini,
|
||||
.compute_byteswap = fletcher_4_avx2_byteswap,
|
||||
.valid = fletcher_4_avx2_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "avx2"
|
||||
};
|
||||
|
||||
|
|
|
@ -53,7 +53,6 @@ ZFS_NO_SANITIZE_UNDEFINED
|
|||
static void
|
||||
fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
|
||||
{
|
||||
kfpu_begin();
|
||||
memset(ctx->sse, 0, 4 * sizeof (zfs_fletcher_sse_t));
|
||||
}
|
||||
|
||||
|
@ -81,7 +80,6 @@ fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
|
|||
8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
|
||||
|
||||
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
|
||||
kfpu_end();
|
||||
}
|
||||
|
||||
#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \
|
||||
|
@ -164,6 +162,7 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = {
|
|||
.fini_byteswap = fletcher_4_sse2_fini,
|
||||
.compute_byteswap = fletcher_4_sse2_byteswap,
|
||||
.valid = fletcher_4_sse2_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "sse2"
|
||||
};
|
||||
|
||||
|
@ -218,6 +217,7 @@ const fletcher_4_ops_t fletcher_4_ssse3_ops = {
|
|||
.fini_byteswap = fletcher_4_sse2_fini,
|
||||
.compute_byteswap = fletcher_4_ssse3_byteswap,
|
||||
.valid = fletcher_4_ssse3_valid,
|
||||
.uses_fpu = B_TRUE,
|
||||
.name = "ssse3"
|
||||
};
|
||||
|
||||
|
|
|
@ -163,5 +163,6 @@ const fletcher_4_ops_t fletcher_4_superscalar_ops = {
|
|||
.compute_byteswap = fletcher_4_superscalar_byteswap,
|
||||
.fini_byteswap = fletcher_4_superscalar_fini,
|
||||
.valid = fletcher_4_superscalar_valid,
|
||||
.uses_fpu = B_FALSE,
|
||||
.name = "superscalar"
|
||||
};
|
||||
|
|
|
@ -229,5 +229,6 @@ const fletcher_4_ops_t fletcher_4_superscalar4_ops = {
|
|||
.compute_byteswap = fletcher_4_superscalar4_byteswap,
|
||||
.fini_byteswap = fletcher_4_superscalar4_fini,
|
||||
.valid = fletcher_4_superscalar4_valid,
|
||||
.uses_fpu = B_FALSE,
|
||||
.name = "superscalar4"
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue