zcommon: Refactor FPU state handling in fletcher4

Currently, calls to kfpu_begin() and kfpu_end() are split between
the init() and fini() functions of the particular SIMD
implementation. This was done in an earlier change as an
optimization measure for the ABD adapter. Unfortunately, the split
complicates FPU handling on platforms that use a local FPU state
buffer, like Windows and macOS.

To ease porting, we introduce a boolean struct member in
fletcher_4_ops_t, indicating use of the FPU, and move the FPU state
handling from the SIMD implementations to the call sites.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Attila Fülöp <attila@fueloep.org>
Closes 
This commit is contained in:
Attila Fülöp 2023-03-14 17:45:28 +01:00 committed by GitHub
parent b15ab50c4d
commit 78289b8458
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 44 additions and 17 deletions

View File

@ -126,8 +126,9 @@ typedef struct fletcher_4_func {
fletcher_4_fini_f fini_byteswap; fletcher_4_fini_f fini_byteswap;
fletcher_4_compute_f compute_byteswap; fletcher_4_compute_f compute_byteswap;
boolean_t (*valid)(void); boolean_t (*valid)(void);
boolean_t uses_fpu;
const char *name; const char *name;
} fletcher_4_ops_t; } __attribute__((aligned(64))) fletcher_4_ops_t;
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar_ops; _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar_ops;
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar4_ops; _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar4_ops;

View File

@ -578,13 +578,13 @@
<elf-variable-symbols> <elf-variable-symbols>
<elf-symbol name='efi_debug' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='efi_debug' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_abd_ops' size='24' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='fletcher_4_abd_ops' size='24' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_avx2_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='fletcher_4_avx2_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_avx512bw_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='fletcher_4_avx512bw_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_avx512f_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='fletcher_4_avx512f_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_sse2_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='fletcher_4_sse2_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_ssse3_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='fletcher_4_ssse3_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_superscalar4_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='fletcher_4_superscalar4_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='fletcher_4_superscalar_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='spa_feature_table' size='2128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -9053,7 +9053,7 @@
<typedef-decl name='fletcher_4_init_f' type-id='173aa527' id='b9ae1656'/> <typedef-decl name='fletcher_4_init_f' type-id='173aa527' id='b9ae1656'/>
<typedef-decl name='fletcher_4_fini_f' type-id='0ad5b8a8' id='c4c1f4fc'/> <typedef-decl name='fletcher_4_fini_f' type-id='0ad5b8a8' id='c4c1f4fc'/>
<typedef-decl name='fletcher_4_compute_f' type-id='38147eff' id='ad1dc4cb'/> <typedef-decl name='fletcher_4_compute_f' type-id='38147eff' id='ad1dc4cb'/>
<class-decl name='fletcher_4_func' size-in-bits='512' is-struct='yes' visibility='default' id='57f479a0'> <class-decl name='fletcher_4_func' size-in-bits='1024' is-struct='yes' visibility='default' id='57f479a0'>
<data-member access='public' layout-offset-in-bits='0'> <data-member access='public' layout-offset-in-bits='0'>
<var-decl name='init_native' type-id='b9ae1656' visibility='default'/> <var-decl name='init_native' type-id='b9ae1656' visibility='default'/>
</data-member> </data-member>
@ -9076,6 +9076,9 @@
<var-decl name='valid' type-id='297d38bc' visibility='default'/> <var-decl name='valid' type-id='297d38bc' visibility='default'/>
</data-member> </data-member>
<data-member access='public' layout-offset-in-bits='448'> <data-member access='public' layout-offset-in-bits='448'>
<var-decl name='uses_fpu' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='512'>
<var-decl name='name' type-id='80f4b756' visibility='default'/> <var-decl name='name' type-id='80f4b756' visibility='default'/>
</data-member> </data-member>
</class-decl> </class-decl>

View File

@ -160,6 +160,7 @@ static const fletcher_4_ops_t fletcher_4_scalar_ops = {
.fini_byteswap = fletcher_4_scalar_fini, .fini_byteswap = fletcher_4_scalar_fini,
.compute_byteswap = fletcher_4_scalar_byteswap, .compute_byteswap = fletcher_4_scalar_byteswap,
.valid = fletcher_4_scalar_valid, .valid = fletcher_4_scalar_valid,
.uses_fpu = B_FALSE,
.name = "scalar" .name = "scalar"
}; };
@ -458,9 +459,15 @@ fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
fletcher_4_ctx_t ctx; fletcher_4_ctx_t ctx;
const fletcher_4_ops_t *ops = fletcher_4_impl_get(); const fletcher_4_ops_t *ops = fletcher_4_impl_get();
if (ops->uses_fpu == B_TRUE) {
kfpu_begin();
}
ops->init_native(&ctx); ops->init_native(&ctx);
ops->compute_native(&ctx, buf, size); ops->compute_native(&ctx, buf, size);
ops->fini_native(&ctx, zcp); ops->fini_native(&ctx, zcp);
if (ops->uses_fpu == B_TRUE) {
kfpu_end();
}
} }
void void
@ -500,9 +507,15 @@ fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
fletcher_4_ctx_t ctx; fletcher_4_ctx_t ctx;
const fletcher_4_ops_t *ops = fletcher_4_impl_get(); const fletcher_4_ops_t *ops = fletcher_4_impl_get();
if (ops->uses_fpu == B_TRUE) {
kfpu_begin();
}
ops->init_byteswap(&ctx); ops->init_byteswap(&ctx);
ops->compute_byteswap(&ctx, buf, size); ops->compute_byteswap(&ctx, buf, size);
ops->fini_byteswap(&ctx, zcp); ops->fini_byteswap(&ctx, zcp);
if (ops->uses_fpu == B_TRUE) {
kfpu_end();
}
} }
void void
@ -661,6 +674,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \ fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \
fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \ fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \
fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \ fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
fletcher_4_fastest_impl.uses_fpu = src->uses_fpu; \
} }
#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */ #define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */
@ -816,10 +830,14 @@ abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
const fletcher_4_ops_t *ops = fletcher_4_impl_get(); const fletcher_4_ops_t *ops = fletcher_4_impl_get();
cdp->acd_private = (void *) ops; cdp->acd_private = (void *) ops;
if (ops->uses_fpu == B_TRUE) {
kfpu_begin();
}
if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
ops->init_native(cdp->acd_ctx); ops->init_native(cdp->acd_ctx);
else else
ops->init_byteswap(cdp->acd_ctx); ops->init_byteswap(cdp->acd_ctx);
} }
static void static void
@ -833,8 +851,13 @@ abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
ops->fini_native(cdp->acd_ctx, cdp->acd_zcp); ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
else else
ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp); ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
if (ops->uses_fpu == B_TRUE) {
kfpu_end();
}
} }
static void static void
abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size, abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
zio_abd_checksum_data_t *cdp) zio_abd_checksum_data_t *cdp)

View File

@ -52,7 +52,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void static void
fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx) fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
{ {
kfpu_begin();
memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t)); memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
} }
@ -70,7 +69,6 @@ fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] + 8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
ctx->aarch64_neon[1].v[1]; ctx->aarch64_neon[1].v[1];
ZIO_SET_CHECKSUM(zcp, A, B, C, D); ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
} }
#define NEON_INIT_LOOP() \ #define NEON_INIT_LOOP() \
@ -205,6 +203,7 @@ const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
.compute_byteswap = fletcher_4_aarch64_neon_byteswap, .compute_byteswap = fletcher_4_aarch64_neon_byteswap,
.fini_byteswap = fletcher_4_aarch64_neon_fini, .fini_byteswap = fletcher_4_aarch64_neon_fini,
.valid = fletcher_4_aarch64_neon_valid, .valid = fletcher_4_aarch64_neon_valid,
.uses_fpu = B_TRUE,
.name = "aarch64_neon" .name = "aarch64_neon"
}; };

View File

@ -39,7 +39,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void static void
fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx) fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
{ {
kfpu_begin();
memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t)); memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t));
} }
@ -73,7 +72,6 @@ fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
} }
ZIO_SET_CHECKSUM(zcp, A, B, C, D); ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
} }
#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \ #define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \
@ -166,6 +164,7 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
.fini_byteswap = fletcher_4_avx512f_fini, .fini_byteswap = fletcher_4_avx512f_fini,
.compute_byteswap = fletcher_4_avx512f_byteswap, .compute_byteswap = fletcher_4_avx512f_byteswap,
.valid = fletcher_4_avx512f_valid, .valid = fletcher_4_avx512f_valid,
.uses_fpu = B_TRUE,
.name = "avx512f" .name = "avx512f"
}; };
@ -216,6 +215,7 @@ const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
.fini_byteswap = fletcher_4_avx512f_fini, .fini_byteswap = fletcher_4_avx512f_fini,
.compute_byteswap = fletcher_4_avx512bw_byteswap, .compute_byteswap = fletcher_4_avx512bw_byteswap,
.valid = fletcher_4_avx512bw_valid, .valid = fletcher_4_avx512bw_valid,
.uses_fpu = B_TRUE,
.name = "avx512bw" .name = "avx512bw"
}; };
#endif #endif

View File

@ -51,7 +51,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void static void
fletcher_4_avx2_init(fletcher_4_ctx_t *ctx) fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
{ {
kfpu_begin();
memset(ctx->avx, 0, 4 * sizeof (zfs_fletcher_avx_t)); memset(ctx->avx, 0, 4 * sizeof (zfs_fletcher_avx_t));
} }
@ -82,7 +81,6 @@ fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
64 * ctx->avx[3].v[3]; 64 * ctx->avx[3].v[3];
ZIO_SET_CHECKSUM(zcp, A, B, C, D); ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
} }
#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \ #define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \
@ -163,6 +161,7 @@ const fletcher_4_ops_t fletcher_4_avx2_ops = {
.fini_byteswap = fletcher_4_avx2_fini, .fini_byteswap = fletcher_4_avx2_fini,
.compute_byteswap = fletcher_4_avx2_byteswap, .compute_byteswap = fletcher_4_avx2_byteswap,
.valid = fletcher_4_avx2_valid, .valid = fletcher_4_avx2_valid,
.uses_fpu = B_TRUE,
.name = "avx2" .name = "avx2"
}; };

View File

@ -53,7 +53,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void static void
fletcher_4_sse2_init(fletcher_4_ctx_t *ctx) fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
{ {
kfpu_begin();
memset(ctx->sse, 0, 4 * sizeof (zfs_fletcher_sse_t)); memset(ctx->sse, 0, 4 * sizeof (zfs_fletcher_sse_t));
} }
@ -81,7 +80,6 @@ fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
8 * ctx->sse[2].v[1] + ctx->sse[1].v[1]; 8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
ZIO_SET_CHECKSUM(zcp, A, B, C, D); ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
} }
#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \ #define FLETCHER_4_SSE_RESTORE_CTX(ctx) \
@ -164,6 +162,7 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = {
.fini_byteswap = fletcher_4_sse2_fini, .fini_byteswap = fletcher_4_sse2_fini,
.compute_byteswap = fletcher_4_sse2_byteswap, .compute_byteswap = fletcher_4_sse2_byteswap,
.valid = fletcher_4_sse2_valid, .valid = fletcher_4_sse2_valid,
.uses_fpu = B_TRUE,
.name = "sse2" .name = "sse2"
}; };
@ -218,6 +217,7 @@ const fletcher_4_ops_t fletcher_4_ssse3_ops = {
.fini_byteswap = fletcher_4_sse2_fini, .fini_byteswap = fletcher_4_sse2_fini,
.compute_byteswap = fletcher_4_ssse3_byteswap, .compute_byteswap = fletcher_4_ssse3_byteswap,
.valid = fletcher_4_ssse3_valid, .valid = fletcher_4_ssse3_valid,
.uses_fpu = B_TRUE,
.name = "ssse3" .name = "ssse3"
}; };

View File

@ -163,5 +163,6 @@ const fletcher_4_ops_t fletcher_4_superscalar_ops = {
.compute_byteswap = fletcher_4_superscalar_byteswap, .compute_byteswap = fletcher_4_superscalar_byteswap,
.fini_byteswap = fletcher_4_superscalar_fini, .fini_byteswap = fletcher_4_superscalar_fini,
.valid = fletcher_4_superscalar_valid, .valid = fletcher_4_superscalar_valid,
.uses_fpu = B_FALSE,
.name = "superscalar" .name = "superscalar"
}; };

View File

@ -229,5 +229,6 @@ const fletcher_4_ops_t fletcher_4_superscalar4_ops = {
.compute_byteswap = fletcher_4_superscalar4_byteswap, .compute_byteswap = fletcher_4_superscalar4_byteswap,
.fini_byteswap = fletcher_4_superscalar4_fini, .fini_byteswap = fletcher_4_superscalar4_fini,
.valid = fletcher_4_superscalar4_valid, .valid = fletcher_4_superscalar4_valid,
.uses_fpu = B_FALSE,
.name = "superscalar4" .name = "superscalar4"
}; };