From 78289b84589e632d87504df6a9c63b5ac694d2f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= Date: Tue, 14 Mar 2023 17:45:28 +0100 Subject: [PATCH] zcommon: Refactor FPU state handling in fletcher4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently calls to kfpu_begin() and kfpu_end() are split between the init() and fini() functions of the particular SIMD implementation. This was done in #14247 as an optimization measure for the ABD adapter. Unfortunately the split complicates FPU handling on platforms that use a local FPU state buffer, like Windows and macOS. To ease porting, we introduce a boolean struct member in fletcher_4_ops_t, indicating use of the FPU, and move the FPU state handling from the SIMD implementations to the call sites. Reviewed-by: Tino Reichardt Reviewed-by: Richard Yao Reviewed-by: Jorgen Lundman Signed-off-by: Attila Fülöp Closes #14600 --- include/zfs_fletcher.h | 3 ++- lib/libzfs/libzfs.abi | 19 ++++++++++-------- module/zcommon/zfs_fletcher.c | 23 ++++++++++++++++++++++ module/zcommon/zfs_fletcher_aarch64_neon.c | 3 +-- module/zcommon/zfs_fletcher_avx512.c | 4 ++-- module/zcommon/zfs_fletcher_intel.c | 3 +-- module/zcommon/zfs_fletcher_sse.c | 4 ++-- module/zcommon/zfs_fletcher_superscalar.c | 1 + module/zcommon/zfs_fletcher_superscalar4.c | 1 + 9 files changed, 44 insertions(+), 17 deletions(-) diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index db5b539648..e913a0bd7f 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -126,8 +126,9 @@ typedef struct fletcher_4_func { fletcher_4_fini_f fini_byteswap; fletcher_4_compute_f compute_byteswap; boolean_t (*valid)(void); + boolean_t uses_fpu; const char *name; -} fletcher_4_ops_t; +} __attribute__((aligned(64))) fletcher_4_ops_t; _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar_ops; _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar4_ops; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 918a9105b2..9d3c0379da 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -578,13 +578,13 @@ - - - - - - - + + + + + + + @@ -9053,7 +9053,7 @@ - + @@ -9076,6 +9076,9 @@ + + + diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index fa9b8447e9..eae854f3d4 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -160,6 +160,7 @@ static const fletcher_4_ops_t fletcher_4_scalar_ops = { .fini_byteswap = fletcher_4_scalar_fini, .compute_byteswap = fletcher_4_scalar_byteswap, .valid = fletcher_4_scalar_valid, + .uses_fpu = B_FALSE, .name = "scalar" }; @@ -458,9 +459,15 @@ fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) fletcher_4_ctx_t ctx; const fletcher_4_ops_t *ops = fletcher_4_impl_get(); + if (ops->uses_fpu == B_TRUE) { + kfpu_begin(); + } ops->init_native(&ctx); ops->compute_native(&ctx, buf, size); ops->fini_native(&ctx, zcp); + if (ops->uses_fpu == B_TRUE) { + kfpu_end(); + } } void @@ -500,9 +507,15 @@ fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) fletcher_4_ctx_t ctx; const fletcher_4_ops_t *ops = fletcher_4_impl_get(); + if (ops->uses_fpu == B_TRUE) { + kfpu_begin(); + } ops->init_byteswap(&ctx); ops->compute_byteswap(&ctx, buf, size); ops->fini_byteswap(&ctx, zcp); + if (ops->uses_fpu == B_TRUE) { + kfpu_end(); + } } void @@ -661,6 +674,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \ fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \ fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \ + fletcher_4_fastest_impl.uses_fpu = src->uses_fpu; \ } #define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */ @@ -816,10 +830,14 @@ abd_fletcher_4_init(zio_abd_checksum_data_t *cdp) const fletcher_4_ops_t *ops = fletcher_4_impl_get(); cdp->acd_private = (void *) ops; + if (ops->uses_fpu == B_TRUE) { + kfpu_begin(); + } if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) ops->init_native(cdp->acd_ctx); else ops->init_byteswap(cdp->acd_ctx); + } static void @@ -833,8 +851,13 @@ abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp) ops->fini_native(cdp->acd_ctx, cdp->acd_zcp); else ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp); + + if (ops->uses_fpu == B_TRUE) { + kfpu_end(); + } } + static void abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size, zio_abd_checksum_data_t *cdp) diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c index 8f03397288..cd5fe545a1 100644 --- a/module/zcommon/zfs_fletcher_aarch64_neon.c +++ b/module/zcommon/zfs_fletcher_aarch64_neon.c @@ -52,7 +52,6 @@ ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx) { - kfpu_begin(); memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t)); } @@ -70,7 +69,6 @@ fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) 8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] + ctx->aarch64_neon[1].v[1]; ZIO_SET_CHECKSUM(zcp, A, B, C, D); - kfpu_end(); } #define NEON_INIT_LOOP() \ @@ -205,6 +203,7 @@ const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = { .compute_byteswap = fletcher_4_aarch64_neon_byteswap, .fini_byteswap = fletcher_4_aarch64_neon_fini, .valid = fletcher_4_aarch64_neon_valid, + .uses_fpu = B_TRUE, .name = "aarch64_neon" }; diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c index 4a3d5cb24a..81182ead2c 100644 --- a/module/zcommon/zfs_fletcher_avx512.c +++ b/module/zcommon/zfs_fletcher_avx512.c @@ -39,7 +39,6 @@ ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx) { - kfpu_begin(); memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t)); } @@ -73,7 +72,6 @@ fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) } ZIO_SET_CHECKSUM(zcp, A, B, C, D); - kfpu_end(); } #define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \ @@ -166,6 +164,7 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = { .fini_byteswap = fletcher_4_avx512f_fini, .compute_byteswap = fletcher_4_avx512f_byteswap, .valid = fletcher_4_avx512f_valid, + .uses_fpu = B_TRUE, .name = "avx512f" }; @@ -216,6 +215,7 @@ const fletcher_4_ops_t fletcher_4_avx512bw_ops = { .fini_byteswap = fletcher_4_avx512f_fini, .compute_byteswap = fletcher_4_avx512bw_byteswap, .valid = fletcher_4_avx512bw_valid, + .uses_fpu = B_TRUE, .name = "avx512bw" }; #endif diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c index c124d49280..6108bda7a0 100644 --- a/module/zcommon/zfs_fletcher_intel.c +++ b/module/zcommon/zfs_fletcher_intel.c @@ -51,7 +51,6 @@ ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_avx2_init(fletcher_4_ctx_t *ctx) { - kfpu_begin(); memset(ctx->avx, 0, 4 * sizeof (zfs_fletcher_avx_t)); } @@ -82,7 +81,6 @@ fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) 64 * ctx->avx[3].v[3]; ZIO_SET_CHECKSUM(zcp, A, B, C, D); - kfpu_end(); } #define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \ @@ -163,6 +161,7 @@ const fletcher_4_ops_t fletcher_4_avx2_ops = { .fini_byteswap = fletcher_4_avx2_fini, .compute_byteswap = fletcher_4_avx2_byteswap, .valid = fletcher_4_avx2_valid, + .uses_fpu = B_TRUE, .name = "avx2" }; diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c index 6c78830be9..096472c9af 100644 --- a/module/zcommon/zfs_fletcher_sse.c +++ b/module/zcommon/zfs_fletcher_sse.c @@ -53,7 +53,6 @@ ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_sse2_init(fletcher_4_ctx_t *ctx) { - kfpu_begin(); memset(ctx->sse, 0, 4 * sizeof (zfs_fletcher_sse_t)); } @@ -81,7 +80,6 @@ fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) 8 * ctx->sse[2].v[1] + ctx->sse[1].v[1]; ZIO_SET_CHECKSUM(zcp, A, B, C, D); - kfpu_end(); } #define FLETCHER_4_SSE_RESTORE_CTX(ctx) \ @@ -164,6 +162,7 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = { .fini_byteswap = fletcher_4_sse2_fini, .compute_byteswap = fletcher_4_sse2_byteswap, .valid = fletcher_4_sse2_valid, + .uses_fpu = B_TRUE, .name = "sse2" }; @@ -218,6 +217,7 @@ const fletcher_4_ops_t fletcher_4_ssse3_ops = { .fini_byteswap = fletcher_4_sse2_fini, .compute_byteswap = fletcher_4_ssse3_byteswap, .valid = fletcher_4_ssse3_valid, + .uses_fpu = B_TRUE, .name = "ssse3" }; diff --git a/module/zcommon/zfs_fletcher_superscalar.c b/module/zcommon/zfs_fletcher_superscalar.c index 67dc095927..8b5b72a7b8 100644 --- a/module/zcommon/zfs_fletcher_superscalar.c +++ b/module/zcommon/zfs_fletcher_superscalar.c @@ -163,5 +163,6 @@ const fletcher_4_ops_t fletcher_4_superscalar_ops = { .compute_byteswap = fletcher_4_superscalar_byteswap, .fini_byteswap = fletcher_4_superscalar_fini, .valid = fletcher_4_superscalar_valid, + .uses_fpu = B_FALSE, .name = "superscalar" }; diff --git a/module/zcommon/zfs_fletcher_superscalar4.c b/module/zcommon/zfs_fletcher_superscalar4.c index d2067c12f8..bef3879339 100644 --- a/module/zcommon/zfs_fletcher_superscalar4.c +++ b/module/zcommon/zfs_fletcher_superscalar4.c @@ -229,5 +229,6 @@ const fletcher_4_ops_t fletcher_4_superscalar4_ops = { .compute_byteswap = fletcher_4_superscalar4_byteswap, .fini_byteswap = fletcher_4_superscalar4_fini, .valid = fletcher_4_superscalar4_valid, + .uses_fpu = B_FALSE, .name = "superscalar4" };