zfs/module/zcommon/zfs_fletcher_superscalar.c

/*
 * Implement fast Fletcher4 using superscalar pipelines.
 *
 * Use regular C code to compute
 * Fletcher4 in two incremental 64-bit parallel accumulator streams,
 * and then combine the streams to form the final four checksum words.
 * This implementation is a derivative of the AVX SIMD implementation by
 * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
 *
 * Copyright (C) 2016 Romain Dolbeau.
 *
 * Authors:
 *	Romain Dolbeau <romain.dolbeau@atos.net>
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *	- Redistributions of source code must retain the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer.
 *
 *	- Redistributions in binary form must reproduce the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer in the documentation and/or other materials
 *	  provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <sys/param.h>
#include <sys/byteorder.h>
#include <sys/spa_checksum.h>
#include <sys/string.h>
#include <zfs_fletcher.h>

/*
 * Reset the superscalar accumulator state held in ctx.
 *
 * Clears four zfs_fletcher_superscalar_t entries' worth of bytes starting
 * at ctx->superscalar, so a following compute call starts from all-zero
 * running sums.
 */
static void
fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx)
{
	memset(&ctx->superscalar[0], 0,
	    sizeof (zfs_fletcher_superscalar_t) * 4);
}

/*
 * Combine the two accumulator streams kept in ctx into the final four
 * checksum words and store them in zcp through ZIO_SET_CHECKSUM().
 *
 * ctx->superscalar[k].v[0] and ctx->superscalar[k].v[1] hold the k-th
 * order running sums of stream 0 and stream 1 respectively.  The
 * coefficients below recombine them; all arithmetic is unsigned 64-bit
 * and therefore wraps modulo 2^64.
 */
static void
fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
	/* First-order sums of both streams. */
	const uint64_t a0 = ctx->superscalar[0].v[0];
	const uint64_t a1 = ctx->superscalar[0].v[1];
	/* Second-order sums. */
	const uint64_t b0 = ctx->superscalar[1].v[0];
	const uint64_t b1 = ctx->superscalar[1].v[1];
	/* Third-order sums. */
	const uint64_t c0 = ctx->superscalar[2].v[0];
	const uint64_t c1 = ctx->superscalar[2].v[1];
	/* Fourth-order sums. */
	const uint64_t d0 = ctx->superscalar[3].v[0];
	const uint64_t d1 = ctx->superscalar[3].v[1];
	uint64_t A, B, C, D;

	A = a0 + a1;
	B = 2 * b0 + 2 * b1 - a1;
	C = 4 * c0 - b0 + 4 * c1 - 3 * b1;
	D = 8 * d0 - 4 * c0 + 8 * d1 - 8 * c1 + b1;

	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
}

/*
 * Feed a buffer of native-endian 32-bit words into the two running
 * accumulator streams stored in ctx.
 *
 * Each iteration consumes two words: the first goes to stream 0 (the
 * v[0] slots), the second to stream 1 (the v[1] slots).  The updated
 * sums are written back to ctx when the buffer is exhausted.
 *
 * NOTE: the loop body runs before the end pointer is tested, so at least
 * two words (8 bytes) are always read; callers must not pass a shorter
 * buffer.  With an odd word count the last iteration would read one word
 * past the counted range, so size is presumably always a multiple of 8 --
 * confirm against the callers.
 */
static void
fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size)
{
	const uint32_t *word = buf;
	const uint32_t *const end = word + (size / sizeof (uint32_t));

	/* Stream 0: running sums of order one through four. */
	uint64_t a0 = ctx->superscalar[0].v[0];
	uint64_t b0 = ctx->superscalar[1].v[0];
	uint64_t c0 = ctx->superscalar[2].v[0];
	uint64_t d0 = ctx->superscalar[3].v[0];
	/* Stream 1: same layout, fed by the second word of each pair. */
	uint64_t a1 = ctx->superscalar[0].v[1];
	uint64_t b1 = ctx->superscalar[1].v[1];
	uint64_t c1 = ctx->superscalar[2].v[1];
	uint64_t d1 = ctx->superscalar[3].v[1];

	do {
		/* The two streams are independent and kept interleaved. */
		a0 += word[0];
		a1 += word[1];
		b0 += a0;
		b1 += a1;
		c0 += b0;
		c1 += b1;
		d0 += c0;
		d1 += c1;
		word += 2;
	} while (word < end);

	ctx->superscalar[0].v[0] = a0;
	ctx->superscalar[0].v[1] = a1;
	ctx->superscalar[1].v[0] = b0;
	ctx->superscalar[1].v[1] = b1;
	ctx->superscalar[2].v[0] = c0;
	ctx->superscalar[2].v[1] = c1;
	ctx->superscalar[3].v[0] = d0;
	ctx->superscalar[3].v[1] = d1;
}

/*
 * Feed a buffer of byte-swapped 32-bit words into the two running
 * accumulator streams stored in ctx.
 *
 * Same structure as the native variant, except that every word is passed
 * through BSWAP_32() before being added.  Each iteration consumes two
 * words: the first goes to stream 0 (the v[0] slots), the second to
 * stream 1 (the v[1] slots).
 *
 * NOTE: the loop body runs before the end pointer is tested, so at least
 * two words (8 bytes) are always read; callers must not pass a shorter
 * buffer.  With an odd word count the last iteration would read one word
 * past the counted range, so size is presumably always a multiple of 8 --
 * confirm against the callers.
 */
static void
fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size)
{
	const uint32_t *word = buf;
	const uint32_t *const end = word + (size / sizeof (uint32_t));

	/* Stream 0: running sums of order one through four. */
	uint64_t a0 = ctx->superscalar[0].v[0];
	uint64_t b0 = ctx->superscalar[1].v[0];
	uint64_t c0 = ctx->superscalar[2].v[0];
	uint64_t d0 = ctx->superscalar[3].v[0];
	/* Stream 1: same layout, fed by the second word of each pair. */
	uint64_t a1 = ctx->superscalar[0].v[1];
	uint64_t b1 = ctx->superscalar[1].v[1];
	uint64_t c1 = ctx->superscalar[2].v[1];
	uint64_t d1 = ctx->superscalar[3].v[1];

	do {
		/* Swap each word's byte order before accumulating it. */
		a0 += BSWAP_32(word[0]);
		a1 += BSWAP_32(word[1]);
		b0 += a0;
		b1 += a1;
		c0 += b0;
		c1 += b1;
		d0 += c0;
		d1 += c1;
		word += 2;
	} while (word < end);

	ctx->superscalar[0].v[0] = a0;
	ctx->superscalar[0].v[1] = a1;
	ctx->superscalar[1].v[0] = b0;
	ctx->superscalar[1].v[1] = b1;
	ctx->superscalar[2].v[0] = c0;
	ctx->superscalar[2].v[1] = c1;
	ctx->superscalar[3].v[0] = d0;
	ctx->superscalar[3].v[1] = d1;
}

/*
 * Availability check for this implementation.
 *
 * Always returns B_TRUE: no condition is tested here.
 */
static boolean_t
fletcher_4_superscalar_valid(void)
{
	return (B_TRUE);
}

const fletcher_4_ops_t fletcher_4_superscalar_ops = {
	.init_native = fletcher_4_superscalar_init,
	.compute_native = fletcher_4_superscalar_native,
	.fini_native = fletcher_4_superscalar_fini,
	.init_byteswap = fletcher_4_superscalar_init,
	.compute_byteswap = fletcher_4_superscalar_byteswap,
	.fini_byteswap = fletcher_4_superscalar_fini,
	.valid = fletcher_4_superscalar_valid,
	.uses_fpu = B_FALSE,
	.name = "superscalar"
};
Add superscalar fletcher4 This is the Fletcher4 algorithm implemented in pure C, but using multiple counters using algorithms identical to those used for SSE/NEON and AVX2. This allows for faster execution on core with strong superscalar capabilities but weak SIMD capabilities. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #5317 2016-11-04 17:53:03 +00:00			`/*`
			`* Implement fast Fletcher4 using superscalar pipelines.`
			`*`
			`* Use regular C code to compute`
			`* Fletcher4 in two incremental 64-bit parallel accumulator streams,`
			`* and then combine the streams to form the final four checksum words.`
			`* This implementation is a derivative of the AVX SIMD implementation by`
			`* James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).`
			`*`
			`* Copyright (C) 2016 Romain Dolbeau.`
			`*`
			`* Authors:`
			`* Romain Dolbeau <romain.dolbeau@atos.net>`
			`*`
			`* This software is available to you under a choice of one of two`
			`* licenses. You may choose to be licensed under the terms of the GNU`
			`* General Public License (GPL) Version 2, available from the file`
			`* COPYING in the main directory of this source tree, or the`
			`* OpenIB.org BSD license below:`
			`*`
			`* Redistribution and use in source and binary forms, with or`
			`* without modification, are permitted provided that the following`
			`* conditions are met:`
			`*`
			`* - Redistributions of source code must retain the above`
			`* copyright notice, this list of conditions and the following`
			`* disclaimer.`
			`*`
			`* - Redistributions in binary form must reproduce the above`
			`* copyright notice, this list of conditions and the following`
			`* disclaimer in the documentation and/or other materials`
			`* provided with the distribution.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,`
			`* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF`
			`* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND`
			`* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS`
			`* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN`
			`* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN`
			`* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE`
			`* SOFTWARE.`
			`*/`

Refactor ccompile.h to not include system headers This is a step toward being able to vendor the OpenZFS code in FreeBSD. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Signed-off-by: Matt Macy <mmacy@FreeBSD.org> Closes #10625 2020-07-26 03:09:50 +00:00			`#include <sys/param.h>`
Add superscalar fletcher4 This is the Fletcher4 algorithm implemented in pure C, but using multiple counters using algorithms identical to those used for SSE/NEON and AVX2. This allows for faster execution on core with strong superscalar capabilities but weak SIMD capabilities. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #5317 2016-11-04 17:53:03 +00:00			`#include <sys/byteorder.h>`
			`#include <sys/spa_checksum.h>`
Forbid b{copy,zero,cmp}(). Don't include <strings.h> for <string.h> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz> Closes #12996 2022-01-22 00:56:46 +00:00			`#include <sys/string.h>`
Add superscalar fletcher4 This is the Fletcher4 algorithm implemented in pure C, but using multiple counters using algorithms identical to those used for SSE/NEON and AVX2. This allows for faster execution on core with strong superscalar capabilities but weak SIMD capabilities. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #5317 2016-11-04 17:53:03 +00:00			`#include <zfs_fletcher.h>`

			`static void`
			`fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx)`
			`{`
Remove bcopy(), bzero(), bcmp() bcopy() has a confusing argument order and is actually a move, not a copy; they're all deprecated since POSIX.1-2001 and removed in -2008, and we shim them out to mem*() on Linux anyway Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz> Closes #12996 2022-02-25 13:26:54 +00:00			`memset(ctx->superscalar, 0, 4 * sizeof (zfs_fletcher_superscalar_t));`
Add superscalar fletcher4 This is the Fletcher4 algorithm implemented in pure C, but using multiple counters using algorithms identical to those used for SSE/NEON and AVX2. This allows for faster execution on core with strong superscalar capabilities but weak SIMD capabilities. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #5317 2016-11-04 17:53:03 +00:00			`}`

			`static void`
			`fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)`
			`{`
			`uint64_t A, B, C, D;`
			`A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1];`
			`B = 2 * ctx->superscalar[1].v[0] + 2 * ctx->superscalar[1].v[1] -`
			`ctx->superscalar[0].v[1];`
			`C = 4 * ctx->superscalar[2].v[0] - ctx->superscalar[1].v[0] +`
			`4 * ctx->superscalar[2].v[1] - 3 * ctx->superscalar[1].v[1];`
			`D = 8 * ctx->superscalar[3].v[0] - 4 * ctx->superscalar[2].v[0] +`
			`8 * ctx->superscalar[3].v[1] - 8 * ctx->superscalar[2].v[1] +`
			`ctx->superscalar[1].v[1];`
			`ZIO_SET_CHECKSUM(zcp, A, B, C, D);`
			`}`

			`static void`
			`fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx,`
			`const void *buf, uint64_t size)`
			`{`
			`const uint32_t *ip = buf;`
			`const uint32_t *ipend = ip + (size / sizeof (uint32_t));`
			`uint64_t a, b, c, d;`
			`uint64_t a2, b2, c2, d2;`

			`a = ctx->superscalar[0].v[0];`
			`b = ctx->superscalar[1].v[0];`
			`c = ctx->superscalar[2].v[0];`
			`d = ctx->superscalar[3].v[0];`
			`a2 = ctx->superscalar[0].v[1];`
			`b2 = ctx->superscalar[1].v[1];`
			`c2 = ctx->superscalar[2].v[1];`
			`d2 = ctx->superscalar[3].v[1];`

Micro-optimize fletcher4 calculations When processing abds, we execute 1 `kfpu_begin()`/`kfpu_end()` pair on every page in the abd. This is wasteful and slows down checksum performance versus what the benchmark claimed. We correct this by moving those calls to the init and fini functions. Also, we always check the buffer length against 0 before calling the non-scalar checksum functions. This means that we do not need to execute the loop condition for the first loop iteration. That allows us to micro-optimize the checksum calculations by switching to do-while loops. Note that we do not apply that micro-optimization to the scalar implementation because there is no check in `fletcher_4_incremental_native()`/`fletcher_4_incremental_byteswap()` against 0 sized buffers being passed. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Closes #14247 2022-12-05 19:00:34 +00:00			`do {`
Add superscalar fletcher4 This is the Fletcher4 algorithm implemented in pure C, but using multiple counters using algorithms identical to those used for SSE/NEON and AVX2. This allows for faster execution on core with strong superscalar capabilities but weak SIMD capabilities. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #5317 2016-11-04 17:53:03 +00:00			`a += ip[0];`
			`a2 += ip[1];`
			`b += a;`
			`b2 += a2;`
			`c += b;`
			`c2 += b2;`
			`d += c;`
			`d2 += c2;`
Micro-optimize fletcher4 calculations When processing abds, we execute 1 `kfpu_begin()`/`kfpu_end()` pair on every page in the abd. This is wasteful and slows down checksum performance versus what the benchmark claimed. We correct this by moving those calls to the init and fini functions. Also, we always check the buffer length against 0 before calling the non-scalar checksum functions. This means that we do not need to execute the loop condition for the first loop iteration. That allows us to micro-optimize the checksum calculations by switching to do-while loops. Note that we do not apply that micro-optimization to the scalar implementation because there is no check in `fletcher_4_incremental_native()`/`fletcher_4_incremental_byteswap()` against 0 sized buffers being passed. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Closes #14247 2022-12-05 19:00:34 +00:00			`} while ((ip += 2) < ipend);`
Add superscalar fletcher4 This is the Fletcher4 algorithm implemented in pure C, but using multiple counters using algorithms identical to those used for SSE/NEON and AVX2. This allows for faster execution on core with strong superscalar capabilities but weak SIMD capabilities. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #5317 2016-11-04 17:53:03 +00:00
			`ctx->superscalar[0].v[0] = a;`
			`ctx->superscalar[1].v[0] = b;`
			`ctx->superscalar[2].v[0] = c;`
			`ctx->superscalar[3].v[0] = d;`
			`ctx->superscalar[0].v[1] = a2;`
			`ctx->superscalar[1].v[1] = b2;`
			`ctx->superscalar[2].v[1] = c2;`
			`ctx->superscalar[3].v[1] = d2;`
			`}`

			`static void`
			`fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx,`
			`const void *buf, uint64_t size)`
			`{`
			`const uint32_t *ip = buf;`
			`const uint32_t *ipend = ip + (size / sizeof (uint32_t));`
			`uint64_t a, b, c, d;`
			`uint64_t a2, b2, c2, d2;`

			`a = ctx->superscalar[0].v[0];`
			`b = ctx->superscalar[1].v[0];`
			`c = ctx->superscalar[2].v[0];`
			`d = ctx->superscalar[3].v[0];`
			`a2 = ctx->superscalar[0].v[1];`
			`b2 = ctx->superscalar[1].v[1];`
			`c2 = ctx->superscalar[2].v[1];`
			`d2 = ctx->superscalar[3].v[1];`

Micro-optimize fletcher4 calculations When processing abds, we execute 1 `kfpu_begin()`/`kfpu_end()` pair on every page in the abd. This is wasteful and slows down checksum performance versus what the benchmark claimed. We correct this by moving those calls to the init and fini functions. Also, we always check the buffer length against 0 before calling the non-scalar checksum functions. This means that we do not need to execute the loop condition for the first loop iteration. That allows us to micro-optimize the checksum calculations by switching to do-while loops. Note that we do not apply that micro-optimization to the scalar implementation because there is no check in `fletcher_4_incremental_native()`/`fletcher_4_incremental_byteswap()` against 0 sized buffers being passed. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Closes #14247 2022-12-05 19:00:34 +00:00			`do {`
Add superscalar fletcher4 This is the Fletcher4 algorithm implemented in pure C, but using multiple counters using algorithms identical to those used for SSE/NEON and AVX2. This allows for faster execution on core with strong superscalar capabilities but weak SIMD capabilities. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #5317 2016-11-04 17:53:03 +00:00			`a += BSWAP_32(ip[0]);`
			`a2 += BSWAP_32(ip[1]);`
			`b += a;`
			`b2 += a2;`
			`c += b;`
			`c2 += b2;`
			`d += c;`
			`d2 += c2;`
Micro-optimize fletcher4 calculations When processing abds, we execute 1 `kfpu_begin()`/`kfpu_end()` pair on every page in the abd. This is wasteful and slows down checksum performance versus what the benchmark claimed. We correct this by moving those calls to the init and fini functions. Also, we always check the buffer length against 0 before calling the non-scalar checksum functions. This means that we do not need to execute the loop condition for the first loop iteration. That allows us to micro-optimize the checksum calculations by switching to do-while loops. Note that we do not apply that micro-optimization to the scalar implementation because there is no check in `fletcher_4_incremental_native()`/`fletcher_4_incremental_byteswap()` against 0 sized buffers being passed. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Closes #14247 2022-12-05 19:00:34 +00:00			`} while ((ip += 2) < ipend);`
Add superscalar fletcher4 This is the Fletcher4 algorithm implemented in pure C, but using multiple counters using algorithms identical to those used for SSE/NEON and AVX2. This allows for faster execution on core with strong superscalar capabilities but weak SIMD capabilities. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #5317 2016-11-04 17:53:03 +00:00
			`ctx->superscalar[0].v[0] = a;`
			`ctx->superscalar[1].v[0] = b;`
			`ctx->superscalar[2].v[0] = c;`
			`ctx->superscalar[3].v[0] = d;`
			`ctx->superscalar[0].v[1] = a2;`
			`ctx->superscalar[1].v[1] = b2;`
			`ctx->superscalar[2].v[1] = c2;`
			`ctx->superscalar[3].v[1] = d2;`
			`}`

			`static boolean_t fletcher_4_superscalar_valid(void)`
			`{`
			`return (B_TRUE);`
			`}`

			`const fletcher_4_ops_t fletcher_4_superscalar_ops = {`
			`.init_native = fletcher_4_superscalar_init,`
			`.compute_native = fletcher_4_superscalar_native,`
			`.fini_native = fletcher_4_superscalar_fini,`
			`.init_byteswap = fletcher_4_superscalar_init,`
			`.compute_byteswap = fletcher_4_superscalar_byteswap,`
			`.fini_byteswap = fletcher_4_superscalar_fini,`
			`.valid = fletcher_4_superscalar_valid,`
zcommon: Refactor FPU state handling in fletcher4 Currently calls to kfpu_begin() and kfpu_end() are split between the init() and fini() functions of the particular SIMD implementation. This was done in #14247 as an optimization measure for the ABD adapter. Unfortunately the split complicates FPU handling on platforms that use a local FPU state buffer, like Windows and macOS. To ease porting, we introduce a boolean struct member in fletcher_4_ops_t, indicating use of the FPU, and move the FPU state handling from the SIMD implementations to the call sites. Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de> Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Reviewed-by: Jorgen Lundman <lundman@lundman.net> Signed-off-by: Attila Fülöp <attila@fueloep.org> Closes #14600 2023-03-14 16:45:28 +00:00			`.uses_fpu = B_FALSE,`
Add superscalar fletcher4 This is the Fletcher4 algorithm implemented in pure C, but using multiple counters using algorithms identical to those used for SSE/NEON and AVX2. This allows for faster execution on core with strong superscalar capabilities but weak SIMD capabilities. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net> Closes #5317 2016-11-04 17:53:03 +00:00			`.name = "superscalar"`
			`};`