Fletcher4: Incremental using SIMD

Combine incrementally computed fletcher4 checksums. Checksums are combined
a posteriori, which allows parallel computation on chunks to be implemented if
required. The algorithm is general and requires no changes to the individual
SIMD implementations.
A new test in ztest verifies incremental fletcher computations.

The checksum-combining formulas for two consecutive buffers `a` and `b` are given
below, where `Ca` and `Cb` are their respective fletcher4 checksums, `Cab` is the
combined checksum, and `s` is the size of buffer `b` divided by sizeof(uint32_t):

Cab[A] = Cb[A] + Ca[A]
Cab[B] = Cb[B] + Ca[B] + s * Ca[A]
Cab[C] = Cb[C] + Ca[C] + s * Ca[B] + s(s+1)/2 * Ca[A]
Cab[D] = Cb[D] + Ca[D] + s * Ca[C] + s(s+1)/2 * Ca[B] + s(s+1)(s+2)/6 * Ca[A]

NOTE: this calculation overflows for larger buffers. Therefore, internally, the
calculation is performed on chunks of at most 8MiB.

Signed-off-by: Gvozden Neskovic <neskovic@gmail.com>
This commit is contained in:
Gvozden Neskovic 2016-09-23 03:52:29 +02:00
parent dc03fa3092
commit 37f520db2d
2 changed files with 136 additions and 18 deletions

View File

@ -332,6 +332,7 @@ ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid; ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade; ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt; ztest_func_t ztest_verify_dnode_bt;
uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
@ -379,6 +380,7 @@ ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
}; };
@ -5674,6 +5676,82 @@ ztest_fletcher(ztest_ds_t *zd, uint64_t id)
} }
} }
/*
 * ztest_fletcher_incr: check that incrementally computed fletcher4
 * checksums agree with one-shot reference checksums.
 *
 * Until gethrtime() passes (start + NANOSEC), the test repeatedly:
 *   1. allocates a buffer of ztest_random_blocksize() bytes and fills it
 *      with random 32-bit words;
 *   2. selects the "scalar" implementation and computes the reference
 *      native and byteswap checksums over the whole buffer;
 *   3. selects "cycle" and, 100 times, verifies that both
 *        - feeding the buffer in randomly sized pieces, and
 *        - feeding the whole buffer in one incremental call
 *      reproduce the reference checksums.
 *
 * zd, id: required by the ztest_func_t signature; not used here.
 */
void
ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id)
{
	const hrtime_t deadline = gethrtime() + NANOSEC;

	while (gethrtime() <= deadline) {
		zio_cksum_t zc_ref;
		zio_cksum_t zc_ref_bswap;
		int run_count = 100;
		const size_t size = ztest_random_blocksize();
		void *buf = umem_alloc(size, UMEM_NOFAIL);
		/*
		 * Typed views of the buffer: offsets are taken on a byte
		 * pointer (arithmetic on void * is not ISO C), and the random
		 * fill writes unsigned 32-bit words.
		 */
		const char *bytes = buf;
		uint32_t *words = buf;
		const size_t nwords = size / sizeof (*words);
		size_t i;

		for (i = 0; i < nwords; i++)
			words[i] = (uint32_t)ztest_random(UINT_MAX);

		/* reference values come from the scalar implementation */
		VERIFY0(fletcher_4_impl_set("scalar"));
		fletcher_4_native(buf, size, NULL, &zc_ref);
		fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap);

		VERIFY0(fletcher_4_impl_set("cycle"));

		while (run_count-- > 0) {
			zio_cksum_t zc;
			zio_cksum_t zc_bswap;
			size_t pos = 0;

			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
			ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);

			while (pos < size) {
				/* piece length: a random multiple of 64 */
				size_t inc = 64 * ztest_random(size / 67);

				/* sometimes add few bytes to test non-simd */
				if (ztest_random(100) < 10)
					inc += P2ALIGN(ztest_random(64),
					    sizeof (uint32_t));

				/* never run past the end of the buffer */
				if (inc > (size - pos))
					inc = size - pos;

				fletcher_4_incremental_native(bytes + pos, inc,
				    &zc);
				fletcher_4_incremental_byteswap(bytes + pos,
				    inc, &zc_bswap);

				pos += inc;
			}

			VERIFY3U(pos, ==, size);

			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));

			/*
			 * verify if incremental on the whole buffer is
			 * equivalent to non-incremental version
			 */
			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
			ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);

			fletcher_4_incremental_native(buf, size, &zc);
			fletcher_4_incremental_byteswap(buf, size, &zc_bswap);

			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
		}

		umem_free(buf, size);
	}
}
static int static int
ztest_check_path(char *path) ztest_check_path(char *path)
{ {

View File

@ -383,24 +383,6 @@ fletcher_4_impl_get(void)
return (ops); return (ops);
} }
/*
 * Incremental fletcher4 entry point, native byte order.
 *
 * Asserts that size is aligned to sizeof (uint32_t), then hands buf, size
 * and the caller's zcp unchanged to fletcher_4_scalar_native().
 */
void
fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	/* only whole 32-bit words are accepted */
	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	fletcher_4_scalar_native(buf, size, zcp);
}
/*
 * Incremental fletcher4 entry point, byteswapped input.
 *
 * Asserts that size is aligned to sizeof (uint32_t), then hands buf, size
 * and the caller's zcp unchanged to fletcher_4_scalar_byteswap().
 */
void
fletcher_4_incremental_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	/* only whole 32-bit words are accepted */
	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	fletcher_4_scalar_byteswap(buf, size, zcp);
}
static inline void static inline void
fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf, fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf,
uint64_t size, zio_cksum_t *zcp) uint64_t size, zio_cksum_t *zcp)
@ -477,6 +459,64 @@ fletcher_4_byteswap(const void *buf, uint64_t size,
} }
} }
/* Incremental Fletcher 4 */
/*
 * Combine the fletcher4 checksums of two consecutive buffers.
 *
 *   zcp   in:  checksum of the leading buffer
 *         out: checksum of the leading buffer followed by the trailing one
 *   size  length, in bytes, of the trailing buffer
 *   nzcp  checksum of the trailing buffer
 *
 * With s = size / sizeof (uint32_t), and words [0..3] being A, B, C, D:
 *
 *   A' = nA + A
 *   B' = nB + B + s * A
 *   C' = nC + C + s * B + s(s+1)/2 * A
 *   D' = nD + D + s * C + s(s+1)/2 * B + s(s+1)(s+2)/6 * A
 *
 * The coefficients are computed by dividing out the factors 2 and 3 from
 * s, s+1, s+2 *before* multiplying.  Each division is exact, so the products
 * equal the true binomial coefficients modulo 2^64 for any size, instead of
 * silently wrapping inside s * (s + 1) or s(s+1)/2 * (s + 2) ahead of the
 * division.  For sizes where the straightforward formula does not overflow
 * the results are identical.
 * NOTE(review): exactness modulo 2^64 assumes zc_word[] are uint64_t
 * accumulators -- confirm against the zio_cksum_t definition.
 */
static inline void
fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
    const zio_cksum_t *nzcp)
{
	const uint64_t c1 = size / sizeof (uint32_t);
	/* c1 <= UINT64_MAX / 4, so c1 + 1 and c1 + 2 cannot wrap */
	uint64_t f0 = c1;
	uint64_t f1 = c1 + 1;
	uint64_t f2 = c1 + 2;
	uint64_t c2, c3;

	/* exactly one of s, s+1 is even: halve that one */
	if (f0 & 1)
		f1 >>= 1;
	else
		f0 >>= 1;
	c2 = f0 * f1;			/* s(s+1)/2 */

	/*
	 * Exactly one of s, s+1, s+2 is a multiple of 3; halving above does
	 * not change divisibility by 3.
	 */
	if (f0 % 3 == 0)
		f0 /= 3;
	else if (f1 % 3 == 0)
		f1 /= 3;
	else
		f2 /= 3;
	c3 = f0 * f1 * f2;		/* s(s+1)(s+2)/6 */

	/*
	 * Update from the highest word down: each word's new value depends
	 * on the *old* values of the lower words.
	 */
	zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
	    c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
	zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
	    c2 * zcp->zc_word[0];
	zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
	zcp->zc_word[0] += nzcp->zc_word[0];
}
/*
 * Common body of the incremental fletcher4 entry points.
 *
 *   native  B_TRUE: checksum chunks with fletcher_4_native();
 *           otherwise with fletcher_4_byteswap()
 *   buf     start of the data to add
 *   size    number of bytes at buf
 *   zcp     in/out: running checksum, updated once per chunk via
 *           fletcher_4_incremental_combine()
 *
 * The data is processed in chunks of at most FLETCHER_4_INC_MAX (8 MiB)
 * bytes, which bounds the size handed to each combine step.  A size of
 * zero leaves *zcp untouched.
 */
static inline void
fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
    zio_cksum_t *zcp)
{
	static const uint64_t FLETCHER_4_INC_MAX = 8ULL << 20;
	/* byte cursor: pointer arithmetic on void * is not ISO C */
	const char *cursor = buf;

	while (size > 0) {
		const uint64_t len = MIN(size, FLETCHER_4_INC_MAX);
		zio_cksum_t nzc;

		/* checksum of this chunk on its own */
		if (native)
			fletcher_4_native(cursor, len, NULL, &nzc);
		else
			fletcher_4_byteswap(cursor, len, NULL, &nzc);

		/* fold it into the running checksum */
		fletcher_4_incremental_combine(zcp, len, &nzc);

		size -= len;
		cursor += len;
	}
}
/*
 * Add size bytes at buf to the running fletcher4 checksum *zcp, using the
 * native variant of fletcher_4_incremental_impl().
 */
void
fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	const boolean_t native = B_TRUE;

	fletcher_4_incremental_impl(native, buf, size, zcp);
}
/*
 * Add size bytes at buf to the running fletcher4 checksum *zcp, using the
 * byteswap variant of fletcher_4_incremental_impl().
 */
void
fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
    zio_cksum_t *zcp)
{
	const boolean_t native = B_FALSE;

	fletcher_4_incremental_impl(native, buf, size, zcp);
}
/* Fletcher 4 kstats */
static int static int
fletcher_4_kstat_headers(char *buf, size_t size) fletcher_4_kstat_headers(char *buf, size_t size)
{ {