From cbf484f8ad26b84a17c5308af47d2c202e1dc9e9 Mon Sep 17 00:00:00 2001 From: Gvozden Neskovic Date: Wed, 24 Aug 2016 15:51:33 +0200 Subject: [PATCH] ABD Vectorized raidz Enable vectorized raidz code on ABD buffers. The avx512f, avx512bw, neon and aarch64_neonx2 are disabled in this commit. With the exception of avx512bw these implementations are updated for ABD in the subsequent commits. Signed-off-by: Gvozden Neskovic --- cmd/raidz_test/raidz_bench.c | 11 +- cmd/raidz_test/raidz_test.c | 78 +- cmd/raidz_test/raidz_test.h | 6 +- module/zfs/vdev_raidz_math.c | 39 +- module/zfs/vdev_raidz_math_aarch64_neon.c | 5 +- module/zfs/vdev_raidz_math_aarch64_neonx2.c | 2 +- module/zfs/vdev_raidz_math_avx2.c | 84 +- module/zfs/vdev_raidz_math_avx512bw.c | 18 +- module/zfs/vdev_raidz_math_avx512f.c | 17 +- module/zfs/vdev_raidz_math_impl.h | 1919 +++++++++++-------- module/zfs/vdev_raidz_math_scalar.c | 134 +- module/zfs/vdev_raidz_math_sse2.c | 92 +- module/zfs/vdev_raidz_math_ssse3.c | 84 +- 13 files changed, 1438 insertions(+), 1051 deletions(-) diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index 7ddb7bed01..7a18902eb3 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -23,8 +23,6 @@ * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ -#ifdef _ABD_READY_ - #include #include #include @@ -55,18 +53,18 @@ bench_init_raidz_map(void) /* * To permit larger column sizes these have to be done - * allocated using aligned alloc instead of zio_data_buf_alloc + * allocated using aligned alloc instead of zio_abd_buf_alloc */ - zio_bench.io_data = raidz_alloc(max_data_size); + zio_bench.io_abd = raidz_alloc(max_data_size); - init_zio_data(&zio_bench); + init_zio_abd(&zio_bench); } static void bench_fini_raidz_maps(void) { /* tear down golden zio */ - raidz_free(zio_bench.io_data, max_data_size); + raidz_free(zio_bench.io_abd, max_data_size); bzero(&zio_bench, sizeof (zio_t)); } @@ -227,4 +225,3 @@ run_raidz_benchmark(void) bench_fini_raidz_maps(); } -#endif diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index 4b6072992e..87f4eb9786 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -32,16 +32,6 @@ #include #include #include - -#ifndef _ABD_READY_ -int -main(int argc, char **argv) -{ - exit(0); -} - -#else - #include "raidz_test.h" static int *rand_data; @@ -191,10 +181,10 @@ static void process_options(int argc, char **argv) } } -#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_data) +#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) #define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) -#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_data) +#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) #define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) static int @@ -205,10 +195,9 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) VERIFY(parity >= 1 && parity <= 3); for (i = 0; i < parity; i++) { - if (0 != memcmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i), - CODE_COL_SIZE(rm, i))) { + if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) + != 0) { ret++; - LOG_OPT(D_DEBUG, opts, "\nParity block [%d] different!\n", i); } @@ -223,8 +212,8 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); for (i = 0; i < dcols; i++) { - if (0 != memcmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i), - DATA_COL_SIZE(opts->rm_golden, i))) { + if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) + != 0) { ret++; LOG_OPT(D_DEBUG, opts, @@ -234,37 +223,55 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) return (ret); } +static int +init_rand(void *data, size_t size, void *private) +{ + int i; + int *dst = (int *) data; + + for (i = 0; i < size / sizeof (int); i++) + dst[i] = rand_data[i]; + + return (0); +} + +static int +corrupt_rand(void *data, size_t size, void *private) +{ + int i; + int *dst = (int *) data; + + for (i = 0; i < size / sizeof (int); i++) + dst[i] = rand(); + + return (0); +} + + static void corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) { int i; - int *dst; raidz_col_t *col; for (i = 0; i < cnt; i++) { col = &rm->rm_col[tgts[i]]; - dst = col->rc_data; - for (i = 0; i < col->rc_size / sizeof (int); i++) - dst[i] = rand(); + abd_iterate_func(col->rc_abd, 0, col->rc_size, corrupt_rand, + NULL); } } void -init_zio_data(zio_t *zio) +init_zio_abd(zio_t *zio) { - int i; - int *dst = (int *) zio->io_data; - - for (i = 0; i < zio->io_size / sizeof (int); i++) { - dst[i] = rand_data[i]; - } + abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); } static void fini_raidz_map(zio_t **zio, raidz_map_t **rm) { vdev_raidz_map_free(*rm); - raidz_free((*zio)->io_data, (*zio)->io_size); + raidz_free((*zio)->io_abd, (*zio)->io_size); umem_free(*zio, sizeof (zio_t)); *zio = NULL; @@ -289,11 +296,11 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; - opts->zio_golden->io_data = raidz_alloc(opts->rto_dsize); - zio_test->io_data = raidz_alloc(opts->rto_dsize); + opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); + zio_test->io_abd = raidz_alloc(opts->rto_dsize); - init_zio_data(opts->zio_golden); - init_zio_data(zio_test); + init_zio_abd(opts->zio_golden); + init_zio_abd(zio_test); VERIFY0(vdev_raidz_impl_set("original")); @@ -336,8 +343,8 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) (*zio)->io_offset = 0; (*zio)->io_size = alloc_dsize; - (*zio)->io_data = raidz_alloc(alloc_dsize); - init_zio_data(*zio); + (*zio)->io_abd = raidz_alloc(alloc_dsize); + init_zio_abd(*zio); rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, total_ncols, parity); @@ -792,4 +799,3 @@ main(int argc, char **argv) return (err); } -#endif diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h index b279d82f29..a7fd26b8b2 100644 --- a/cmd/raidz_test/raidz_test.h +++ b/cmd/raidz_test/raidz_test.h @@ -104,11 +104,11 @@ static inline size_t ilog2(size_t a) #define SEP "----------------\n" -#define raidz_alloc(size) zio_data_buf_alloc(size) -#define raidz_free(p, size) zio_data_buf_free(p, size) +#define raidz_alloc(size) abd_alloc(size, B_FALSE) +#define raidz_free(p, size) abd_free(p) -void init_zio_data(zio_t *zio); +void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index 1e4bf84131..93d7964d20 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -44,16 +44,6 @@ static raidz_impl_ops_t vdev_raidz_fastest_impl = { .name = "fastest" }; -/* ABD BRINGUP -- not ready yet */ -#if 1 -#ifdef HAVE_SSSE3 -#undef HAVE_SSSE3 -#endif -#ifdef HAVE_AVX2 -#undef HAVE_AVX2 -#endif -#endif - /* All compiled in implementations */ const raidz_impl_ops_t *raidz_all_maths[] = { &vdev_raidz_original_impl, @@ -68,14 +58,14 @@ const raidz_impl_ops_t *raidz_all_maths[] = { &vdev_raidz_avx2_impl, #endif #if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */ - &vdev_raidz_avx512f_impl, + // &vdev_raidz_avx512f_impl, #endif #if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */ - &vdev_raidz_avx512bw_impl, + // &vdev_raidz_avx512bw_impl, #endif #if defined(__aarch64__) - &vdev_raidz_aarch64_neon_impl, - &vdev_raidz_aarch64_neonx2_impl, + // &vdev_raidz_aarch64_neon_impl, + // &vdev_raidz_aarch64_neonx2_impl, #endif }; @@ -159,8 +149,6 @@ vdev_raidz_math_generate(raidz_map_t *rm) { raidz_gen_f gen_parity = NULL; -/* ABD Bringup -- vector code not ready */ -#if 0 switch (raidz_parity(rm)) { case 1: gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; @@ -177,7 +165,6 @@ vdev_raidz_math_generate(raidz_map_t *rm) raidz_parity(rm)); break; } -#endif /* if method is NULL execute the original implementation */ if (gen_parity == NULL) @@ -188,8 +175,6 @@ vdev_raidz_math_generate(raidz_map_t *rm) return (0); } -/* ABD Bringup -- vector code not ready */ -#if 0 static raidz_rec_f reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, const int nbaddata) @@ -244,7 +229,6 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, } return ((raidz_rec_f) NULL); } -#endif /* * Select data reconstruction method for raidz_map @@ -256,31 +240,28 @@ int vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, const int *dt, const int nbaddata) { - raidz_rec_f rec_data = NULL; + raidz_rec_f rec_fn = NULL; -/* ABD Bringup -- vector code not ready */ -#if 0 switch (raidz_parity(rm)) { case PARITY_P: - rec_data = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); + rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); break; case PARITY_PQ: - rec_data = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); + rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); break; case PARITY_PQR: - rec_data = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); + rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", raidz_parity(rm)); break; } -#endif - if (rec_data == NULL) + if (rec_fn == NULL) return (RAIDZ_ORIGINAL_IMPL); else - return (rec_data(rm, dt)); + return (rec_fn(rm, dt)); } const char *raidz_gen_name[] = { diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c index f6a433f100..7ba30ba5ea 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neon.c +++ b/module/zfs/vdev_raidz_math_aarch64_neon.c @@ -23,8 +23,9 @@ */ #include +#include -#if defined(__aarch64__) +#if 0 // defined(__aarch64__) #include "vdev_raidz_math_aarch64_neon_common.h" @@ -153,7 +154,7 @@ const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = { #endif /* defined(__aarch64__) */ -#if defined(__aarch64__) +#if 0 // defined(__aarch64__) const uint8_t __attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c index d8d1f1bcea..e05deeb98a 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c +++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c @@ -24,7 +24,7 @@ #include -#if defined(__aarch64__) +#if 0 // defined(__aarch64__) #include "vdev_raidz_math_aarch64_neon_common.h" diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c index 508c95f8df..25ba9fabd1 100644 --- a/module/zfs/vdev_raidz_math_avx2.c +++ b/module/zfs/vdev_raidz_math_avx2.c @@ -334,59 +334,86 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F; kfpu_end(); \ } -#define GEN_P_DEFINE() {} + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + #define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} #define GEN_P_P 0, 1, 2, 3 -#define GEN_PQ_DEFINE() {} #define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 +#define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() {} -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() {} -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() {} -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 -#define REC_PQ_DEFINE() {} #define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() {} #define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define REC_QR_DEFINE() {} #define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 -#define REC_PQR_DEFINE() {} #define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 -#define REC_PQR_D 6, 7 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 @@ -400,12 +427,7 @@ DEFINE_REC_METHODS(avx2); static boolean_t raidz_will_avx2_work(void) { -/* ABD Bringup -- vector code not ready */ -#if 1 - return (B_FALSE); -#else return (zfs_avx_available() && zfs_avx2_available()); -#endif } const raidz_impl_ops_t vdev_raidz_avx2_impl = { diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c index bcbe657d00..465d1e5693 100644 --- a/module/zfs/vdev_raidz_math_avx512bw.c +++ b/module/zfs/vdev_raidz_math_avx512bw.c @@ -24,7 +24,7 @@ #include -#if defined(__x86_64) && defined(HAVE_AVX512BW) +#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW) #include #include @@ -345,6 +345,22 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F; kfpu_end(); \ } +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + #define GEN_P_DEFINE() {} #define GEN_P_STRIDE 4 #define GEN_P_P 0, 1, 2, 3 diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c index cc3868bce9..c2ccd875eb 100644 --- a/module/zfs/vdev_raidz_math_avx512f.c +++ b/module/zfs/vdev_raidz_math_avx512f.c @@ -24,7 +24,7 @@ #include -#if defined(__x86_64) && defined(HAVE_AVX512F) +#if 0 // defined(__x86_64) && defined(HAVE_AVX512F) #include #include @@ -437,6 +437,21 @@ typedef struct v { kfpu_end(); \ } +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 20, 21, 22, 23 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 20, 21, 22, 23 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 20, 21, 22, 23 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 20, 21, 22, 23 /* * This use zmm16-zmm31 registers to free up zmm0-zmm15 * to use with the AVX2 pshufb, see above diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h index 53800fd728..a8e4a0740c 100644 --- a/module/zfs/vdev_raidz_math_impl.h +++ b/module/zfs/vdev_raidz_math_impl.h @@ -32,257 +32,14 @@ #define noinline __attribute__((noinline)) #endif -/* Calculate data offset in raidz column, offset is in bytes */ -/* ADB BRINGUP -- needs to be refactored for ABD */ -#define COL_OFF(col, off) ((v_t *)(((char *)(col)->rc_abd) + (off))) - -/* - * PARITY CALCULATION - * An optimized function is called for a full length of data columns - * If RAIDZ map contains remainder columns (shorter columns) the same function - * is called for reminder of full columns. - * - * GEN_[P|PQ|PQR]_BLOCK() functions are designed to be efficiently in-lined by - * the compiler. This removes a lot of conditionals from the inside loop which - * makes the code faster, especially for vectorized code. - * They are also highly parametrized, allowing for each implementation to define - * most optimal stride, and register allocation. - */ - -static raidz_inline void -GEN_P_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int ncols) -{ - int c; - size_t ioff; - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t *col; - - GEN_P_DEFINE(); - - for (ioff = off; ioff < end; ioff += (GEN_P_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(&(rm->rm_col[1]), ioff), GEN_P_P); - - for (c = 2; c < ncols; c++) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), GEN_P_P); - } - - STORE(COL_OFF(pcol, ioff), GEN_P_P); - } -} - -/* - * Generate P parity (RAIDZ1) - * - * @rm RAIDZ map - */ -static raidz_inline void -raidz_generate_p_impl(raidz_map_t * const rm) -{ - const int ncols = raidz_ncols(rm); - const size_t psize = raidz_big_size(rm); - const size_t short_size = raidz_short_size(rm); - - panic("not ABD ready"); - - raidz_math_begin(); - - /* short_size */ - GEN_P_BLOCK(rm, 0, short_size, ncols); - - /* fullcols */ - GEN_P_BLOCK(rm, short_size, psize, raidz_nbigcols(rm)); - - raidz_math_end(); -} - -static raidz_inline void -GEN_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int ncols, const int nbigcols) -{ - int c; - size_t ioff; - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t *col; - - GEN_PQ_DEFINE(); - - MUL2_SETUP(); - - for (ioff = off; ioff < end; ioff += (GEN_PQ_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(&rm->rm_col[2], ioff), GEN_PQ_P); - COPY(GEN_PQ_P, GEN_PQ_Q); - - for (c = 3; c < nbigcols; c++) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), GEN_PQ_D); - MUL2(GEN_PQ_Q); - XOR(GEN_PQ_D, GEN_PQ_P); - XOR(GEN_PQ_D, GEN_PQ_Q); - } - - STORE(COL_OFF(pcol, ioff), GEN_PQ_P); - - for (; c < ncols; c++) - MUL2(GEN_PQ_Q); - - STORE(COL_OFF(qcol, ioff), GEN_PQ_Q); - } -} - -/* - * Generate PQ parity (RAIDZ2) - * - * @rm RAIDZ map - */ -static raidz_inline void -raidz_generate_pq_impl(raidz_map_t * const rm) -{ - const int ncols = raidz_ncols(rm); - const size_t psize = raidz_big_size(rm); - const size_t short_size = raidz_short_size(rm); - - panic("not ABD ready"); - - raidz_math_begin(); - - /* short_size */ - GEN_PQ_BLOCK(rm, 0, short_size, ncols, ncols); - - /* fullcols */ - GEN_PQ_BLOCK(rm, short_size, psize, ncols, raidz_nbigcols(rm)); - - raidz_math_end(); -} - - -static raidz_inline void -GEN_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int ncols, const int nbigcols) -{ - int c; - size_t ioff; - raidz_col_t *col; - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - - GEN_PQR_DEFINE(); - - MUL2_SETUP(); - - for (ioff = off; ioff < end; ioff += (GEN_PQR_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(&rm->rm_col[3], ioff), GEN_PQR_P); - COPY(GEN_PQR_P, GEN_PQR_Q); - COPY(GEN_PQR_P, GEN_PQR_R); - - for (c = 4; c < nbigcols; c++) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), GEN_PQR_D); - MUL2(GEN_PQR_Q); - MUL4(GEN_PQR_R); - XOR(GEN_PQR_D, GEN_PQR_P); - XOR(GEN_PQR_D, GEN_PQR_Q); - XOR(GEN_PQR_D, GEN_PQR_R); - } - - STORE(COL_OFF(pcol, ioff), GEN_PQR_P); - - for (; c < ncols; c++) { - MUL2(GEN_PQR_Q); - MUL4(GEN_PQR_R); - } - - STORE(COL_OFF(qcol, ioff), GEN_PQR_Q); - STORE(COL_OFF(rcol, ioff), GEN_PQR_R); - } -} - - -/* - * Generate PQR parity (RAIDZ3) - * - * @rm RAIDZ map - */ -static raidz_inline void -raidz_generate_pqr_impl(raidz_map_t * const rm) -{ - const int ncols = raidz_ncols(rm); - const size_t psize = raidz_big_size(rm); - const size_t short_size = raidz_short_size(rm); - - panic("not ABD ready"); - - raidz_math_begin(); - - /* short_size */ - GEN_PQR_BLOCK(rm, 0, short_size, ncols, ncols); - - /* fullcols */ - GEN_PQR_BLOCK(rm, short_size, psize, ncols, raidz_nbigcols(rm)); - - raidz_math_end(); -} - -/* - * DATA RECONSTRUCTION - * - * Data reconstruction process consists of two phases: - * - Syndrome calculation - * - Data reconstruction - * - * Syndrome is calculated by generating parity using available data columns - * and zeros in places of erasure. Existing parity is added to corresponding - * syndrome value to obtain the [P|Q|R]syn values from equation: - * P = Psyn + Dx + Dy + Dz - * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz - * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz - * - * For data reconstruction phase, the corresponding equations are solved - * for missing data (Dx, Dy, Dz). This generally involves multiplying known - * symbols by an coefficient and adding them together. The multiplication - * constant coefficients are calculated ahead of the operation in - * raidz_rec_[q|r|pq|pq|qr|pqr]_coeff() functions. - * - * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big" - * and "short" columns. - * For this reason, reconstruction is performed in minimum of - * two steps. First, from offset 0 to short_size, then from short_size to - * short_size. Calculation functions REC_[*]_BLOCK() are implemented to work - * over both ranges. The split also enables removal of conditional expressions - * from loop bodies, improving throughput of SIMD implementations. - * For the best performance, all functions marked with raidz_inline attribute - * must be inlined by compiler. - * - * parity data - * columns columns - * <----------> <------------------> - * x y <----+ missing columns (x, y) - * | | - * +---+---+---+---+-v-+---+-v-+---+ ^ 0 - * | | | | | | | | | | - * | | | | | | | | | | - * | P | Q | R | D | D | D | D | D | | - * | | | | 0 | 1 | 2 | 3 | 4 | | - * | | | | | | | | | v - * | | | | | +---+---+---+ ^ short_size - * | | | | | | | - * +---+---+---+---+---+ v big_size - * <------------------> <----------> - * big columns short columns - * - */ - /* * Functions calculate multiplication constants for data reconstruction. * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and * used parity columns for reconstruction. * @rm RAIDZ map * @tgtidx array of missing data indexes - * @coeff output array of coefficients. Array must be user - * provided and must hold minimum MUL_CNT values + * @coeff output array of coefficients. Array must be provided by + * user and must hold minimum MUL_CNT values. */ static noinline void raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) @@ -390,48 +147,437 @@ raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) coeff[MUL_PQR_YQ] = yd; } - /* - * Reconstruction using P parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @ncols number of column + * Method for zeroing a buffer (can be implemented using SIMD). + * This method is used by multiple for gen/rec functions. + * + * @dc Destination buffer + * @dsize Destination buffer size + * @private Unused */ -static raidz_inline void -REC_P_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int ncols) +static int +raidz_zero_abd_cb(void *dc, size_t dsize, void *private) { - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t *col; + v_t *dst = (v_t *) dc; + size_t i; - REC_P_DEFINE(); + ZERO_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_P_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(pcol, ioff), REC_P_X); + (void) private; /* unused */ - for (c = firstdc; c < x; c++) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_P_X); - } + ZERO(ZERO_D); - for (c++; c < ncols; c++) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_P_X); - } - - STORE(COL_OFF(xcol, ioff), REC_P_X); + for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) { + STORE(dst + i, ZERO_D); + STORE(dst + i + ZERO_STRIDE, ZERO_D); } + + return (0); +} + +#define raidz_zero(dabd, size) \ +{ \ + abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL); \ } +/* + * Method for copying two buffers (can be implemented using SIMD). + * This method is used by multiple for gen/rec functions. + * + * @dc Destination buffer + * @sc Source buffer + * @dsize Destination buffer size + * @ssize Source buffer size + * @private Unused + */ +static int +raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private) +{ + v_t *dst = (v_t *) dc; + const v_t *src = (v_t *) sc; + size_t i; + + COPY_DEFINE(); + + (void) private; /* unused */ + + for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) { + LOAD(src + i, COPY_D); + STORE(dst + i, COPY_D); + + LOAD(src + i + COPY_STRIDE, COPY_D); + STORE(dst + i + COPY_STRIDE, COPY_D); + } + + return (0); +} + + +#define raidz_copy(dabd, sabd, size) \ +{ \ + abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\ +} + +/* + * Method for adding (XORing) two buffers. + * Source and destination are XORed together and result is stored in + * destination buffer. This method is used by multiple for gen/rec functions. + * + * @dc Destination buffer + * @sc Source buffer + * @dsize Destination buffer size + * @ssize Source buffer size + * @private Unused + */ +static int +raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private) +{ + v_t *dst = (v_t *) dc; + const v_t *src = (v_t *) sc; + size_t i; + + ADD_DEFINE(); + + (void) private; /* unused */ + + for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) { + LOAD(dst + i, ADD_D); + XOR_ACC(src + i, ADD_D); + STORE(dst + i, ADD_D); + + LOAD(dst + i + ADD_STRIDE, ADD_D); + XOR_ACC(src + i + ADD_STRIDE, ADD_D); + STORE(dst + i + ADD_STRIDE, ADD_D); + } + + return (0); +} + +#define raidz_add(dabd, sabd, size) \ +{ \ + abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\ +} + +/* + * Method for multiplying a buffer with a constant in GF(2^8). + * Symbols from buffer are multiplied by a constant and result is stored + * back in the same buffer. + * + * @dc In/Out data buffer. + * @size Size of the buffer + * @private pointer to the multiplication constant (unsigned) + */ +static int +raidz_mul_abd(void *dc, size_t size, void *private) +{ + const unsigned mul = *((unsigned *) private); + v_t *d = (v_t *) dc; + size_t i; + + MUL_DEFINE(); + + for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) { + LOAD(d + i, MUL_D); + MUL(mul, MUL_D); + STORE(d + i, MUL_D); + + LOAD(d + i + MUL_STRIDE, MUL_D); + MUL(mul, MUL_D); + STORE(d + i + MUL_STRIDE, MUL_D); + } + + return (0); +} + + +/* + * Syndrome generation/update macros + * + * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros + */ +#define P_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + STORE((t), T); \ +} + +#define R_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define R_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + STORE((t), T); \ +} + + +/* + * PARITY CALCULATION + * + * Macros *_SYNDROME are used for parity/syndrome calculation. + * *_D_SYNDROME() macros are used to calculate syndrome between 0 and + * length of data column, and *_SYNDROME() macros are only for updating + * the parity/syndrome if data column is shorter. + * + * P parity is calculated using raidz_add_abd(). + */ + +/* + * Generate P parity (RAIDZ1) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_p_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t psize = rm->rm_col[CODE_P].rc_size; + abd_t *pabd = rm->rm_col[CODE_P].rc_abd; + size_t size; + abd_t *dabd; + + raidz_math_begin(); + + /* start with first data column */ + raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); + + for (c = 2; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + size = rm->rm_col[c].rc_size; + + /* add data column */ + raidz_add(pabd, dabd, size); + } + + raidz_math_end(); +} + + +/* + * Generate PQ parity (RAIDZ2) + * The function is called per data column. + * + * @c array of pointers to parity (code) columns + * @dc pointer to data column + * @csize size of parity columns + * @dsize size of data column + */ +static void +raidz_gen_pq_add(void **c, const void *dc, const size_t csize, + const size_t dsize) +{ + v_t *p = (v_t *) c[0]; + v_t *q = (v_t *) c[1]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const qend = q + (csize / sizeof (v_t)); + + GEN_PQ_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE, + q += GEN_PQ_STRIDE) { + LOAD(d, GEN_PQ_D); + P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p); + Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q); + } + for (; q < qend; q += GEN_PQ_STRIDE) { + Q_SYNDROME(GEN_PQ_C, q); + } +} + + +/* + * Generate PQ parity (RAIDZ2) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_pq_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t csize = rm->rm_col[CODE_P].rc_size; + size_t dsize; + abd_t *dabd; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd + }; + + raidz_math_begin(); + + raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize); + + for (c = 3; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + + abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, + raidz_gen_pq_add); + } + + raidz_math_end(); +} + + +/* + * Generate PQR parity (RAIDZ3) + * The function is called per data column. + * + * @c array of pointers to parity (code) columns + * @dc pointer to data column + * @csize size of parity columns + * @dsize size of data column + */ +static void +raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, + const size_t dsize) +{ + v_t *p = (v_t *) c[0]; + v_t *q = (v_t *) c[1]; + v_t *r = (v_t *) c[CODE_R]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const qend = q + (csize / sizeof (v_t)); + + GEN_PQR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE, + q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { + LOAD(d, GEN_PQR_D); + P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p); + Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q); + R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r); + } + for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { + Q_SYNDROME(GEN_PQR_C, q); + R_SYNDROME(GEN_PQR_C, r); + } +} + + +/* + * Generate PQR parity (RAIDZ2) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_pqr_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t csize = rm->rm_col[CODE_P].rc_size; + size_t dsize; + abd_t *dabd; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + + raidz_math_begin(); + + raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize); + + for (c = 4; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + + abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, + raidz_gen_pqr_add); + } + + raidz_math_end(); +} + + +/* + * DATA RECONSTRUCTION + * + * Data reconstruction process consists of two phases: + * - Syndrome calculation + * - Data reconstruction + * + * Syndrome is calculated by generating parity using available data columns + * and zeros in places of erasure. Existing parity is added to corresponding + * syndrome value to obtain the [P|Q|R]syn values from equation: + * P = Psyn + Dx + Dy + Dz + * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz + * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz + * + * For data reconstruction phase, the corresponding equations are solved + * for missing data (Dx, Dy, Dz). This generally involves multiplying known + * symbols by an coefficient and adding them together. The multiplication + * constant coefficients are calculated ahead of the operation in + * raidz_rec_[q|r|pq|pq|qr|pqr]_coeff() functions. + * + * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big" + * and "short" columns. + * For this reason, reconstruction is performed in minimum of + * two steps. First, from offset 0 to short_size, then from short_size to + * short_size. Calculation functions REC_[*]_BLOCK() are implemented to work + * over both ranges. The split also enables removal of conditional expressions + * from loop bodies, improving throughput of SIMD implementations. + * For the best performance, all functions marked with raidz_inline attribute + * must be inlined by compiler. + * + * parity data + * columns columns + * <----------> <------------------> + * x y <----+ missing columns (x, y) + * | | + * +---+---+---+---+-v-+---+-v-+---+ ^ 0 + * | | | | | | | | | | + * | | | | | | | | | | + * | P | Q | R | D | D | D | D | D | | + * | | | | 0 | 1 | 2 | 3 | 4 | | + * | | | | | | | | | v + * | | | | | +---+---+---+ ^ short_size + * | | | | | | | + * +---+---+---+---+---+ v big_size + * <------------------> <----------> + * big columns short columns + * + */ + + + + /* * Reconstruct single data column using P parity - * @rec_method REC_P_BLOCK() + * + * @syn_method raidz_add_abd() + * @rec_method not applicable * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -439,94 +585,73 @@ REC_P_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t short_size = raidz_short_size(rm); + size_t c; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + size_t size; + abd_t *dabd; raidz_math_begin(); - /* 0 - short_size */ - REC_P_BLOCK(rm, 0, short_size, x, ncols); + /* copy P into target */ + raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize); - /* short_size - xsize */ - REC_P_BLOCK(rm, short_size, xsize, x, nbigcols); + /* generate p_syndrome */ + for (c = firstdc; c < ncols; c++) { + if (c == x) + continue; + + dabd = rm->rm_col[c].rc_abd; + size = MIN(rm->rm_col[c].rc_size, xsize); + + raidz_add(xabd, dabd, size); + } raidz_math_end(); return (1 << CODE_P); } -/* - * Reconstruct using Q parity - */ - -#define REC_Q_SYN_UPDATE() MUL2(REC_Q_X) - -#define REC_Q_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - REC_Q_SYN_UPDATE(); \ - XOR_ACC(COL_OFF(col, ioff), REC_Q_X); \ -} /* - * Reconstruction using Q parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns + * Generate Q syndrome (Qsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @xsize size of syndrome columns + * @dsize size of data column (0 if missing) */ -static raidz_inline void -REC_Q_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const unsigned *coeff, const int ncols, const int nbigcols) +static void +raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, + const size_t dsize) { - int c; - size_t ioff = 0; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t *col; + v_t *x = (v_t *) xc[TARGET_X]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (xsize / sizeof (v_t)); - REC_Q_DEFINE(); + SYN_Q_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_Q_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); + MUL2_SETUP(); - ZERO(REC_Q_X); - - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_Q_INNER_LOOP(c); - - REC_Q_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_Q_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_Q_SYN_UPDATE(); - if (x != c) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_Q_X); - } - } - for (; c < ncols; c++) - REC_Q_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(qcol, ioff), REC_Q_X); - MUL(coeff[MUL_Q_X], REC_Q_X); - STORE(COL_OFF(xcol, ioff), REC_Q_X); + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_Q_D); + Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x); + } + for (; x < xend; x += SYN_STRIDE) { + Q_SYNDROME(SYN_Q_X, x); } } + /* * Reconstruct single data column using Q parity - * @rec_method REC_Q_BLOCK() + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -534,96 +659,90 @@ REC_Q_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t short_size = raidz_short_size(rm); - unsigned coeff[MUL_CNT]; + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *tabds[] = { xabd }; + unsigned coeff[MUL_CNT]; raidz_rec_q_coeff(rm, tgtidx, coeff); raidz_math_begin(); - /* 0 - short_size */ - REC_Q_BLOCK(rm, 0, short_size, x, coeff, ncols, ncols); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } - /* short_size - xsize */ - REC_Q_BLOCK(rm, short_size, xsize, x, coeff, ncols, nbigcols); + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_q_abd); + } + + /* add Q to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd, (void*) coeff); raidz_math_end(); return (1 << CODE_Q); } -/* - * Reconstruct using R parity - */ - -#define REC_R_SYN_UPDATE() MUL4(REC_R_X) -#define REC_R_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - REC_R_SYN_UPDATE(); \ - XOR_ACC(COL_OFF(col, ioff), REC_R_X); \ -} /* - * Reconstruction using R parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns + * Generate R syndrome (Rsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) */ -static raidz_inline void -REC_R_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const unsigned *coeff, const int ncols, const int nbigcols) +static void +raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, + const size_t dsize) { - int c; - size_t ioff = 0; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t *col; + v_t *x = (v_t *) xc[TARGET_X]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (tsize / sizeof (v_t)); - REC_R_DEFINE(); + SYN_R_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_R_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); + MUL2_SETUP(); - ZERO(REC_R_X); - - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_R_INNER_LOOP(c); - - REC_R_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_R_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_R_SYN_UPDATE(); - if (c != x) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_R_X); - } - } - for (; c < ncols; c++) - REC_R_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(rcol, ioff), REC_R_X); - MUL(coeff[MUL_R_X], REC_R_X); - STORE(COL_OFF(xcol, ioff), REC_R_X); + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_R_D); + R_D_SYNDROME(SYN_R_D, SYN_R_X, x); + } + for (; x < xend; x += SYN_STRIDE) { + R_SYNDROME(SYN_R_X, x); } } + /* * Reconstruct single data column using R parity - * @rec_method REC_R_BLOCK() + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -631,122 +750,136 @@ REC_R_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t short_size = raidz_short_size(rm); - unsigned coeff[MUL_CNT]; + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *tabds[] = { xabd }; + unsigned coeff[MUL_CNT]; raidz_rec_r_coeff(rm, tgtidx, coeff); raidz_math_begin(); - /* 0 - short_size */ - REC_R_BLOCK(rm, 0, short_size, x, coeff, ncols, ncols); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } - /* short_size - xsize */ - REC_R_BLOCK(rm, short_size, xsize, x, coeff, ncols, nbigcols); + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_r_abd); + } + + /* add R to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd, (void *)coeff); raidz_math_end(); return (1 << CODE_R); } -/* - * Reconstruct using PQ parity - */ - -#define REC_PQ_SYN_UPDATE() MUL2(REC_PQ_Y) -#define REC_PQ_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - LOAD(COL_OFF(col, ioff), REC_PQ_D); \ - REC_PQ_SYN_UPDATE(); \ - XOR(REC_PQ_D, REC_PQ_X); \ - XOR(REC_PQ_D, REC_PQ_Y); \ -} /* - * Reconstruction using PQ parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column + * Generate P and Q syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) */ -static raidz_inline void -REC_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const unsigned *coeff, const int ncols, - const int nbigcols, const boolean_t calcy) +static void +raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize, + const size_t dsize) { - int c; - size_t ioff = 0; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t *col; + v_t *x = (v_t *) tc[TARGET_X]; + v_t *y = (v_t *) tc[TARGET_Y]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const yend = y + (tsize / sizeof (v_t)); - REC_PQ_DEFINE(); + SYN_PQ_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_PQ_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(pcol, ioff), REC_PQ_X); - ZERO(REC_PQ_Y); - MUL2_SETUP(); + MUL2_SETUP(); - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_PQ_INNER_LOOP(c); - - REC_PQ_SYN_UPDATE(); - for (c++; c < y; c++) - REC_PQ_INNER_LOOP(c); - - REC_PQ_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_PQ_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_PQ_SYN_UPDATE(); - if (c != x && c != y) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), REC_PQ_D); - XOR(REC_PQ_D, REC_PQ_X); - XOR(REC_PQ_D, REC_PQ_Y); - } - } - for (; c < ncols; c++) - REC_PQ_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(qcol, ioff), REC_PQ_Y); - - /* Save Pxy */ - COPY(REC_PQ_X, REC_PQ_D); - - /* Calc X */ - MUL(coeff[MUL_PQ_X], REC_PQ_X); - MUL(coeff[MUL_PQ_Y], REC_PQ_Y); - XOR(REC_PQ_Y, REC_PQ_X); - STORE(COL_OFF(xcol, ioff), REC_PQ_X); - - if (calcy) { - /* Calc Y */ - XOR(REC_PQ_D, REC_PQ_X); - STORE(COL_OFF(ycol, ioff), REC_PQ_X); - } + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x); + Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y); + } + for (; y < yend; y += SYN_STRIDE) { + Q_SYNDROME(SYN_PQ_X, y); } } +/* + * Reconstruct data using PQ parity and PQ syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *) tc[TARGET_X]; + v_t *y = (v_t *) tc[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *) c[CODE_P]; + const v_t *q = (v_t *) c[CODE_Q]; + + REC_PQ_DEFINE(); + + for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE, + p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) { + LOAD(x, REC_PQ_X); + LOAD(y, REC_PQ_Y); + + XOR_ACC(p, REC_PQ_X); + XOR_ACC(q, REC_PQ_Y); + + /* Save Pxy */ + COPY(REC_PQ_X, REC_PQ_T); + + /* Calc X */ + MUL(mul[MUL_PQ_X], REC_PQ_X); + MUL(mul[MUL_PQ_Y], REC_PQ_Y); + XOR(REC_PQ_Y, REC_PQ_X); + STORE(x, REC_PQ_X); + + /* Calc Y */ + XOR(REC_PQ_T, REC_PQ_X); + STORE(y, REC_PQ_X); + } +} + + /* * Reconstruct two data columns using PQ parity - * @rec_method REC_PQ_BLOCK() + * + * @syn_method raidz_syn_pq_abd() + * @rec_method raidz_rec_pq_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -754,126 +887,156 @@ REC_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int y = tgtidx[TARGET_Y]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t ysize = raidz_col_size(rm, y); - const size_t short_size = raidz_short_size(rm); - unsigned coeff[MUL_CNT]; + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd + }; + unsigned coeff[MUL_CNT]; raidz_rec_pq_coeff(rm, tgtidx, coeff); + /* + * Check if some of targets is shorter then others + * In this case, shorter target needs to be replaced with + * new buffer so that syndrome can be calculated. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_PQ_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } - /* short_size - xsize */ - REC_PQ_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols, - xsize == ysize); + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pq_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff); + + /* Copy shorter targets back to the original abd buffer */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); + if (ysize < xsize) + abd_free(yabd); + return ((1 << CODE_P) | (1 << CODE_Q)); } -/* - * Reconstruct using PR parity - */ -#define REC_PR_SYN_UPDATE() MUL4(REC_PR_Y) -#define REC_PR_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - LOAD(COL_OFF(col, ioff), REC_PR_D); \ - REC_PR_SYN_UPDATE(); \ - XOR(REC_PR_D, REC_PR_X); \ - XOR(REC_PR_D, REC_PR_Y); \ +/* + * Generate P and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *) c[TARGET_X]; + v_t *y = (v_t *) c[TARGET_Y]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const yend = y + (tsize / sizeof (v_t)); + + SYN_PR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PR_D); + P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x); + R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y); + } + for (; y < yend; y += SYN_STRIDE) { + R_SYNDROME(SYN_PR_X, y); + } } /* - * Reconstruction using PR parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column + * Reconstruct data using PR parity and PR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants */ -static raidz_inline void -REC_PR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const unsigned *coeff, const int ncols, - const int nbigcols, const boolean_t calcy) +static void +raidz_rec_pr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) { - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t *col; + v_t *x = (v_t *) t[TARGET_X]; + v_t *y = (v_t *) t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *) c[CODE_P]; + const v_t *q = (v_t *) c[CODE_Q]; REC_PR_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_PR_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(pcol, ioff), REC_PR_X); - ZERO(REC_PR_Y); - MUL2_SETUP(); - - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_PR_INNER_LOOP(c); - - REC_PR_SYN_UPDATE(); - for (c++; c < y; c++) - REC_PR_INNER_LOOP(c); - - REC_PR_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_PR_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_PR_SYN_UPDATE(); - if (c != x && c != y) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), REC_PR_D); - XOR(REC_PR_D, REC_PR_X); - XOR(REC_PR_D, REC_PR_Y); - } - } - for (; c < ncols; c++) - REC_PR_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(rcol, ioff), REC_PR_Y); + for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE, + p += REC_PR_STRIDE, q += REC_PR_STRIDE) { + LOAD(x, REC_PR_X); + LOAD(y, REC_PR_Y); + XOR_ACC(p, REC_PR_X); + XOR_ACC(q, REC_PR_Y); /* Save Pxy */ - COPY(REC_PR_X, REC_PR_D); + COPY(REC_PR_X, REC_PR_T); /* Calc X */ - MUL(coeff[MUL_PR_X], REC_PR_X); - MUL(coeff[MUL_PR_Y], REC_PR_Y); + MUL(mul[MUL_PR_X], REC_PR_X); + MUL(mul[MUL_PR_Y], REC_PR_Y); XOR(REC_PR_Y, REC_PR_X); - STORE(COL_OFF(xcol, ioff), REC_PR_X); + STORE(x, REC_PR_X); - if (calcy) { - /* Calc Y */ - XOR(REC_PR_D, REC_PR_X); - STORE(COL_OFF(ycol, ioff), REC_PR_X); - } + /* Calc Y */ + XOR(REC_PR_T, REC_PR_X); + STORE(y, REC_PR_X); } } /* * Reconstruct two data columns using PR parity - * @rec_method REC_PR_BLOCK() + * + * @syn_method raidz_syn_pr_abd() + * @rec_method raidz_rec_pr_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -881,134 +1044,162 @@ REC_PR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int y = tgtidx[TARGET_Y]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t ysize = raidz_col_size(rm, y); - const size_t short_size = raidz_short_size(rm); + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[0]; + const size_t y = tgtidx[1]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; unsigned coeff[MUL_CNT]; - raidz_rec_pr_coeff(rm, tgtidx, coeff); + /* + * Check if some of targets are shorter then others. + * They need to be replaced with a new buffer so that syndrome can + * be calculated on full length. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_PR_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } - /* short_size - xsize */ - REC_PR_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols, - xsize == ysize); + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); - return ((1 << CODE_P) | (1 << CODE_R)); + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_P) | (1 << CODE_Q)); } /* - * Reconstruct using QR parity + * Generate Q and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) */ - -#define REC_QR_SYN_UPDATE() \ -{ \ - MUL2(REC_QR_X); \ - MUL4(REC_QR_Y); \ -} - -#define REC_QR_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - LOAD(COL_OFF(col, ioff), REC_QR_D); \ - REC_QR_SYN_UPDATE(); \ - XOR(REC_QR_D, REC_QR_X); \ - XOR(REC_QR_D, REC_QR_Y); \ -} - -/* - * Reconstruction using QR parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column - */ -static raidz_inline void -REC_QR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const unsigned *coeff, const int ncols, - const int nbigcols, const boolean_t calcy) +static void +raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) { - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t *col; + v_t *x = (v_t *) c[TARGET_X]; + v_t *y = (v_t *) c[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); - REC_QR_DEFINE(); + SYN_QR_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_QR_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); - ZERO(REC_QR_X); - ZERO(REC_QR_Y); + MUL2_SETUP(); - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_QR_INNER_LOOP(c); - - REC_QR_SYN_UPDATE(); - for (c++; c < y; c++) - REC_QR_INNER_LOOP(c); - - REC_QR_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_QR_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_QR_SYN_UPDATE(); - if (c != x && c != y) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), REC_QR_D); - XOR(REC_QR_D, REC_QR_X); - XOR(REC_QR_D, REC_QR_Y); - } - } - for (; c < ncols; c++) - REC_QR_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(qcol, ioff), REC_QR_X); - XOR_ACC(COL_OFF(rcol, ioff), REC_QR_Y); - - /* Save Qxy */ - COPY(REC_QR_X, REC_QR_D); - - /* Calc X */ - MUL(coeff[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */ - XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */ - MUL(coeff[MUL_QR_X], REC_QR_X); /* X = X * xm */ - STORE(COL_OFF(xcol, ioff), REC_QR_X); - - if (calcy) { - /* Calc Y */ - MUL(coeff[MUL_QR_YQ], REC_QR_D); /* X = Q * xqm */ - XOR(REC_QR_Y, REC_QR_D); /* X = R ^ X */ - MUL(coeff[MUL_QR_Y], REC_QR_D); /* X = X * xm */ - STORE(COL_OFF(ycol, ioff), REC_QR_D); - } + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x); + R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y); + } + for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) { + Q_SYNDROME(SYN_QR_X, x); + R_SYNDROME(SYN_QR_X, y); } } + +/* + * Reconstruct data using QR parity and QR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_qr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *) t[TARGET_X]; + v_t *y = (v_t *) t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *) c[CODE_P]; + const v_t *q = (v_t *) c[CODE_Q]; + + REC_QR_DEFINE(); + + for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE, + p += REC_QR_STRIDE, q += REC_QR_STRIDE) { + LOAD(x, REC_QR_X); + LOAD(y, REC_QR_Y); + + XOR_ACC(p, REC_QR_X); + XOR_ACC(q, REC_QR_Y); + + /* Save Pxy */ + COPY(REC_QR_X, REC_QR_T); + + /* Calc X */ + MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */ + XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */ + MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */ + STORE(x, REC_QR_X); + + /* Calc Y */ + MUL(mul[MUL_QR_YQ], REC_QR_T); /* X = Q * xqm */ + XOR(REC_QR_Y, REC_QR_T); /* X = R ^ X */ + MUL(mul[MUL_QR_Y], REC_QR_T); /* X = X * xm */ + STORE(y, REC_QR_T); + } +} + + /* * Reconstruct two data columns using QR parity - * @rec_method REC_QR_BLOCK() + * + * @syn_method raidz_syn_qr_abd() + * @rec_method raidz_rec_qr_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -1016,158 +1207,182 @@ REC_QR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int y = tgtidx[TARGET_Y]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t ysize = raidz_col_size(rm, y); - const size_t short_size = raidz_short_size(rm); + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; unsigned coeff[MUL_CNT]; - raidz_rec_qr_coeff(rm, tgtidx, coeff); + /* + * Check if some of targets is shorter then others + * In this case, shorter target needs to be replaced with + * new buffer so that syndrome can be calculated. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_QR_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } - /* short_size - xsize */ - REC_QR_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols, - xsize == ysize); + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_qr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_Q) | (1 << CODE_R)); } -/* - * Reconstruct using PQR parity - */ - -#define REC_PQR_SYN_UPDATE() \ -{ \ - MUL2(REC_PQR_Y); \ - MUL4(REC_PQR_Z); \ -} - -#define REC_PQR_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[(c)]; \ - LOAD(COL_OFF(col, ioff), REC_PQR_D); \ - REC_PQR_SYN_UPDATE(); \ - XOR(REC_PQR_D, REC_PQR_X); \ - XOR(REC_PQR_D, REC_PQR_Y); \ - XOR(REC_PQR_D, REC_PQR_Z); \ -} /* - * Reconstruction using PQR parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @z missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column - * @calcz calculate third data column + * Generate P, Q, and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) */ -static raidz_inline void -REC_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const int z, const unsigned *coeff, - const int ncols, const int nbigcols, const boolean_t calcy, - const boolean_t calcz) +static void +raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) { - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t * const zcol = raidz_col_p(rm, z); - raidz_col_t *col; + v_t *x = (v_t *) c[TARGET_X]; + v_t *y = (v_t *) c[TARGET_Y]; + v_t *z = (v_t *) c[TARGET_Z]; + const v_t * const yend = y + (tsize / sizeof (v_t)); + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + + SYN_PQR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE, + z += SYN_STRIDE) { + LOAD(d, SYN_PQR_D); + P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x) + Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y); + R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z); + } + for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) { + Q_SYNDROME(SYN_PQR_X, y); + R_SYNDROME(SYN_PQR_X, z); + } +} + + +/* + * Reconstruct data using PRQ parity and PQR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, + const unsigned * const mul) +{ + v_t *x = (v_t *) t[TARGET_X]; + v_t *y = (v_t *) t[TARGET_Y]; + v_t *z = (v_t *) t[TARGET_Z]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *) c[CODE_P]; + const v_t *q = (v_t *) c[CODE_Q]; + const v_t *r = (v_t *) c[CODE_R]; REC_PQR_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_PQR_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); - LOAD(COL_OFF(pcol, ioff), REC_PQR_X); - ZERO(REC_PQR_Y); - ZERO(REC_PQR_Z); + for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE, + z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE, + r += REC_PQR_STRIDE) { + LOAD(x, REC_PQR_X); + LOAD(y, REC_PQR_Y); + LOAD(z, REC_PQR_Z); - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_PQR_INNER_LOOP(c); - - REC_PQR_SYN_UPDATE(); - for (c++; c < y; c++) - REC_PQR_INNER_LOOP(c); - - REC_PQR_SYN_UPDATE(); - for (c++; c < z; c++) - REC_PQR_INNER_LOOP(c); - - REC_PQR_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_PQR_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_PQR_SYN_UPDATE(); - if (c != x && c != y && c != z) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), REC_PQR_D); - XOR(REC_PQR_D, REC_PQR_X); - XOR(REC_PQR_D, REC_PQR_Y); - XOR(REC_PQR_D, REC_PQR_Z); - } - } - for (; c < ncols; c++) - REC_PQR_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(qcol, ioff), REC_PQR_Y); - XOR_ACC(COL_OFF(rcol, ioff), REC_PQR_Z); + XOR_ACC(p, REC_PQR_X); + XOR_ACC(q, REC_PQR_Y); + XOR_ACC(r, REC_PQR_Z); /* Save Pxyz and Qxyz */ COPY(REC_PQR_X, REC_PQR_XS); COPY(REC_PQR_Y, REC_PQR_YS); /* Calc X */ - MUL(coeff[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */ - MUL(coeff[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */ + MUL(mul[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */ + MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */ XOR(REC_PQR_Y, REC_PQR_X); - MUL(coeff[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */ + MUL(mul[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */ XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */ - STORE(COL_OFF(xcol, ioff), REC_PQR_X); + STORE(x, REC_PQR_X); - if (calcy) { - /* Calc Y */ - XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */ - MUL(coeff[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */ - XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */ - COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */ - MUL(coeff[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */ - MUL(coeff[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */ - XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */ - STORE(COL_OFF(ycol, ioff), REC_PQR_YS); - } + /* Calc Y */ + XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */ + MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */ + XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */ + COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */ + MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */ + MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */ + XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */ + STORE(y, REC_PQR_YS); - if (calcz) { - /* Calc Z */ - XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */ - STORE(COL_OFF(zcol, ioff), REC_PQR_YS); - } + /* Calc Z */ + XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */ + STORE(z, REC_PQR_YS); } } + /* * Reconstruct three data columns using PQR parity - * @rec_method REC_PQR_BLOCK() + * + * @syn_method raidz_syn_pqr_abd() + * @rec_method raidz_rec_pqr_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -1175,31 +1390,87 @@ REC_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int y = tgtidx[TARGET_Y]; - const int z = tgtidx[TARGET_Z]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t ysize = raidz_col_size(rm, y); - const size_t zsize = raidz_col_size(rm, z); - const size_t short_size = raidz_short_size(rm); + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t z = tgtidx[TARGET_Z]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + const size_t zsize = rm->rm_col[z].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *zabd = rm->rm_col[z].rc_abd; + abd_t *tabds[] = { xabd, yabd, zabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; unsigned coeff[MUL_CNT]; - raidz_rec_pqr_coeff(rm, tgtidx, coeff); + /* + * Check if some of targets is shorter then others + * In this case, shorter target needs to be replaced with + * new buffer so that syndrome can be calculated. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + if (zsize < xsize) { + zabd = abd_alloc(xsize, B_FALSE); + tabds[2] = zabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_PQR_BLOCK(rm, 0, short_size, x, y, z, coeff, ncols, ncols, - B_TRUE, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + raidz_zero(zabd, xsize); + } - /* short_size - xsize */ - REC_PQR_BLOCK(rm, short_size, xsize, x, y, z, coeff, ncols, nbigcols, - xsize == ysize, xsize == zsize); + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y || c == z) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, + raidz_syn_pqr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + if (zsize < xsize) + raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize); raidz_math_end(); + if (ysize < xsize) + abd_free(yabd); + if (zsize < xsize) + abd_free(zabd); + return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); } diff --git a/module/zfs/vdev_raidz_math_scalar.c b/module/zfs/vdev_raidz_math_scalar.c index 1d782b6334..a693bff63f 100644 --- a/module/zfs/vdev_raidz_math_scalar.c +++ b/module/zfs/vdev_raidz_math_scalar.c @@ -24,6 +24,7 @@ */ #include + /* * Provide native CPU scalar routines. * Support 32bit and 64bit CPUs. @@ -153,72 +154,97 @@ static const struct { #define raidz_math_begin() {} #define raidz_math_end() {} -#define GEN_P_DEFINE() v_t p0 -#define GEN_P_STRIDE 1 -#define GEN_P_P p0 +#define SYN_STRIDE 1 -#define GEN_PQ_DEFINE() v_t d0, p0, q0 -#define GEN_PQ_STRIDE 1 -#define GEN_PQ_D d0 -#define GEN_PQ_P p0 -#define GEN_PQ_Q q0 +#define ZERO_DEFINE() v_t d0 +#define ZERO_STRIDE 1 +#define ZERO_D d0 -#define GEN_PQR_DEFINE() v_t d0, p0, q0, r0 -#define GEN_PQR_STRIDE 1 -#define GEN_PQR_D d0 -#define GEN_PQR_P p0 -#define GEN_PQR_Q q0 -#define GEN_PQR_R r0 +#define COPY_DEFINE() v_t d0 +#define COPY_STRIDE 1 +#define COPY_D d0 -#define REC_P_DEFINE() v_t x0 -#define REC_P_STRIDE 1 -#define REC_P_X x0 +#define ADD_DEFINE() v_t d0 +#define ADD_STRIDE 1 +#define ADD_D d0 -#define REC_Q_DEFINE() v_t x0 -#define REC_Q_STRIDE 1 -#define REC_Q_X x0 +#define MUL_DEFINE() v_t d0 +#define MUL_STRIDE 1 +#define MUL_D d0 -#define REC_R_DEFINE() v_t x0 -#define REC_R_STRIDE 1 -#define REC_R_X x0 +#define GEN_P_STRIDE 1 +#define GEN_P_DEFINE() v_t p0 +#define GEN_P_P p0 -#define REC_PQ_DEFINE() v_t x0, y0, d0 -#define REC_PQ_STRIDE 1 -#define REC_PQ_X x0 -#define REC_PQ_Y y0 -#define REC_PQ_D d0 +#define GEN_PQ_STRIDE 1 +#define GEN_PQ_DEFINE() v_t d0, c0 +#define GEN_PQ_D d0 +#define GEN_PQ_C c0 -#define REC_PR_DEFINE() v_t x0, y0, d0 -#define REC_PR_STRIDE 1 -#define REC_PR_X x0 -#define REC_PR_Y y0 -#define REC_PR_D d0 +#define GEN_PQR_STRIDE 1 +#define GEN_PQR_DEFINE() v_t d0, c0 +#define GEN_PQR_D d0 +#define GEN_PQR_C c0 -#define REC_QR_DEFINE() v_t x0, y0, d0 -#define REC_QR_STRIDE 1 -#define REC_QR_X x0 -#define REC_QR_Y y0 -#define REC_QR_D d0 +#define SYN_Q_DEFINE() v_t d0, x0 +#define SYN_Q_D d0 +#define SYN_Q_X x0 -#define REC_PQR_DEFINE() v_t x0, y0, z0, d0, t0 -#define REC_PQR_STRIDE 1 -#define REC_PQR_X x0 -#define REC_PQR_Y y0 -#define REC_PQR_Z z0 -#define REC_PQR_D d0 -#define REC_PQR_XS d0 -#define REC_PQR_YS t0 + +#define SYN_R_DEFINE() v_t d0, x0 +#define SYN_R_D d0 +#define SYN_R_X x0 + + +#define SYN_PQ_DEFINE() v_t d0, x0 +#define SYN_PQ_D d0 +#define SYN_PQ_X x0 + + +#define REC_PQ_STRIDE 1 +#define REC_PQ_DEFINE() v_t x0, y0, t0 +#define REC_PQ_X x0 +#define REC_PQ_Y y0 +#define REC_PQ_T t0 + + +#define SYN_PR_DEFINE() v_t d0, x0 +#define SYN_PR_D d0 +#define SYN_PR_X x0 + +#define REC_PR_STRIDE 1 +#define REC_PR_DEFINE() v_t x0, y0, t0 +#define REC_PR_X x0 +#define REC_PR_Y y0 +#define REC_PR_T t0 + + +#define SYN_QR_DEFINE() v_t d0, x0 +#define SYN_QR_D d0 +#define SYN_QR_X x0 + + +#define REC_QR_STRIDE 1 +#define REC_QR_DEFINE() v_t x0, y0, t0 +#define REC_QR_X x0 +#define REC_QR_Y y0 +#define REC_QR_T t0 + + +#define SYN_PQR_DEFINE() v_t d0, x0 +#define SYN_PQR_D d0 +#define SYN_PQR_X x0 + +#define REC_PQR_STRIDE 1 +#define REC_PQR_DEFINE() v_t x0, y0, z0, xs0, ys0 +#define REC_PQR_X x0 +#define REC_PQR_Y y0 +#define REC_PQR_Z z0 +#define REC_PQR_XS xs0 +#define REC_PQR_YS ys0 #include "vdev_raidz_math_impl.h" -/* - * If compiled with -O0, gcc doesn't do any stack frame coalescing - * and -Wframe-larger-than=1024 is triggered in debug mode. - * Starting with gcc 4.8, new opt level -Og is introduced for debugging, which - * does not trigger this warning. - */ -#pragma GCC diagnostic ignored "-Wframe-larger-than=" - DEFINE_GEN_METHODS(scalar); DEFINE_REC_METHODS(scalar); diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c index 6fc81215a5..97ddfc9895 100644 --- a/module/zfs/vdev_raidz_math_sse2.c +++ b/module/zfs/vdev_raidz_math_sse2.c @@ -236,6 +236,10 @@ typedef struct v { #define MUL2(r...) \ { \ switch (REG_CNT(r)) { \ + case 4: \ + _MUL2_x2(VR0(r), VR1(r)); \ + _MUL2_x2(VR2(r), VR3(r)); \ + break; \ case 2: \ _MUL2_x2(VR0(r), VR1(r)); \ break; \ @@ -271,8 +275,8 @@ typedef struct v { if (x & 0x80) { MUL2(in); XOR(in, acc); } \ } -#define _mul_x1_in 9 -#define _mul_x1_acc 11 +#define _mul_x1_in 11 +#define _mul_x1_acc 12 #define MUL_x1_DEFINE(x) \ static void \ @@ -533,61 +537,87 @@ gf_x2_mul_fns[256] = { #define raidz_math_begin() kfpu_begin() #define raidz_math_end() kfpu_end() -#define GEN_P_DEFINE() {} +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 2 +#define MUL_DEFINE() {} +#define MUL_D 0, 1 + #define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} #define GEN_P_P 0, 1, 2, 3 +#define GEN_PQ_STRIDE 4 #define GEN_PQ_DEFINE() {} -#define GEN_PQ_STRIDE 2 -#define GEN_PQ_D 0, 1 -#define GEN_PQ_P 2, 3 -#define GEN_PQ_Q 4, 5 +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 +#define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() {} -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() {} -#define REC_Q_STRIDE 2 -#define REC_Q_X 0, 1 +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() {} -#define REC_R_STRIDE 2 -#define REC_R_X 0, 1 +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 -#define REC_PQ_DEFINE() {} #define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() {} #define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define REC_QR_DEFINE() {} #define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 -#define REC_PQR_DEFINE() {} #define REC_PQR_STRIDE 1 +#define REC_PQR_DEFINE() {} #define REC_PQR_X 0 #define REC_PQR_Y 1 #define REC_PQR_Z 2 -#define REC_PQR_D 3 -#define REC_PQR_XS 4 -#define REC_PQR_YS 5 +#define REC_PQR_XS 3 +#define REC_PQR_YS 4 #include diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c index 81f1b9a077..d8fa8fb828 100644 --- a/module/zfs/vdev_raidz_math_ssse3.c +++ b/module/zfs/vdev_raidz_math_ssse3.c @@ -337,59 +337,86 @@ typedef struct v { #define raidz_math_begin() kfpu_begin() #define raidz_math_end() kfpu_end() -#define GEN_P_DEFINE() {} + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + #define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} #define GEN_P_P 0, 1, 2, 3 -#define GEN_PQ_DEFINE() {} #define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 +#define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() {} -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() {} -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() {} -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 -#define REC_PQ_DEFINE() {} #define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() {} #define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define REC_QR_DEFINE() {} #define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 -#define REC_PQR_DEFINE() {} #define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 -#define REC_PQR_D 6, 7 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 @@ -403,13 +430,8 @@ DEFINE_REC_METHODS(ssse3); static boolean_t raidz_will_ssse3_work(void) { -/* ABD Bringup -- vector code not ready */ -#if 1 - return (B_FALSE); -#else return (zfs_sse_available() && zfs_sse2_available() && zfs_ssse3_available()); -#endif } const raidz_impl_ops_t vdev_raidz_ssse3_impl = {