From a206522c4fd31f03f14ba174d6159b72acfae0a9 Mon Sep 17 00:00:00 2001
From: Gvozden Neskovic
Date: Wed, 24 Aug 2016 15:42:51 +0200
Subject: [PATCH] ABD changes for vectorized RAIDZ

* userspace: aligned buffers. Minimum of 32B alignment is needed for AVX2.
Kernel buffers are aligned 512B or more.
* add abd_get_offset_size() interface
* abd_iter_map(): fix calculation of iter_mapsize
* add abd_raidz_gen_iterate() and abd_raidz_rec_iterate()

Signed-off-by: Gvozden Neskovic
---
 include/sys/abd.h       |  10 ++
 module/zfs/abd.c        | 198 ++++++++++++++++++++++++++++++++++++++--
 module/zfs/vdev_raidz.c |  16 ++--
 3 files changed, 210 insertions(+), 14 deletions(-)

diff --git a/include/sys/abd.h b/include/sys/abd.h
index 6e3530aeca..321c647133 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -84,6 +84,7 @@ abd_t *abd_alloc_for_io(size_t, boolean_t);
 abd_t *abd_alloc_sametype(abd_t *, size_t);
 void abd_free(abd_t *);
 abd_t *abd_get_offset(abd_t *, size_t);
+abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
 abd_t *abd_get_from_buf(void *, size_t);
 void abd_put(abd_t *);
 
@@ -119,6 +120,15 @@ unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int,
 unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
 #endif
 
+void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+	ssize_t csize, ssize_t dsize, const unsigned parity,
+	void (*func_raidz_gen)(void **, const void *, size_t, size_t));
+void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+	ssize_t tsize, const unsigned parity,
+	void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+	const unsigned *mul),
+	const unsigned *mul);
+
 /*
  * Wrappers for calls with offsets of 0
  */
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index 306c47536a..08bb0c52e3 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -228,8 +228,8 @@ abd_fini(void)
 struct page;
 #define	kpm_enable			1
 #define	abd_alloc_chunk() \
-	((struct page *)kmem_alloc(PAGESIZE, KM_SLEEP))
-#define	abd_free_chunk(chunk)		kmem_free(chunk, PAGESIZE)
+	((struct page *) umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP))
+#define	abd_free_chunk(chunk)		umem_free(chunk, PAGESIZE)
 #define	abd_map_chunk(chunk)		((void *)chunk)
 static void
 abd_unmap_chunk(struct page *c)
@@ -474,8 +474,8 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata)
  * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
  * any derived ABDs exist.
  */
-abd_t *
-abd_get_offset(abd_t *sabd, size_t off)
+static inline abd_t *
+abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
 {
 	abd_t *abd;
 
@@ -496,8 +496,8 @@ abd_get_offset(abd_t *sabd, size_t off)
 		    (char *)sabd->abd_u.abd_linear.abd_buf + off;
 	} else {
 		size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
-		size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
-		    (new_offset / PAGESIZE);
+		size_t chunkcnt = abd_chunkcnt_for_bytes(size +
+		    new_offset % PAGESIZE);
 
 		abd = abd_alloc_struct(chunkcnt);
 
@@ -517,7 +517,7 @@ abd_get_offset(abd_t *sabd, size_t off)
 		    chunkcnt * sizeof (void *));
 	}
 
-	abd->abd_size = sabd->abd_size - off;
+	abd->abd_size = size;
 	abd->abd_parent = sabd;
 	refcount_create(&abd->abd_children);
 	(void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
@@ -525,6 +525,24 @@ abd_get_offset(abd_t *sabd, size_t off)
 	return (abd);
 }
 
+/* Derive an ABD from sabd that starts at off and runs to sabd's end. */
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+	VERIFY3U(off, <, sabd->abd_size);
+	return (abd_get_offset_impl(sabd, off, sabd->abd_size - off));
+}
+
+/* Like abd_get_offset(), but the derived ABD spans exactly size bytes. */
+abd_t *
+abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
+{
+	/* Checked in two steps so that off + size cannot wrap around. */
+	ASSERT3U(off, <=, sabd->abd_size);
+	ASSERT3U(size, <=, sabd->abd_size - off);
+	return (abd_get_offset_impl(sabd, off, size));
+}
+
 /*
  * Allocate a linear ABD structure for buf. You must free this with abd_put()
  * since the resulting ABD doesn't own its own buffer.
@@ -757,10 +775,14 @@ abd_iter_map(struct abd_iter *aiter)
 	} else {
 		size_t index = abd_iter_scatter_chunk_index(aiter);
 		offset = abd_iter_scatter_chunk_offset(aiter);
-		aiter->iter_mapsize = PAGESIZE - offset;
+		/* Clamp the mapping to the bytes remaining in the ABD. */
+		aiter->iter_mapsize = MIN(PAGESIZE - offset,
+		    aiter->iter_abd->abd_size - aiter->iter_pos);
+
 		paddr = abd_map_chunk(
 		    aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]);
 	}
+
 	aiter->iter_mapaddr = (char *)paddr + offset;
 }
 
@@ -999,6 +1021,166 @@ abd_cmp(abd_t *dabd, abd_t *sabd)
 	    abd_cmp_cb, NULL));
 }
 
+/*
+ * Iterate over code ABDs and a data ABD and call @func_raidz_gen once for
+ * every segment that is mapped contiguously in all of them.
+ *
+ * @cabds		parity ABDs, must have equal size
+ * @dabd		data ABD. Can be NULL (in this case @dsize = 0)
+ * @csize		number of bytes to process in each parity ABD
+ * @dsize		number of bytes to consume from @dabd
+ * @parity		number of parity ABDs in @cabds, at most 3
+ * @func_raidz_gen	should be implemented so that its behaviour
+ *			is the same when taking linear and when taking scatter.
+ *			Gets a NULL data pointer whenever the data length is 0.
+ */
+void
+abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+	ssize_t csize, ssize_t dsize, const unsigned parity,
+	void (*func_raidz_gen)(void **, const void *, size_t, size_t))
+{
+	int i;
+	ssize_t len, dlen;
+	struct abd_iter caiters[3];
+	struct abd_iter daiter;
+	void *caddrs[3];
+	const void *daddr;
+
+	ASSERT3U(parity, <=, 3);
+
+	for (i = 0; i < parity; i++)
+		abd_iter_init(&caiters[i], cabds[i]);
+
+	if (dabd)
+		abd_iter_init(&daiter, dabd);
+
+	ASSERT3S(dsize, >=, 0);
+
+	while (csize > 0) {
+		len = csize;
+		dlen = 0;
+		daddr = NULL;
+
+		/* Only map the data ABD while it still has bytes left. */
+		if (dabd && dsize > 0)
+			abd_iter_map(&daiter);
+
+		/* Clamp the segment to the shortest parity mapping. */
+		for (i = 0; i < parity; i++) {
+			abd_iter_map(&caiters[i]);
+			caddrs[i] = caiters[i].iter_mapaddr;
+			len = MIN(caiters[i].iter_mapsize, len);
+		}
+
+		if (dabd && dsize > 0) {
+			/* this needs precise iter.length */
+			len = MIN(daiter.iter_mapsize, len);
+			dlen = len;
+			daddr = daiter.iter_mapaddr;
+		}
+
+		/* must be progressive */
+		ASSERT3S(len, >, 0);
+		/*
+		 * Every segment must be a multiple of 512 bytes (raidz);
+		 * the iterated function likely will not do well otherwise.
+		 */
+		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+		/*
+		 * Once the data is exhausted the callback gets daddr == NULL
+		 * and dlen == 0 rather than a stale or uninitialized mapping.
+		 */
+		func_raidz_gen(caddrs, daddr, len, dlen);
+
+		/* Unmap in the reverse order of the mappings above. */
+		for (i = parity-1; i >= 0; i--) {
+			abd_iter_unmap(&caiters[i]);
+			abd_iter_advance(&caiters[i], len);
+		}
+
+		if (dabd && dsize > 0) {
+			abd_iter_unmap(&daiter);
+			abd_iter_advance(&daiter, dlen);
+			dsize -= dlen;
+		}
+
+		csize -= len;
+
+		ASSERT3S(dsize, >=, 0);
+		ASSERT3S(csize, >=, 0);
+	}
+}
+
+/*
+ * Iterate over code ABDs and data reconstruction target ABDs and call
+ * @func_raidz_rec once for every segment that is mapped contiguously in
+ * all of them. Function maps at most 6 pages atomically.
+ *
+ * @cabds		parity ABDs, must have equal size
+ * @tabds		rec target ABDs, at most 3
+ * @tsize		size of data target columns
+ * @parity		number of ABDs in each of @cabds and @tabds, at most 3
+ * @func_raidz_rec	expects syndrome data in target columns. Function
+ *			reconstructs data and overwrites target columns.
+ * @mul			passed through to @func_raidz_rec unchanged
+ */
+void
+abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+	ssize_t tsize, const unsigned parity,
+	void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+	const unsigned *mul),
+	const unsigned *mul)
+{
+	int i;
+	ssize_t len;
+	struct abd_iter citers[3];
+	struct abd_iter xiters[3];
+	void *caddrs[3], *xaddrs[3];
+
+	ASSERT3U(parity, <=, 3);
+
+	for (i = 0; i < parity; i++) {
+		abd_iter_init(&citers[i], cabds[i]);
+		abd_iter_init(&xiters[i], tabds[i]);
+	}
+
+	while (tsize > 0) {
+		len = tsize;
+
+		/* Clamp the segment to the shortest mapping of any column. */
+		for (i = 0; i < parity; i++) {
+			abd_iter_map(&citers[i]);
+			abd_iter_map(&xiters[i]);
+			caddrs[i] = citers[i].iter_mapaddr;
+			xaddrs[i] = xiters[i].iter_mapaddr;
+			len = MIN(xiters[i].iter_mapsize, len);
+			len = MIN(citers[i].iter_mapsize, len);
+		}
+
+		/* must be progressive */
+		ASSERT3S(len, >, 0);
+		/*
+		 * Every segment must be a multiple of 512 bytes (raidz);
+		 * the iterated function likely will not do well otherwise.
+		 */
+		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+		func_raidz_rec(xaddrs, len, caddrs, mul);
+
+		/* Unmap in the reverse order of the mappings above. */
+		for (i = parity-1; i >= 0; i--) {
+			abd_iter_unmap(&xiters[i]);
+			abd_iter_unmap(&citers[i]);
+			abd_iter_advance(&xiters[i], len);
+			abd_iter_advance(&citers[i], len);
+		}
+
+		tsize -= len;
+		ASSERT3S(tsize, >=, 0);
+	}
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 /*
  * bio_nr_pages for ABD.
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index d08fdab13f..a92d3cbaad 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -243,8 +243,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 			offset = 0;
 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
 				abd_put(rm->rm_col[x].rc_abd);
-				rm->rm_col[x].rc_abd = abd_get_offset(
-				    rm->rm_abd_copy, offset);
+				rm->rm_col[x].rc_abd = abd_get_offset_size(
+				    rm->rm_abd_copy, offset,
+				    rm->rm_col[x].rc_size);
 				offset += rm->rm_col[x].rc_size;
 			}
 		}
@@ -310,7 +311,8 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 
 	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		raidz_col_t *col = &rm->rm_col[c];
-		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
+		abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
+		    col->rc_size);
 
 		abd_copy(tmp, col->rc_abd, col->rc_size);
 		abd_put(col->rc_abd);
@@ -432,13 +434,15 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 
 	for (c = 0; c < rm->rm_firstdatacol; c++)
 		rm->rm_col[c].rc_abd =
-		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
+		    abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);
 
-	rm->rm_col[c].rc_abd = abd_get_offset(zio->io_abd, 0);
+	rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
+	    rm->rm_col[c].rc_size);
 	off = rm->rm_col[c].rc_size;
 
 	for (c = c + 1; c < acols; c++) {
-		rm->rm_col[c].rc_abd = abd_get_offset(zio->io_abd, off);
+		rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off,
+		    rm->rm_col[c].rc_size);
 		off += rm->rm_col[c].rc_size;
 	}
 