From b69bebb535572ef905b065182d8c80d2fff5a8b4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 21 Apr 2024 16:37:06 +1000 Subject: [PATCH] libzpool/abd_os: iovec-based scatter abd This is intended to be a simple userspace scatter abd based on struct iovec. It's not very sophisticated as-is, but sets a base for something much more interesting. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16253 --- lib/libzpool/abd_os.c | 471 +++++++++++------------------- lib/libzpool/include/sys/abd_os.h | 4 +- 2 files changed, 174 insertions(+), 301 deletions(-) diff --git a/lib/libzpool/abd_os.c b/lib/libzpool/abd_os.c index de93f99a55..5a91605b2f 100644 --- a/lib/libzpool/abd_os.c +++ b/lib/libzpool/abd_os.c @@ -24,34 +24,6 @@ * Copyright (c) 2023, 2024, Klara Inc. */ -/* - * See abd.c for a general overview of the arc buffered data (ABD). - * - * Linear buffers act exactly like normal buffers and are always mapped into the - * kernel's virtual memory space, while scattered ABD data chunks are allocated - * as physical pages and then mapped in only while they are actually being - * accessed through one of the abd_* library functions. Using scattered ABDs - * provides several benefits: - * - * (1) They avoid use of kmem_*, preventing performance problems where running - * kmem_reap on very large memory systems never finishes and causes - * constant TLB shootdowns. - * - * (2) Fragmentation is less of an issue since when we are at the limit of - * allocatable space, we won't have to search around for a long free - * hole in the VA space for large ARC allocations. Each chunk is mapped in - * individually, so even if we are using HIGHMEM (see next point) we - * wouldn't need to worry about finding a contiguous address range. 
- * - * (3) If we are not using HIGHMEM, then all physical memory is always - * mapped into the kernel's address space, so we also avoid the map / - * unmap costs on each ABD access. - * - * If we are not using HIGHMEM, scattered buffers which have only one chunk - * can be treated as linear buffers, because they are contiguous in the - * kernel's virtual address space. See abd_alloc_chunks() for details. - */ - #include #include #include @@ -59,199 +31,112 @@ #include #include - -#define abd_for_each_sg(abd, sg, n, i) \ - for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) +/* + * We're simulating scatter/gather with 4K allocations, since that's more like + * what a typical kernel does. + */ +#define ABD_PAGESIZE (4096) +#define ABD_PAGESHIFT (12) +#define ABD_PAGEMASK (ABD_PAGESIZE-1) /* - * zfs_abd_scatter_min_size is the minimum allocation size to use scatter - * ABD's. Smaller allocations will use linear ABD's which uses - * zio_[data_]buf_alloc(). - * - * Scatter ABD's use at least one page each, so sub-page allocations waste - * some space when allocated as scatter (e.g. 2KB scatter allocation wastes - * half of each page). Using linear ABD's for small allocations means that - * they will be put on slabs which contain many allocations. This can - * improve memory efficiency, but it also makes it much harder for ARC - * evictions to actually free pages, because all the buffers on one slab need - * to be freed in order for the slab (and underlying pages) to be freed. - * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's - * possible for them to actually waste more memory than scatter (one page per - * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). - * - * Spill blocks are typically 512B and are heavily used on systems running - * selinux with the default dnode size and the `xattr=sa` property set. - * - * By default we use linear allocations for 512B and 1KB, and scatter - * allocations for larger (1.5KB and up). 
+ * See rationale in module/os/linux/zfs/abd_os.c, but in userspace this is + * mostly useful to get a mix of linear and scatter ABDs for testing. */ -static int zfs_abd_scatter_min_size = 512 * 3; +#define ABD_SCATTER_MIN_SIZE (512 * 3) -/* - * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are - * just a single zero'd page. This allows us to conserve memory by - * only using a single zero page for the scatterlist. - */ abd_t *abd_zero_scatter = NULL; -struct page; -/* - * abd_zero_page will be allocated with a zero'ed PAGESIZE buffer, which is - * assigned to each of the pages of abd_zero_scatter. - */ -static struct page *abd_zero_page = NULL; - -static kmem_cache_t *abd_cache = NULL; - static uint_t -abd_chunkcnt_for_bytes(size_t size) +abd_iovcnt_for_bytes(size_t size) { - return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); + /* + * Each iovec points to a 4K page. There's no real reason to do this + * in userspace, but our whole point here is to make it feel a bit + * more like a real paged memory model. + */ + return (P2ROUNDUP(size, ABD_PAGESIZE) / ABD_PAGESIZE); } abd_t * abd_alloc_struct_impl(size_t size) { /* - * In Linux we do not use the size passed in during ABD - * allocation, so we just ignore it. + * Zero-sized means it will be used for a linear or gang abd, so just + * allocate the abd itself and return. */ - (void) size; - abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); - ASSERT3P(abd, !=, NULL); + if (size == 0) + return (umem_alloc(sizeof (abd_t), UMEM_NOFAIL)); + /* + * Allocating for a scatter abd, so compute how many ABD_PAGESIZE + * iovecs we will need to hold this size. Append that allocation to the + * end. Note that struct abd_scatter already includes abd_iov[1], so we + * allocate one less iovec than we need. + * + * Note we're not allocating the pages proper, just the iovec pointers. + * That's down in abd_alloc_chunks. We _could_ do it here in a single + * allocation, but it's fiddly and harder to read for no real gain. 
+ */ + uint_t n = abd_iovcnt_for_bytes(size); + abd_t *abd = umem_alloc(sizeof (abd_t) + (n-1) * sizeof (struct iovec), + UMEM_NOFAIL); + ABD_SCATTER(abd).abd_offset = 0; + ABD_SCATTER(abd).abd_iovcnt = n; return (abd); } void abd_free_struct_impl(abd_t *abd) { - kmem_cache_free(abd_cache, abd); -} - -#define nth_page(pg, i) \ - ((struct page *)((void *)(pg) + (i) * PAGESIZE)) - -struct scatterlist { - struct page *page; - int length; - int end; -}; - -static void -sg_init_table(struct scatterlist *sg, int nr) -{ - memset(sg, 0, nr * sizeof (struct scatterlist)); - sg[nr - 1].end = 1; -} - -/* - * This must be called if any of the sg_table allocation functions - * are called. - */ -static void -abd_free_sg_table(abd_t *abd) -{ - int nents = ABD_SCATTER(abd).abd_nents; - vmem_free(ABD_SCATTER(abd).abd_sgl, - nents * sizeof (struct scatterlist)); -} - -#define for_each_sg(sgl, sg, nr, i) \ - for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) - -static inline void -sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, - unsigned int offset) -{ - /* currently we don't use offset */ - ASSERT(offset == 0); - sg->page = page; - sg->length = len; -} - -static inline struct page * -sg_page(struct scatterlist *sg) -{ - return (sg->page); -} - -static inline struct scatterlist * -sg_next(struct scatterlist *sg) -{ - if (sg->end) - return (NULL); - - return (sg + 1); + /* For scatter, compute the extra amount we need to free */ + uint_t iovcnt = + abd_is_linear(abd) || abd_is_gang(abd) ? + 0 : (ABD_SCATTER(abd).abd_iovcnt - 1); + umem_free(abd, sizeof (abd_t) + iovcnt * sizeof (struct iovec)); } void abd_alloc_chunks(abd_t *abd, size_t size) { - unsigned nr_pages = abd_chunkcnt_for_bytes(size); - struct scatterlist *sg; - int i; + /* + * We've already allocated the iovec array; ensure that the wanted size + * actually matches, otherwise the caller has made a mistake somewhere. 
+ */ + uint_t n = ABD_SCATTER(abd).abd_iovcnt; + ASSERT3U(n, ==, abd_iovcnt_for_bytes(size)); - ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); - - abd_for_each_sg(abd, sg, nr_pages, i) { - struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - sg_set_page(sg, p, PAGESIZE, 0); + /* + * Allocate an ABD_PAGESIZE region for each iovec. + */ + struct iovec *iov = ABD_SCATTER(abd).abd_iov; + for (int i = 0; i < n; i++) { + iov[i].iov_base = + umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL); + iov[i].iov_len = ABD_PAGESIZE; } - ABD_SCATTER(abd).abd_nents = nr_pages; } void abd_free_chunks(abd_t *abd) { - int i, n = ABD_SCATTER(abd).abd_nents; - struct scatterlist *sg; - - abd_for_each_sg(abd, sg, n, i) { - struct page *p = nth_page(sg_page(sg), 0); - umem_free_aligned(p, PAGESIZE); - } - abd_free_sg_table(abd); -} - -static void -abd_alloc_zero_scatter(void) -{ - unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); - struct scatterlist *sg; - int i; - - abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - memset(abd_zero_page, 0, PAGESIZE); - abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); - abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; - abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK; - ABD_SCATTER(abd_zero_scatter).abd_offset = 0; - ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; - abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - - sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages); - - abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { - sg_set_page(sg, abd_zero_page, PAGESIZE, 0); - } + uint_t n = ABD_SCATTER(abd).abd_iovcnt; + struct iovec *iov = ABD_SCATTER(abd).abd_iov; + for (int i = 0; i < n; i++) + umem_free_aligned(iov[i].iov_base, ABD_PAGESIZE); } boolean_t abd_size_alloc_linear(size_t size) { 
- return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size); + return (size < ABD_SCATTER_MIN_SIZE); } void abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); - int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size; + int waste = P2ROUNDUP(abd->abd_size, ABD_PAGESIZE) - abd->abd_size; if (op == ABDSTAT_INCR) { arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); } else { @@ -270,67 +155,72 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) void abd_verify_scatter(abd_t *abd) { - size_t n; - int i = 0; - struct scatterlist *sg = NULL; +#ifdef ZFS_DEBUG + /* + * scatter abds shall have: + * - at least one iovec + * - all iov_base point somewhere + * - all iov_len are ABD_PAGESIZE + * - offset set within the abd pages somewhere + */ + uint_t n = ABD_SCATTER(abd).abd_iovcnt; + ASSERT3U(n, >, 0); - ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); - ASSERT3U(ABD_SCATTER(abd).abd_offset, <, - ABD_SCATTER(abd).abd_sgl->length); - n = ABD_SCATTER(abd).abd_nents; - abd_for_each_sg(abd, sg, n, i) { - ASSERT3P(sg_page(sg), !=, NULL); + uint_t len = 0; + for (int i = 0; i < n; i++) { + ASSERT3P(ABD_SCATTER(abd).abd_iov[i].iov_base, !=, NULL); + ASSERT3U(ABD_SCATTER(abd).abd_iov[i].iov_len, ==, ABD_PAGESIZE); + len += ABD_PAGESIZE; } -} -static void -abd_free_zero_scatter(void) -{ - abd_free_sg_table(abd_zero_scatter); - abd_free_struct(abd_zero_scatter); - abd_zero_scatter = NULL; - ASSERT3P(abd_zero_page, !=, NULL); - umem_free_aligned(abd_zero_page, PAGESIZE); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, len); +#endif } void abd_init(void) { - abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); + /* + * Create the "zero" scatter abd. This is always the size of the + * largest possible block, but only actually has a single allocated + * page, which all iovecs in the abd point to. 
+ */ + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - abd_alloc_zero_scatter(); + void *zero = + umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL); + memset(zero, 0, ABD_PAGESIZE); + + uint_t n = abd_iovcnt_for_bytes(SPA_MAXBLOCKSIZE); + struct iovec *iov = ABD_SCATTER(abd_zero_scatter).abd_iov; + for (int i = 0; i < n; i++) { + iov[i].iov_base = zero; + iov[i].iov_len = ABD_PAGESIZE; + } } void abd_fini(void) { - abd_free_zero_scatter(); - - if (abd_cache) { - kmem_cache_destroy(abd_cache); - abd_cache = NULL; - } + umem_free_aligned( + ABD_SCATTER(abd_zero_scatter).abd_iov[0].iov_base, ABD_PAGESIZE); + abd_free_struct(abd_zero_scatter); + abd_zero_scatter = NULL; } void abd_free_linear_page(abd_t *abd) { + /* + * LINEAR_PAGE is specific to the Linux kernel; we never set this + * flag, so this will never be called. + */ (void) abd; - __builtin_unreachable(); + PANIC("unreachable"); } -/* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * On Linux the optimal thing to do would be to use abd_get_offset() and - * construct a new ABD which shares the original pages thereby eliminating - * the copy. But for the moment a new linear ABD is allocated until this - * performance optimization can be implemented. 
- */ abd_t * abd_alloc_for_io(size_t size, boolean_t is_metadata) { @@ -338,43 +228,60 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) } abd_t * -abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, - size_t size) +abd_get_offset_scatter(abd_t *dabd, abd_t *sabd, size_t off, size_t size) { - (void) size; - int i = 0; - struct scatterlist *sg = NULL; - - abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); - - size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; - - if (abd == NULL) - abd = abd_alloc_struct(0); /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. + * Create a new scatter dabd by borrowing data pages from sabd to cover + * off+size. + * + * sabd is an existing scatter abd with a set of iovecs, each covering + * an ABD_PAGESIZE (4K) allocation. Its "zero" is at abd_offset. + * + * [........][........][........][........] + * ^- sabd_offset + * + * We want to produce a new abd, referencing those allocations at the + * given offset. + * + * [........][........][........][........] + * ^- dabd_offset = sabd_offset + off + * ^- dabd_offset + size + * + * In this example, dabd needs three iovecs. The first iovec is offset + * 0, so the final dabd_offset is masked back into the first iovec. + * + * [........][........][........] + * ^- dabd_offset */ + size_t soff = ABD_SCATTER(sabd).abd_offset + off; + size_t doff = soff & ABD_PAGEMASK; + size_t iovcnt = abd_iovcnt_for_bytes(doff + size); - abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { - if (new_offset < sg->length) - break; - new_offset -= sg->length; - } + /* + * If the passed-in abd has enough allocated iovecs already, reuse it. + * Otherwise, make a new one. The caller will free the original if the + * one it gets back is not the same. + * + * Note that it's ok if we reuse an abd with more iovecs than we need. 
+ * abd_size has the usable amount of data, and the abd does not own the + * pages referenced by the iovecs. At worst, they're holding dangling + * pointers that we'll never use anyway. + */ + if (dabd == NULL || ABD_SCATTER(dabd).abd_iovcnt < iovcnt) + dabd = abd_alloc_struct(iovcnt << ABD_PAGESHIFT); - ABD_SCATTER(abd).abd_sgl = sg; - ABD_SCATTER(abd).abd_offset = new_offset; - ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + /* Set offset into first page in view */ + ABD_SCATTER(dabd).abd_offset = doff; - return (abd); + /* Copy the wanted iovecs from the source to the dest */ + memcpy(&ABD_SCATTER(dabd).abd_iov[0], + &ABD_SCATTER(sabd).abd_iov[soff >> ABD_PAGESHIFT], + iovcnt * sizeof (struct iovec)); + + return (dabd); } -/* - * Initialize the abd_iter. - */ void abd_iter_init(struct abd_iter *aiter, abd_t *abd) { @@ -382,16 +289,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) abd_verify(abd); memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - if (!abd_is_linear(abd)) { - aiter->iter_offset = ABD_SCATTER(abd).abd_offset; - aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; - } } -/* - * This is just a helper function to see if we have exhausted the - * abd_iter and reached the end. - */ boolean_t abd_iter_at_end(struct abd_iter *aiter) { @@ -399,83 +298,57 @@ abd_iter_at_end(struct abd_iter *aiter) return (aiter->iter_pos == aiter->iter_abd->abd_size); } -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. - */ void abd_iter_advance(struct abd_iter *aiter, size_t amount) { - /* - * Ensure that last chunk is not in use. abd_iterate_*() must clear - * this state (directly or abd_iter_unmap()) before advancing. 
- */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); - ASSERT3P(aiter->iter_page, ==, NULL); - ASSERT0(aiter->iter_page_doff); - ASSERT0(aiter->iter_page_dsize); - /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) return; aiter->iter_pos += amount; - aiter->iter_offset += amount; - if (!abd_is_linear(aiter->iter_abd)) { - while (aiter->iter_offset >= aiter->iter_sg->length) { - aiter->iter_offset -= aiter->iter_sg->length; - aiter->iter_sg = sg_next(aiter->iter_sg); - if (aiter->iter_sg == NULL) { - ASSERT0(aiter->iter_offset); - break; - } - } - } + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); } -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ void abd_iter_map(struct abd_iter *aiter) { - void *paddr; - size_t offset = 0; - ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); - /* There's nothing left to iterate over, so do nothing */ if (abd_iter_at_end(aiter)) return; if (abd_is_linear(aiter->iter_abd)) { - ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); - offset = aiter->iter_offset; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = ABD_LINEAR_BUF(aiter->iter_abd); - } else { - offset = aiter->iter_offset; - aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, - aiter->iter_abd->abd_size - aiter->iter_pos); - - paddr = sg_page(aiter->iter_sg); + aiter->iter_mapaddr = + ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + aiter->iter_mapsize = + aiter->iter_abd->abd_size - aiter->iter_pos; + return; } - aiter->iter_mapaddr = (char *)paddr + offset; + /* + * For scatter, we index into the appropriate iovec, and return the + * smaller of the amount requested, or up to the end of the page. 
+ */ + size_t poff = aiter->iter_pos + ABD_SCATTER(aiter->iter_abd).abd_offset; + + ASSERT3U(poff >> ABD_PAGESHIFT, <=, + ABD_SCATTER(aiter->iter_abd).abd_iovcnt); + struct iovec *iov = &ABD_SCATTER(aiter->iter_abd). + abd_iov[poff >> ABD_PAGESHIFT]; + + aiter->iter_mapsize = MIN(ABD_PAGESIZE - (poff & ABD_PAGEMASK), + aiter->iter_abd->abd_size - aiter->iter_pos); + ASSERT3U(aiter->iter_mapsize, <=, ABD_PAGESIZE); + + aiter->iter_mapaddr = iov->iov_base + (poff & ABD_PAGEMASK); } -/* - * Unmap the current chunk from aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ void abd_iter_unmap(struct abd_iter *aiter) { - /* There's nothing left to unmap, so do nothing */ if (abd_iter_at_end(aiter)) return; diff --git a/lib/libzpool/include/sys/abd_os.h b/lib/libzpool/include/sys/abd_os.h index 67f7e5606b..8ff6aa1e9e 100644 --- a/lib/libzpool/include/sys/abd_os.h +++ b/lib/libzpool/include/sys/abd_os.h @@ -32,8 +32,8 @@ extern "C" { struct abd_scatter { uint_t abd_offset; - uint_t abd_nents; - struct scatterlist *abd_sgl; + uint_t abd_iovcnt; + struct iovec abd_iov[1]; /* actually variable-length */ }; struct abd_linear {