allow callers to allocate and provide the abd_t struct

The `abd_get_offset_*()` routines create an abd_t that references
another abd_t, and doesn't allocate any pages/buffers of its own.  In
some workloads, these routines may be called frequently, to create many
abd_t's representing small pieces of a single large abd_t.  In
particular, the upcoming RAIDZ Expansion project makes heavy use of
these routines.

This commit adds the ability for the caller to allocate and provide the
abd_t struct to a variant of `abd_get_offset_*()`.  This eliminates the
cost of allocating the abd_t and performing the accounting associated
with it (`abdstat_struct_size`).  The RAIDZ/DRAID code uses this for
the `rc_abd`, which references the zio's abd.  The upcoming RAIDZ
Expansion project will leverage this infrastructure to increase
performance of reads post-expansion by around 50%.

Additionally, some of the interfaces around creating and destroying
abd_t's are cleaned up.  Most significantly, the distinction between
`abd_put()` and `abd_free()` is eliminated; all types of abd_t's are
now disposed of with `abd_free()`.

Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Issue #8853 
Closes #11439
This commit is contained in:
Matthew Ahrens 2021-01-20 11:24:37 -08:00 committed by GitHub
parent 03f036cbcc
commit e2af2acce3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 261 additions and 295 deletions

View File

@ -492,8 +492,9 @@ vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
(dc - r) * (rows - 1) + row;
}
rr->rr_col[c].rc_size = 1ULL << ashift;
rr->rr_col[c].rc_abd =
abd_get_offset(abd, off << ashift);
rr->rr_col[c].rc_abd = abd_get_offset_struct(
&rr->rr_col[c].rc_abdstruct,
abd, off << ashift, 1 << ashift);
}
asize += rr->rr_col[c].rc_size;

View File

@ -35,8 +35,46 @@
extern "C" {
#endif
struct abd; /* forward declaration */
typedef struct abd abd_t;
/*
 * Bit flags kept in abd_t.abd_flags.  Every flag occupies a distinct bit, so
 * flags are combined with bitwise OR and tested with bitwise AND.
 */
typedef enum abd_flags {
	/* is buffer linear (or scattered)? */
	ABD_FLAG_LINEAR		= 0x001,
	/* does it own its data buffers? */
	ABD_FLAG_OWNER		= 0x002,
	/* does this represent FS metadata? */
	ABD_FLAG_META		= 0x004,
	/* pages split over memory zones */
	ABD_FLAG_MULTI_ZONE	= 0x008,
	/* pages split over multiple chunks */
	ABD_FLAG_MULTI_CHUNK	= 0x010,
	/* linear but allocd from page */
	ABD_FLAG_LINEAR_PAGE	= 0x020,
	/* mult ABDs chained together */
	ABD_FLAG_GANG		= 0x040,
	/* gang ABD is responsible for mem */
	ABD_FLAG_GANG_FREE	= 0x080,
	/* ABD for zero-filled buffer */
	ABD_FLAG_ZEROS		= 0x100,
	/* we allocated the abd_t */
	ABD_FLAG_ALLOCD		= 0x200,
} abd_flags_t;
/*
 * Descriptor for a buffer of data.  abd_flags determines which member of
 * abd_u is meaningful: abd_linear when ABD_FLAG_LINEAR is set, abd_gang when
 * ABD_FLAG_GANG is set, and abd_scatter otherwise.
 */
typedef struct abd {
abd_flags_t abd_flags; /* bitwise OR of ABD_FLAG_* values */
uint_t abd_size; /* excludes scattered abd_offset */
/* linkage used while this abd is a member of a gang abd's chain */
list_node_t abd_gang_link;
/* abd this one was derived from by abd_get_offset*(), otherwise NULL */
struct abd *abd_parent;
/* holds taken by abds derived from this one (abd_size bytes each) */
zfs_refcount_t abd_children;
/* held while this abd is being removed from a gang abd's chain */
kmutex_t abd_mtx;
union {
struct abd_scatter {
uint_t abd_offset; /* bytes to skip at the start of the scatter data */
#if defined(__FreeBSD__) && defined(_KERNEL)
uint_t abd_chunk_size;
/*
 * Allocated structs are sized with
 * offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); a
 * caller-provided abd_t only has room for what fits in sizeof (abd_t).
 */
void *abd_chunks[1]; /* actually variable-length */
#else
uint_t abd_nents; /* number of entries in abd_sgl */
struct scatterlist *abd_sgl;
#endif
} abd_scatter;
struct abd_linear {
void *abd_buf; /* address of the linear data */
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
} abd_linear;
struct abd_gang {
/* member abds, linked through their abd_gang_link */
list_t abd_gang_chain;
} abd_gang;
} abd_u;
} abd_t;
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
@ -49,14 +87,14 @@ extern int zfs_abd_scatter_enabled;
abd_t *abd_alloc(size_t, boolean_t);
abd_t *abd_alloc_linear(size_t, boolean_t);
abd_t *abd_alloc_gang_abd(void);
abd_t *abd_alloc_gang(void);
abd_t *abd_alloc_for_io(size_t, boolean_t);
abd_t *abd_alloc_sametype(abd_t *, size_t);
void abd_gang_add(abd_t *, abd_t *, boolean_t);
void abd_free(abd_t *);
void abd_put(abd_t *);
abd_t *abd_get_offset(abd_t *, size_t);
abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t);
abd_t *abd_get_zeros(size_t);
abd_t *abd_get_from_buf(void *, size_t);
void abd_cache_reap_now(void);
@ -87,7 +125,6 @@ int abd_cmp(abd_t *, abd_t *);
int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
void abd_zero_off(abd_t *, size_t, size_t);
void abd_verify(abd_t *);
uint_t abd_get_size(abd_t *);
void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
ssize_t csize, ssize_t dsize, const unsigned parity,
@ -135,9 +172,29 @@ abd_zero(abd_t *abd, size_t size)
/*
* ABD type check functions
*/
boolean_t abd_is_linear(abd_t *);
boolean_t abd_is_gang(abd_t *);
boolean_t abd_is_linear_page(abd_t *);
/*
 * Report whether the ABD is linear, i.e. has ABD_FLAG_LINEAR set.
 * Yields a nonzero (true) value when it does and zero otherwise.
 */
static inline boolean_t
abd_is_linear(abd_t *abd)
{
	/* ABD_FLAG_LINEAR is a single bit, so equality means "bit is set". */
	return ((abd->abd_flags & ABD_FLAG_LINEAR) == ABD_FLAG_LINEAR);
}
/*
 * Report whether the ABD has ABD_FLAG_LINEAR_PAGE set (a linear ABD whose
 * buffer was allocated from a page).  Yields nonzero when set, else zero.
 */
static inline boolean_t
abd_is_linear_page(abd_t *abd)
{
	/* Single-bit flag: equality with the mask means the bit is set. */
	return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) ==
	    ABD_FLAG_LINEAR_PAGE);
}
/*
 * Report whether the ABD is a gang ABD, i.e. has ABD_FLAG_GANG set.
 * Yields a nonzero (true) value when it does and zero otherwise.
 */
static inline boolean_t
abd_is_gang(abd_t *abd)
{
	/* Single-bit flag: equality with the mask means the bit is set. */
	return ((abd->abd_flags & ABD_FLAG_GANG) == ABD_FLAG_GANG);
}
/*
 * Return the size recorded in the ABD (abd_size).  No verification of the
 * ABD is performed here.
 */
static inline uint_t
abd_get_size(abd_t *abd)
{
	uint_t size = abd->abd_size;

	return (size);
}
/*
* Module lifecycle

View File

@ -32,51 +32,11 @@
extern "C" {
#endif
/*
 * Bit flags kept in the abd_flags field of struct abd.  Every flag occupies
 * a distinct bit, so flags are combined with bitwise OR.
 */
typedef enum abd_flags {
	/* is buffer linear (or scattered)? */
	ABD_FLAG_LINEAR		= 0x001,
	/* does it own its data buffers? */
	ABD_FLAG_OWNER		= 0x002,
	/* does this represent FS metadata? */
	ABD_FLAG_META		= 0x004,
	/* pages split over memory zones */
	ABD_FLAG_MULTI_ZONE	= 0x008,
	/* pages split over multiple chunks */
	ABD_FLAG_MULTI_CHUNK	= 0x010,
	/* linear but allocd from page */
	ABD_FLAG_LINEAR_PAGE	= 0x020,
	/* mult ABDs chained together */
	ABD_FLAG_GANG		= 0x040,
	/* gang ABD is responsible for mem */
	ABD_FLAG_GANG_FREE	= 0x080,
	/* ABD for zero-filled buffer */
	ABD_FLAG_ZEROS		= 0x100,
} abd_flags_t;
/* Direction in which an ABD statistics update moves the counters. */
typedef enum abd_stats_op {
	ABDSTAT_INCR = 0,	/* Increase abdstat values */
	ABDSTAT_DECR = 1	/* Decrease abdstat values */
} abd_stats_op_t;
/*
 * Descriptor for a buffer of data.  abd_flags determines which member of
 * abd_u is meaningful: abd_linear when ABD_FLAG_LINEAR is set, abd_gang when
 * ABD_FLAG_GANG is set, and abd_scatter otherwise.
 */
struct abd {
abd_flags_t abd_flags; /* bitwise OR of ABD_FLAG_* values */
uint_t abd_size; /* excludes scattered abd_offset */
/* linkage used while this abd is a member of a gang abd's chain */
list_node_t abd_gang_link;
/* abd this one was derived from by abd_get_offset*(), otherwise NULL */
struct abd *abd_parent;
/* holds taken by abds derived from this one (abd_size bytes each) */
zfs_refcount_t abd_children;
/* held while this abd is being removed from a gang abd's chain */
kmutex_t abd_mtx;
union {
struct abd_scatter {
uint_t abd_offset; /* bytes to skip at the start of the scatter data */
#if defined(__FreeBSD__) && defined(_KERNEL)
uint_t abd_chunk_size;
/* flexible array; the struct is allocated with room for the chunks */
void *abd_chunks[];
#else
uint_t abd_nents; /* number of entries in abd_sgl */
struct scatterlist *abd_sgl;
#endif
} abd_scatter;
struct abd_linear {
void *abd_buf; /* address of the linear data */
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
} abd_linear;
struct abd_gang {
/* member abds, linked through their abd_gang_link */
list_t abd_gang_chain;
} abd_gang;
} abd_u;
};
struct scatterlist; /* forward declaration */
struct abd_iter {
@ -95,14 +55,16 @@ struct abd_iter {
extern abd_t *abd_zero_scatter;
abd_t *abd_gang_get_offset(abd_t *, size_t *);
abd_t *abd_alloc_struct(size_t);
void abd_free_struct(abd_t *);
/*
* OS specific functions
*/
abd_t *abd_alloc_struct(size_t);
abd_t *abd_get_offset_scatter(abd_t *, size_t);
void abd_free_struct(abd_t *);
abd_t *abd_alloc_struct_impl(size_t);
abd_t *abd_get_offset_scatter(abd_t *, abd_t *, size_t);
void abd_free_struct_impl(abd_t *);
void abd_alloc_chunks(abd_t *, size_t);
void abd_free_chunks(abd_t *);
boolean_t abd_size_alloc_linear(size_t);

View File

@ -106,6 +106,7 @@ typedef struct raidz_col {
uint64_t rc_devidx; /* child device index for I/O */
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
abd_t rc_abdstruct; /* rc_abd probably points here */
abd_t *rc_abd; /* I/O data */
void *rc_orig_data; /* pre-reconstruction */
abd_t *rc_gdata; /* used to store the "good" version */

View File

@ -202,7 +202,7 @@ abd_free_chunks(abd_t *abd)
}
abd_t *
abd_alloc_struct(size_t size)
abd_alloc_struct_impl(size_t size)
{
uint_t chunkcnt = abd_chunkcnt_for_bytes(size);
/*
@ -216,22 +216,18 @@ abd_alloc_struct(size_t size)
offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
ASSERT3P(abd, !=, NULL);
list_link_init(&abd->abd_gang_link);
mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
ABDSTAT_INCR(abdstat_struct_size, abd_size);
return (abd);
}
void
abd_free_struct(abd_t *abd)
abd_free_struct_impl(abd_t *abd)
{
uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 :
abd_scatter_chunkcnt(abd);
ssize_t size = MAX(sizeof (abd_t),
offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
mutex_destroy(&abd->abd_mtx);
ASSERT(!list_link_active(&abd->abd_gang_link));
kmem_free(abd, size);
ABDSTAT_INCR(abdstat_struct_size, -size);
}
@ -249,10 +245,8 @@ abd_alloc_zero_scatter(void)
abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags = ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
abd_zero_scatter->abd_parent = NULL;
zfs_refcount_create(&abd_zero_scatter->abd_children);
ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
ABD_SCATTER(abd_zero_scatter).abd_chunk_size =
@ -270,7 +264,6 @@ abd_alloc_zero_scatter(void)
static void
abd_free_zero_scatter(void)
{
zfs_refcount_destroy(&abd_zero_scatter->abd_children);
ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size);
@ -355,10 +348,8 @@ abd_alloc_scatter_offset_chunkcnt(size_t chunkcnt)
}
abd_t *
abd_get_offset_scatter(abd_t *sabd, size_t off)
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
{
abd_t *abd = NULL;
abd_verify(sabd);
ASSERT3U(off, <=, sabd->abd_size);
@ -366,14 +357,24 @@ abd_get_offset_scatter(abd_t *sabd, size_t off)
uint_t chunkcnt = abd_scatter_chunkcnt(sabd) -
(new_offset / zfs_abd_chunk_size);
abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt);
/*
* If an abd struct is provided, it is only the minimum size. If we
* need additional chunks, we need to allocate a new struct.
*/
if (abd != NULL &&
offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) >
sizeof (abd_t)) {
abd = NULL;
}
if (abd == NULL)
abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size);
/*
* Even if this buf is filesystem metadata, we only track that
* if we own the underlying data buffer, which is not true in
* this case. Therefore, we don't ever use ABD_FLAG_META here.
*/
abd->abd_flags = 0;
ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size;
ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;

View File

@ -185,7 +185,7 @@ abd_chunkcnt_for_bytes(size_t size)
}
abd_t *
abd_alloc_struct(size_t size)
abd_alloc_struct_impl(size_t size)
{
/*
* In Linux we do not use the size passed in during ABD
@ -193,18 +193,14 @@ abd_alloc_struct(size_t size)
*/
abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
ASSERT3P(abd, !=, NULL);
list_link_init(&abd->abd_gang_link);
mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
return (abd);
}
void
abd_free_struct(abd_t *abd)
abd_free_struct_impl(abd_t *abd)
{
mutex_destroy(&abd->abd_mtx);
ASSERT(!list_link_active(&abd->abd_gang_link));
kmem_cache_free(abd_cache, abd);
ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}
@ -472,14 +468,12 @@ abd_alloc_zero_scatter(void)
ASSERT3U(table.nents, ==, nr_pages);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags = ABD_FLAG_OWNER;
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
abd_zero_scatter->abd_parent = NULL;
abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
zfs_refcount_create(&abd_zero_scatter->abd_children);
abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
@ -599,12 +593,11 @@ abd_alloc_zero_scatter(void)
abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
memset(abd_zero_page, 0, PAGESIZE);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
abd_zero_scatter->abd_flags = ABD_FLAG_OWNER;
abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
abd_zero_scatter->abd_parent = NULL;
zfs_refcount_create(&abd_zero_scatter->abd_children);
ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
sizeof (struct scatterlist), KM_SLEEP);
@ -678,7 +671,6 @@ abd_verify_scatter(abd_t *abd)
static void
abd_free_zero_scatter(void)
{
zfs_refcount_destroy(&abd_zero_scatter->abd_children);
ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
@ -747,9 +739,7 @@ abd_free_linear_page(abd_t *abd)
ABD_SCATTER(abd).abd_sgl = sg;
abd_free_chunks(abd);
zfs_refcount_destroy(&abd->abd_children);
abd_update_scatter_stats(abd, ABDSTAT_DECR);
abd_free_struct(abd);
}
/*
@ -770,9 +760,8 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata)
}
abd_t *
abd_get_offset_scatter(abd_t *sabd, size_t off)
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
{
abd_t *abd = NULL;
int i = 0;
struct scatterlist *sg = NULL;
@ -781,6 +770,7 @@ abd_get_offset_scatter(abd_t *sabd, size_t off)
size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
if (abd == NULL)
abd = abd_alloc_struct(0);
/*
@ -788,7 +778,6 @@ abd_get_offset_scatter(abd_t *sabd, size_t off)
* if we own the underlying data buffer, which is not true in
* this case. Therefore, we don't ever use ABD_FLAG_META here.
*/
abd->abd_flags = 0;
abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
if (new_offset < sg->length)

View File

@ -105,26 +105,6 @@
/* see block comment above for description */
int zfs_abd_scatter_enabled = B_TRUE;
/*
 * Return B_TRUE if the ABD has ABD_FLAG_LINEAR set, B_FALSE otherwise.
 */
boolean_t
abd_is_linear(abd_t *abd)
{
	if ((abd->abd_flags & ABD_FLAG_LINEAR) == 0)
		return (B_FALSE);

	return (B_TRUE);
}
/*
 * Return B_TRUE if the ABD has ABD_FLAG_LINEAR_PAGE set, B_FALSE otherwise.
 */
boolean_t
abd_is_linear_page(abd_t *abd)
{
	if ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) == 0)
		return (B_FALSE);

	return (B_TRUE);
}
/*
 * Return B_TRUE if the ABD has ABD_FLAG_GANG set, B_FALSE otherwise.
 */
boolean_t
abd_is_gang(abd_t *abd)
{
	if ((abd->abd_flags & ABD_FLAG_GANG) == 0)
		return (B_FALSE);

	return (B_TRUE);
}
void
abd_verify(abd_t *abd)
{
@ -133,7 +113,7 @@ abd_verify(abd_t *abd)
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS));
ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
@ -153,11 +133,39 @@ abd_verify(abd_t *abd)
}
}
uint_t
abd_get_size(abd_t *abd)
static void
abd_init_struct(abd_t *abd)
{
abd_verify(abd);
return (abd->abd_size);
list_link_init(&abd->abd_gang_link);
mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
zfs_refcount_create(&abd->abd_children);
abd->abd_flags = 0;
abd->abd_parent = NULL;
abd->abd_size = 0;
}
/*
 * Tear down the common state of an abd_t: destroy its mutex and its
 * abd_children refcount.  The abd must no longer be linked into a gang
 * chain (asserted).  The memory holding the abd_t is not freed here.
 */
static void
abd_fini_struct(abd_t *abd)
{
mutex_destroy(&abd->abd_mtx);
ASSERT(!list_link_active(&abd->abd_gang_link));
zfs_refcount_destroy(&abd->abd_children);
}
/*
 * Allocate an abd_t through the OS-specific allocator, initialize its common
 * state, and mark it ABD_FLAG_ALLOCD so that it is known to have been
 * allocated here.  'size' is passed through to abd_alloc_struct_impl().
 */
abd_t *
abd_alloc_struct(size_t size)
{
	abd_t *new_abd;

	new_abd = abd_alloc_struct_impl(size);
	abd_init_struct(new_abd);
	new_abd->abd_flags |= ABD_FLAG_ALLOCD;

	return (new_abd);
}
/*
 * Tear down an abd_t's common state and hand its memory back to the
 * OS-specific allocator.
 *
 * NOTE(review): ABD_FLAG_ALLOCD is not checked here, so this presumably must
 * only be used on structs obtained from abd_alloc_struct() -- confirm
 * against callers.
 */
void
abd_free_struct(abd_t *abd)
{
abd_fini_struct(abd);
abd_free_struct_impl(abd);
}
/*
@ -173,7 +181,7 @@ abd_alloc(size_t size, boolean_t is_metadata)
VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
abd_t *abd = abd_alloc_struct(size);
abd->abd_flags = ABD_FLAG_OWNER;
abd->abd_flags |= ABD_FLAG_OWNER;
abd->abd_u.abd_scatter.abd_offset = 0;
abd_alloc_chunks(abd, size);
@ -181,65 +189,12 @@ abd_alloc(size_t size, boolean_t is_metadata)
abd->abd_flags |= ABD_FLAG_META;
}
abd->abd_size = size;
abd->abd_parent = NULL;
zfs_refcount_create(&abd->abd_children);
abd_update_scatter_stats(abd, ABDSTAT_INCR);
return (abd);
}
/*
 * Free a scatter ABD: release its chunks, destroy its abd_children refcount,
 * remove its contribution from the scatter statistics, and finally free the
 * abd_t itself.
 */
static void
abd_free_scatter(abd_t *abd)
{
abd_free_chunks(abd);
zfs_refcount_destroy(&abd->abd_children);
/* stats are updated before abd_free_struct() releases the abd */
abd_update_scatter_stats(abd, ABDSTAT_DECR);
abd_free_struct(abd);
}
/*
 * Empty a gang ABD's chain: unlink each child from the head of the list,
 * subtract its size from the gang's size and abd_put() it.  Children flagged
 * ABD_FLAG_GANG_FREE are not expected here.  The chain list is destroyed
 * once it is empty; the gang abd_t itself is left to the caller.
 */
static void
abd_put_gang_abd(abd_t *abd)
{
	ASSERT(abd_is_gang(abd));

	for (;;) {
		abd_t *cabd = list_remove_head(&ABD_GANG(abd).abd_gang_chain);

		if (cabd == NULL)
			break;

		ASSERT0(cabd->abd_flags & ABD_FLAG_GANG_FREE);
		abd->abd_size -= cabd->abd_size;
		abd_put(cabd);
	}

	/* Every child's size has been subtracted, so nothing may remain. */
	ASSERT0(abd->abd_size);
	list_destroy(&ABD_GANG(abd).abd_gang_chain);
}
/*
 * Release an ABD that does not own its data, i.e. one obtained from
 * abd_get_offset() or abd_get_from_buf().  The underlying scatterlist or
 * buffer is left untouched; only the abd_t (and, for a gang ABD, its chain)
 * is torn down.  Passing NULL is a no-op.
 */
void
abd_put(abd_t *abd)
{
	abd_t *parent;

	if (abd == NULL)
		return;

	abd_verify(abd);
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));

	/* Drop the hold this ABD took on the ABD it was derived from. */
	parent = abd->abd_parent;
	if (parent != NULL) {
		(void) zfs_refcount_remove_many(&parent->abd_children,
		    abd->abd_size, abd);
	}

	if (abd_is_gang(abd))
		abd_put_gang_abd(abd);

	zfs_refcount_destroy(&abd->abd_children);
	abd_free_struct(abd);
}
/*
* Allocate an ABD that must be linear, along with its own underlying data
* buffer. Only use this when it would be very annoying to write your ABD
@ -252,13 +207,11 @@ abd_alloc_linear(size_t size, boolean_t is_metadata)
VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
if (is_metadata) {
abd->abd_flags |= ABD_FLAG_META;
}
abd->abd_size = size;
abd->abd_parent = NULL;
zfs_refcount_create(&abd->abd_children);
if (is_metadata) {
ABD_LINEAR_BUF(abd) = zio_buf_alloc(size);
@ -284,19 +237,16 @@ abd_free_linear(abd_t *abd)
zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
}
zfs_refcount_destroy(&abd->abd_children);
abd_update_linear_stats(abd, ABDSTAT_DECR);
abd_free_struct(abd);
}
static void
abd_free_gang_abd(abd_t *abd)
{
ASSERT(abd_is_gang(abd));
abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
abd_t *cabd;
while (cabd != NULL) {
while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) {
/*
* We must acquire the child ABDs mutex to ensure that if it
* is being added to another gang ABD we will set the link
@ -308,23 +258,30 @@ abd_free_gang_abd(abd_t *abd)
list_remove(&ABD_GANG(abd).abd_gang_chain, cabd);
mutex_exit(&cabd->abd_mtx);
abd->abd_size -= cabd->abd_size;
if (cabd->abd_flags & ABD_FLAG_GANG_FREE) {
if (cabd->abd_flags & ABD_FLAG_OWNER)
if (cabd->abd_flags & ABD_FLAG_GANG_FREE)
abd_free(cabd);
else
abd_put(cabd);
}
cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
}
ASSERT0(abd->abd_size);
list_destroy(&ABD_GANG(abd).abd_gang_chain);
zfs_refcount_destroy(&abd->abd_children);
abd_free_struct(abd);
}
/*
 * Release the data chunks of a scatter ABD and remove its contribution from
 * the scatter statistics.  The abd_t itself is not torn down or freed here;
 * abd_free() does that after calling this.
 */
static void
abd_free_scatter(abd_t *abd)
{
abd_free_chunks(abd);
abd_update_scatter_stats(abd, ABDSTAT_DECR);
}
/*
* Free an ABD. Only use this on ABDs allocated with abd_alloc(),
* abd_alloc_linear(), or abd_alloc_gang_abd().
* Free an ABD. Use with any kind of abd: those created with abd_alloc_*()
* and abd_get_*(), including abd_get_offset_struct().
*
* If the ABD was created with abd_alloc_*(), the underlying data
* (scatterlist or linear buffer) will also be freed. (Subject to ownership
* changes via abd_*_ownership_of_buf().)
*
* Unless the ABD was created with abd_get_offset_struct(), the abd_t will
* also be freed.
*/
void
abd_free(abd_t *abd)
@ -333,14 +290,26 @@ abd_free(abd_t *abd)
return;
abd_verify(abd);
ASSERT3P(abd->abd_parent, ==, NULL);
ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd))
abd_free_linear(abd);
else if (abd_is_gang(abd))
IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL);
if (abd_is_gang(abd)) {
abd_free_gang_abd(abd);
else
} else if (abd_is_linear(abd)) {
if (abd->abd_flags & ABD_FLAG_OWNER)
abd_free_linear(abd);
} else {
if (abd->abd_flags & ABD_FLAG_OWNER)
abd_free_scatter(abd);
}
if (abd->abd_parent != NULL) {
(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
abd->abd_size, abd);
}
abd_fini_struct(abd);
if (abd->abd_flags & ABD_FLAG_ALLOCD)
abd_free_struct_impl(abd);
}
/*
@ -359,24 +328,18 @@ abd_alloc_sametype(abd_t *sabd, size_t size)
}
}
/*
* Create gang ABD that will be the head of a list of ABD's. This is used
* to "chain" scatter/gather lists together when constructing aggregated
* IO's. To free this abd, abd_free() must be called.
*/
abd_t *
abd_alloc_gang_abd(void)
abd_alloc_gang(void)
{
abd_t *abd;
abd = abd_alloc_struct(0);
abd->abd_flags = ABD_FLAG_GANG | ABD_FLAG_OWNER;
abd->abd_size = 0;
abd->abd_parent = NULL;
abd_t *abd = abd_alloc_struct(0);
abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER;
list_create(&ABD_GANG(abd).abd_gang_chain,
sizeof (abd_t), offsetof(abd_t, abd_gang_link));
zfs_refcount_create(&abd->abd_children);
return (abd);
}
@ -392,8 +355,8 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
if (free_on_free) {
/*
* If the parent is responsible for freeing the child gang
* ABD we will just splice the childs children ABD list to
* the parents list and immediately free the child gang ABD
* ABD we will just splice the child's children ABD list to
* the parent's list and immediately free the child gang ABD
* struct. The parent gang ABD's children from the child gang
* will retain all the free_on_free settings after being
* added to the parent's list.
@ -431,7 +394,7 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
/*
* If the child being added is a gang ABD, we will add the
* childs ABDs to the parent gang ABD. This alllows us to account
* child's ABDs to the parent gang ABD. This allows us to account
* for the offset correctly in the parent gang ABD.
*/
if (abd_is_gang(cabd)) {
@ -515,68 +478,89 @@ abd_gang_get_offset(abd_t *abd, size_t *off)
}
/*
* Allocate a new ABD to point to offset off of sabd. It shares the underlying
* buffer data with sabd. Use abd_put() to free. sabd must not be freed while
* any derived ABDs exist.
* Allocate a new ABD, using the provided struct (if non-NULL, and if
* circumstances allow - otherwise allocate the struct). The returned ABD will
* point to offset off of sabd. It shares the underlying buffer data with sabd.
* Use abd_free() to free. sabd must not be freed while any derived ABDs exist.
*/
static abd_t *
abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
{
abd_t *abd = NULL;
abd_verify(sabd);
ASSERT3U(off, <=, sabd->abd_size);
ASSERT3U(off + size, <=, sabd->abd_size);
if (abd_is_linear(sabd)) {
if (abd == NULL)
abd = abd_alloc_struct(0);
/*
* Even if this buf is filesystem metadata, we only track that
* if we own the underlying data buffer, which is not true in
* this case. Therefore, we don't ever use ABD_FLAG_META here.
*/
abd->abd_flags = ABD_FLAG_LINEAR;
abd->abd_flags |= ABD_FLAG_LINEAR;
ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
} else if (abd_is_gang(sabd)) {
size_t left = size;
abd = abd_alloc_gang_abd();
if (abd == NULL) {
abd = abd_alloc_gang();
} else {
abd->abd_flags |= ABD_FLAG_GANG;
list_create(&ABD_GANG(abd).abd_gang_chain,
sizeof (abd_t), offsetof(abd_t, abd_gang_link));
}
abd->abd_flags &= ~ABD_FLAG_OWNER;
for (abd_t *cabd = abd_gang_get_offset(sabd, &off);
cabd != NULL && left > 0;
cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) {
int csize = MIN(left, cabd->abd_size - off);
abd_t *nabd = abd_get_offset_impl(cabd, off, csize);
abd_gang_add(abd, nabd, B_FALSE);
abd_t *nabd = abd_get_offset_size(cabd, off, csize);
abd_gang_add(abd, nabd, B_TRUE);
left -= csize;
off = 0;
}
ASSERT3U(left, ==, 0);
} else {
abd = abd_get_offset_scatter(sabd, off);
abd = abd_get_offset_scatter(abd, sabd, off);
}
abd->abd_size = size;
abd->abd_parent = sabd;
zfs_refcount_create(&abd->abd_children);
(void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
return (abd);
}
/*
 * Like abd_get_offset_size(), but memory for the abd_t is provided by the
 * caller.  Using this routine can improve performance by avoiding the cost
 * of allocating memory for the abd_t struct, and updating the abd stats.
 *
 *	abd	caller-provided storage for the abd_t; it is initialized here
 *		and need not be initialized beforehand
 *	sabd	source ABD whose data the result references
 *	off	byte offset into sabd at which the result starts
 *	size	number of bytes the result covers
 *
 * Usually, the provided abd is returned, but in some circumstances (FreeBSD,
 * if sabd is scatter and size is more than 2 pages) a new abd_t may need to
 * be allocated.  Therefore callers should be careful to use the returned
 * abd_t*.  In that case the provided struct is finalized again before
 * returning, so the caller has nothing to clean up in it.  Either way the
 * returned ABD is disposed of with abd_free().
 */
abd_t *
abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size)
{
	abd_t *result;

	abd_init_struct(abd);
	result = abd_get_offset_impl(abd, sabd, off, size);

	/*
	 * If the provided struct could not be used, undo abd_init_struct()
	 * so the mutex and refcount it set up are not leaked; nothing will
	 * ever call abd_free() on the unused struct.
	 */
	if (result != abd)
		abd_fini_struct(abd);

	return (result);
}
abd_t *
abd_get_offset(abd_t *sabd, size_t off)
{
size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
VERIFY3U(size, >, 0);
return (abd_get_offset_impl(sabd, off, size));
return (abd_get_offset_impl(NULL, sabd, off, size));
}
abd_t *
abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
{
ASSERT3U(off + size, <=, sabd->abd_size);
return (abd_get_offset_impl(sabd, off, size));
return (abd_get_offset_impl(NULL, sabd, off, size));
}
/*
@ -607,10 +591,8 @@ abd_get_from_buf(void *buf, size_t size)
* own the underlying data buffer, which is not true in this case.
* Therefore, we don't ever use ABD_FLAG_META here.
*/
abd->abd_flags = ABD_FLAG_LINEAR;
abd->abd_flags |= ABD_FLAG_LINEAR;
abd->abd_size = size;
abd->abd_parent = NULL;
zfs_refcount_create(&abd->abd_children);
ABD_LINEAR_BUF(abd) = buf;
@ -790,12 +772,12 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
boolean_t abd_multi = abd_is_gang(abd);
boolean_t gang = abd_is_gang(abd);
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
while (size > 0) {
/* If we are at the end of the gang ABD we are done */
if (abd_multi && !c_abd)
if (gang && !c_abd)
break;
abd_iter_map(&aiter);

View File

@ -3065,7 +3065,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
arc_hdr_size(hdr), hdr, buf);
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
abd_put(hdr->b_l1hdr.b_pabd);
abd_free(hdr->b_l1hdr.b_pabd);
hdr->b_l1hdr.b_pabd = NULL;
buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
@ -7047,7 +7047,7 @@ arc_write_done(zio_t *zio)
ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
callback->awcb_done(zio, buf, callback->awcb_private);
abd_put(zio->io_abd);
abd_free(zio->io_abd);
kmem_free(callback, sizeof (arc_write_callback_t));
}
@ -9043,7 +9043,7 @@ l2arc_blk_fetch_done(zio_t *zio)
cb = zio->io_private;
if (cb->l2rcb_abd != NULL)
abd_put(cb->l2rcb_abd);
abd_free(cb->l2rcb_abd);
kmem_free(cb, sizeof (l2arc_read_callback_t));
}
@ -10015,7 +10015,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev)
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_SPECULATIVE, B_FALSE));
abd_put(abd);
abd_free(abd);
if (err != 0) {
ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
@ -10383,7 +10383,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev)
VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
abd_put(abd);
abd_free(abd);
if (err != 0) {
zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
@ -10472,7 +10472,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
fletcher_4_native(tmpbuf, asize, NULL,
&l2dhdr->dh_start_lbps[0].lbp_cksum);
abd_put(abd_buf->abd);
abd_free(abd_buf->abd);
/* perform the write itself */
abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));

View File

@ -4656,7 +4656,7 @@ dbuf_write_override_done(zio_t *zio)
dbuf_write_done(zio, NULL, db);
if (zio->io_abd != NULL)
abd_put(zio->io_abd);
abd_free(zio->io_abd);
}
typedef struct dbuf_remap_impl_callback_arg {

View File

@ -1600,7 +1600,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
abd_put(zio->io_abd);
abd_free(zio->io_abd);
kmem_free(dsa, sizeof (*dsa));
}

View File

@ -716,7 +716,7 @@ vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
offset = 0;
for (; x < rr->rr_cols; x++) {
abd_put(rr->rr_col[x].rc_abd);
abd_free(rr->rr_col[x].rc_abd);
if (offset == good_size) {
/* empty data column (small write) */
@ -754,11 +754,7 @@ vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
offset = 0;
for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
if (offset == good_size || x < rr->rr_bigcols)
abd_put(rr->rr_col[x].rc_abd);
else
abd_free(rr->rr_col[x].rc_abd);
rr->rr_col[x].rc_abd = abd_get_offset_size(
rr->rr_abd_copy, offset,
rr->rr_col[x].rc_size);
@ -797,7 +793,7 @@ vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
/* we drop the ereport if it ends up that the data was good */
zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
abd_put((abd_t *)good);
abd_free((abd_t *)good);
}
/*
@ -852,11 +848,7 @@ vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
offset, col->rc_size);
abd_copy(tmp, col->rc_abd, col->rc_size);
if (abd_is_gang(col->rc_abd))
abd_free(col->rc_abd);
else
abd_put(col->rc_abd);
col->rc_abd = tmp;
offset += col->rc_size;
@ -902,12 +894,12 @@ vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
rc->rc_abd = abd_get_zeros(skip_size);
} else if (rc->rc_size == parity_size) {
/* this is a "big column" */
rc->rc_abd = abd_get_offset_size(zio->io_abd,
abd_off, rc->rc_size);
rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
zio->io_abd, abd_off, rc->rc_size);
} else {
/* short data column, add a skip sector */
ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
rc->rc_abd = abd_alloc_gang_abd();
rc->rc_abd = abd_alloc_gang();
abd_gang_add(rc->rc_abd, abd_get_offset_size(
zio->io_abd, abd_off, rc->rc_size), B_TRUE);
abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size),
@ -958,13 +950,13 @@ vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
skip_off += skip_size;
} else if (rc->rc_size == parity_size) {
/* this is a "big column" */
rc->rc_abd = abd_get_offset_size(zio->io_abd,
abd_off, rc->rc_size);
rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
zio->io_abd, abd_off, rc->rc_size);
} else {
/* short data column, add a skip sector */
ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
ASSERT3U(rr->rr_nempty, !=, 0);
rc->rc_abd = abd_alloc_gang_abd();
rc->rc_abd = abd_alloc_gang();
abd_gang_add(rc->rc_abd, abd_get_offset_size(
zio->io_abd, abd_off, rc->rc_size), B_TRUE);
abd_gang_add(rc->rc_abd, abd_get_offset_size(
@ -1006,8 +998,8 @@ vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
raidz_col_t *rc = &rr->rr_col[c];
if (rc->rc_size > 0) {
rc->rc_abd = abd_get_offset_size(zio->io_abd,
abd_off, rc->rc_size);
rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
zio->io_abd, abd_off, rc->rc_size);
abd_off += rc->rc_size;
}
}
@ -1056,7 +1048,7 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
ASSERT3P(rc->rc_abd, !=, NULL);
ASSERT(!abd_is_gang(rc->rc_abd));
abd_t *read_abd = rc->rc_abd;
rc->rc_abd = abd_alloc_gang_abd();
rc->rc_abd = abd_alloc_gang();
abd_gang_add(rc->rc_abd, read_abd, B_TRUE);
abd_gang_add(rc->rc_abd, abd_get_offset_size(
rr->rr_abd_empty, skip_off, skip_size), B_TRUE);

View File

@ -1187,7 +1187,7 @@ vdev_indirect_child_io_done(zio_t *zio)
pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
mutex_exit(&pio->io_lock);
abd_put(zio->io_abd);
abd_free(zio->io_abd);
}
/*

View File

@ -789,7 +789,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
size = IO_SPAN(first, last);
ASSERT3U(size, <=, maxblocksize);
abd = abd_alloc_gang_abd();
abd = abd_alloc_gang();
if (abd == NULL)
return (NULL);

View File

@ -138,30 +138,15 @@
static void
vdev_raidz_row_free(raidz_row_t *rr)
{
int c;
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) {
abd_free(rr->rr_col[c].rc_abd);
if (rr->rr_col[c].rc_gdata != NULL) {
abd_free(rr->rr_col[c].rc_gdata);
}
if (rr->rr_col[c].rc_orig_data != NULL) {
zio_buf_free(rr->rr_col[c].rc_orig_data,
rr->rr_col[c].rc_size);
}
}
for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
if (rr->rr_col[c].rc_size != 0) {
if (abd_is_gang(rr->rr_col[c].rc_abd))
abd_free(rr->rr_col[c].rc_abd);
else
abd_put(rr->rr_col[c].rc_abd);
}
if (rr->rr_col[c].rc_orig_data != NULL) {
zio_buf_free(rr->rr_col[c].rc_orig_data,
rr->rr_col[c].rc_size);
}
if (rc->rc_size != 0)
abd_free(rc->rc_abd);
if (rc->rc_gdata != NULL)
abd_free(rc->rc_gdata);
if (rc->rc_orig_data != NULL)
zio_buf_free(rc->rc_orig_data, rc->rc_size);
}
if (rr->rr_abd_copy != NULL)
@ -249,7 +234,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
/* fill in the data columns from good_data */
offset = 0;
for (; x < rr->rr_cols; x++) {
abd_put(rr->rr_col[x].rc_abd);
abd_free(rr->rr_col[x].rc_abd);
rr->rr_col[x].rc_abd =
abd_get_offset_size((abd_t *)good_data,
@ -268,7 +253,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
offset = 0;
for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
abd_put(rr->rr_col[x].rc_abd);
abd_free(rr->rr_col[x].rc_abd);
rr->rr_col[x].rc_abd = abd_get_offset_size(
rr->rr_abd_copy, offset,
rr->rr_col[x].rc_size);
@ -291,7 +276,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
/* we drop the ereport if it ends up that the data was good */
zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
abd_put((abd_t *)good);
abd_free((abd_t *)good);
}
/*
@ -344,7 +329,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
abd_copy(tmp, col->rc_abd, col->rc_size);
abd_put(col->rc_abd);
abd_free(col->rc_abd);
col->rc_abd = tmp;
offset += col->rc_size;
@ -379,7 +364,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
/* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << ashift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
uint64_t off = 0;
raidz_map_t *rm =
kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
@ -477,13 +461,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
rr->rr_col[c].rc_abd =
abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
rr->rr_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
rr->rr_col[c].rc_size);
off = rr->rr_col[c].rc_size;
for (c = c + 1; c < acols; c++) {
for (uint64_t off = 0; c < acols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
rc->rc_abd = abd_get_offset_size(zio->io_abd, off, rc->rc_size);
rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
zio->io_abd, off, rc->rc_size);
off += rc->rc_size;
}

View File

@ -1230,7 +1230,7 @@ zil_lwb_write_done(zio_t *zio)
ASSERT(!BP_IS_HOLE(zio->io_bp));
ASSERT(BP_GET_FILL(zio->io_bp) == 0);
abd_put(zio->io_abd);
abd_free(zio->io_abd);
mutex_enter(&zilog->zl_lock);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);

View File

@ -2450,7 +2450,7 @@ zio_resume_wait(spa_t *spa)
static void
zio_gang_issue_func_done(zio_t *zio)
{
abd_put(zio->io_abd);
abd_free(zio->io_abd);
}
static zio_t *
@ -2494,7 +2494,7 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
buf, BP_GET_PSIZE(bp));
abd_put(buf);
abd_free(buf);
}
/*
* If we are here to damage data for testing purposes,
@ -2622,7 +2622,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
abd_put(zio->io_abd);
abd_free(zio->io_abd);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@ -2746,7 +2746,7 @@ zio_write_gang_done(zio_t *zio)
* check for it here as it is cleared in zio_ready.
*/
if (zio->io_abd != NULL)
abd_put(zio->io_abd);
abd_free(zio->io_abd);
}
static zio_t *