Combine OS-independent ABD Code into Common Source File

Reorganizing ABD code base so OS-independent ABD code has been placed
into a common abd.c file. OS-dependent ABD code has been left in each
OS's ABD source files, and these source files have been renamed to
abd_os.

The OS-independent ABD code is now under:
module/zfs/abd.c
With the OS-dependent code in:
module/os/linux/zfs/abd_os.c
module/os/freebsd/zfs/abd_os.c

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Brian Atkinson <batkinson@lanl.gov>
Closes #10293
Author: Brian Atkinson
Date:   2020-05-10 13:23:52 -06:00, committed by GitHub
commit fc551d7efb (parent bd95f00d4b)
12 changed files with 1605 additions and 2074 deletions
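
The split leaves a small OS-facing interface (declared in the new
include/sys/abd_impl.h) that the common code calls into. As a rough sketch of
the resulting layering, condensed from the new abd_alloc() further down in
this diff (not a verbatim excerpt):

    abd_t *
    abd_alloc(size_t size, boolean_t is_metadata)
    {
        if (!zfs_abd_scatter_enabled || abd_size_alloc_linear(size))
            return (abd_alloc_linear(size, is_metadata));

        abd_t *abd = abd_alloc_struct(size);  /* implemented in each abd_os.c */
        abd->abd_flags = ABD_FLAG_OWNER;
        abd->abd_u.abd_scatter.abd_offset = 0;
        abd_alloc_chunks(abd, size);          /* implemented in each abd_os.c */

        /* ... metadata flag, size, parent, refcount setup elided ... */

        abd_update_scatter_stats(abd, ABDSTAT_INCR);  /* abd_os.c */
        return (abd);
    }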


@@ -2,6 +2,7 @@ SUBDIRS = fm fs crypto lua sysevent
 COMMON_H = \
 	$(top_srcdir)/include/sys/abd.h \
+	$(top_srcdir)/include/sys/abd_impl.h \
 	$(top_srcdir)/include/sys/aggsum.h \
 	$(top_srcdir)/include/sys/arc.h \
 	$(top_srcdir)/include/sys/arc_impl.h \


@@ -35,56 +35,14 @@
 extern "C" {
 #endif
 
-typedef enum abd_flags {
-	ABD_FLAG_LINEAR = 1 << 0,	/* is buffer linear (or scattered)? */
-	ABD_FLAG_OWNER = 1 << 1,	/* does it own its data buffers? */
-	ABD_FLAG_META = 1 << 2,		/* does this represent FS metadata? */
-	ABD_FLAG_MULTI_ZONE = 1 << 3,	/* pages split over memory zones */
-	ABD_FLAG_MULTI_CHUNK = 1 << 4,	/* pages split over multiple chunks */
-	ABD_FLAG_LINEAR_PAGE = 1 << 5,	/* linear but allocd from page */
-} abd_flags_t;
-
-typedef struct abd {
-	abd_flags_t	abd_flags;
-	uint_t		abd_size;	/* excludes scattered abd_offset */
-	struct abd	*abd_parent;
-	zfs_refcount_t	abd_children;
-	union {
-		struct abd_scatter {
-			uint_t		abd_offset;
-#if defined(__FreeBSD__) && defined(_KERNEL)
-			uint_t	abd_chunk_size;
-			void	*abd_chunks[];
-#else
-			uint_t		abd_nents;
-			struct scatterlist *abd_sgl;
-#endif
-		} abd_scatter;
-		struct abd_linear {
-			void		*abd_buf;
-			struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
-		} abd_linear;
-	} abd_u;
-} abd_t;
+struct abd; /* forward declaration */
+typedef struct abd abd_t;
 
 typedef int abd_iter_func_t(void *buf, size_t len, void *private);
 typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *private);
 
 extern int zfs_abd_scatter_enabled;
 
-static inline boolean_t
-abd_is_linear(abd_t *abd)
-{
-	return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
-}
-
-static inline boolean_t
-abd_is_linear_page(abd_t *abd)
-{
-	return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ?
-	    B_TRUE : B_FALSE);
-}
-
 /*
  * Allocations and deallocations
  */
@@ -124,12 +82,8 @@ void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
 int abd_cmp(abd_t *, abd_t *);
 int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
 void abd_zero_off(abd_t *, size_t, size_t);
-
-#if defined(_KERNEL)
-unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int,
-    size_t);
-unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
-#endif
+void abd_verify(abd_t *);
+uint_t abd_get_size(abd_t *);
 
 void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
     ssize_t csize, ssize_t dsize, const unsigned parity,
@@ -174,13 +128,29 @@ abd_zero(abd_t *abd, size_t size)
 	abd_zero_off(abd, 0, size);
 }
 
+/*
+ * ABD type check functions
+ */
+boolean_t abd_is_linear(abd_t *);
+boolean_t abd_is_linear_page(abd_t *);
+
 /*
  * Module lifecycle
+ * Defined in each specific OS's abd_os.c
  */
 void abd_init(void);
 void abd_fini(void);
 
+/*
+ * Linux ABD bio functions
+ */
+#if defined(__linux__) && defined(_KERNEL)
+unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int,
+    size_t);
+unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
+#endif
+
 #ifdef __cplusplus
 }
 #endif

include/sys/abd_impl.h (new file)

@@ -0,0 +1,126 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#ifndef _ABD_IMPL_H
#define _ABD_IMPL_H
#include <sys/abd.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef enum abd_flags {
ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */
ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */
ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */
ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */
ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */
ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */
} abd_flags_t;
typedef enum abd_stats_op {
ABDSTAT_INCR, /* Increase abdstat values */
ABDSTAT_DECR /* Decrease abdstat values */
} abd_stats_op_t;
struct abd {
abd_flags_t abd_flags;
uint_t abd_size; /* excludes scattered abd_offset */
struct abd *abd_parent;
zfs_refcount_t abd_children;
union {
struct abd_scatter {
uint_t abd_offset;
#if defined(__FreeBSD__) && defined(_KERNEL)
uint_t abd_chunk_size;
void *abd_chunks[];
#else
uint_t abd_nents;
struct scatterlist *abd_sgl;
#endif
} abd_scatter;
struct abd_linear {
void *abd_buf;
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
} abd_linear;
} abd_u;
};
struct scatterlist; /* forward declaration */
struct abd_iter {
/* public interface */
void *iter_mapaddr; /* addr corresponding to iter_pos */
size_t iter_mapsize; /* length of data valid at mapaddr */
/* private */
abd_t *iter_abd; /* ABD being iterated through */
size_t iter_pos;
size_t iter_offset; /* offset in current sg/abd_buf, */
/* abd_offset included */
struct scatterlist *iter_sg; /* current sg */
};
/*
* OS specific functions
*/
abd_t *abd_alloc_struct(size_t);
abd_t *abd_get_offset_scatter(abd_t *, size_t);
void abd_free_struct(abd_t *);
void abd_alloc_chunks(abd_t *, size_t);
void abd_free_chunks(abd_t *);
boolean_t abd_size_alloc_linear(size_t);
void abd_update_scatter_stats(abd_t *, abd_stats_op_t);
void abd_update_linear_stats(abd_t *, abd_stats_op_t);
void abd_verify_scatter(abd_t *);
void abd_free_linear_page(abd_t *);
void abd_enter_critical(unsigned long);
void abd_exit_critical(unsigned long);
/* OS specific abd_iter functions */
void abd_iter_init(struct abd_iter *, abd_t *);
boolean_t abd_iter_at_end(struct abd_iter *);
void abd_iter_advance(struct abd_iter *, size_t);
void abd_iter_map(struct abd_iter *);
void abd_iter_unmap(struct abd_iter *);
/*
* Helper macros
*/
#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
#define ABDSTAT_INCR(stat, val) \
atomic_add_64(&abd_stats.stat.value.ui64, (val))
#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter)
#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf)
#ifdef __cplusplus
}
#endif
#endif /* _ABD_IMPL_H */
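
One way to read the iterator half of this interface: the common code in abd.c
drives a map/use/unmap/advance loop over whichever representation the OS
built. A minimal sketch of that loop (hypothetical helper; the real
abd_iterate_func() in abd.c adds child-refcount tracking on top of this):

    static int
    abd_iterate_sketch(abd_t *abd, size_t size, abd_iter_func_t *func,
        void *private)
    {
        struct abd_iter aiter;
        int ret = 0;

        abd_verify(abd);
        abd_iter_init(&aiter, abd);
        while (size > 0) {
            abd_iter_map(&aiter);  /* fills iter_mapaddr/iter_mapsize */
            size_t len = MIN(aiter.iter_mapsize, size);
            ret = func(aiter.iter_mapaddr, len, private);
            abd_iter_unmap(&aiter);
            if (ret != 0)
                break;
            size -= len;
            abd_iter_advance(&aiter, len);
        }
        return (ret);
    }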


@@ -39,6 +39,7 @@ KERNEL_C = \
 	zpool_prop.c \
 	zprop_common.c \
 	abd.c \
+	abd_os.c \
 	aggsum.c \
 	arc.c \
 	arc_os.c \


@@ -127,7 +127,7 @@ SRCS+=	spl_atomic.c
 .endif
 
 #os/freebsd/zfs
-SRCS+=	abd.c \
+SRCS+=	abd_os.c \
 	crypto_os.c \
 	dmu_os.c \
 	hkdf.c \
@@ -169,7 +169,8 @@ SRCS+=	zfeature_common.c \
 	zprop_common.c
 
 #zfs
-SRCS+=	aggsum.c \
+SRCS+=	abd.c \
+	aggsum.c \
 	arc.c \
 	arc_os.c \
 	blkptr.c \


@@ -0,0 +1,433 @@
/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
/*
* See abd.c for a general overview of the ARC buffered data (ABD).
*
* Using a large proportion of scattered ABDs decreases ARC fragmentation since
* when we are at the limit of allocatable space, using equal-size chunks will
* allow us to quickly reclaim enough space for a new large allocation (assuming
* it is also scattered).
*
* ABDs are allocated scattered by default unless the caller uses
* abd_alloc_linear() or zfs_abd_scatter_enabled is disabled.
*/
#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
typedef struct abd_stats {
kstat_named_t abdstat_struct_size;
kstat_named_t abdstat_scatter_cnt;
kstat_named_t abdstat_scatter_data_size;
kstat_named_t abdstat_scatter_chunk_waste;
kstat_named_t abdstat_linear_cnt;
kstat_named_t abdstat_linear_data_size;
} abd_stats_t;
static abd_stats_t abd_stats = {
/* Amount of memory occupied by all of the abd_t struct allocations */
{ "struct_size", KSTAT_DATA_UINT64 },
/*
* The number of scatter ABDs which are currently allocated, excluding
* ABDs which don't own their data (for instance the ones which were
* allocated through abd_get_offset()).
*/
{ "scatter_cnt", KSTAT_DATA_UINT64 },
/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
{ "scatter_data_size", KSTAT_DATA_UINT64 },
/*
* The amount of space wasted at the end of the last chunk across all
* scatter ABDs tracked by scatter_cnt.
*/
{ "scatter_chunk_waste", KSTAT_DATA_UINT64 },
/*
* The number of linear ABDs which are currently allocated, excluding
* ABDs which don't own their data (for instance the ones which were
* allocated through abd_get_offset() and abd_get_from_buf()). If an
* ABD takes ownership of its buf then it will become tracked.
*/
{ "linear_cnt", KSTAT_DATA_UINT64 },
/* Amount of data stored in all linear ABDs tracked by linear_cnt */
{ "linear_data_size", KSTAT_DATA_UINT64 },
};
/*
* The size of the chunks ABD allocates. Because the sizes allocated from the
* kmem_cache can't change, this tunable can only be modified at boot. Changing
* it at runtime would cause ABD iteration to work incorrectly for ABDs which
* were allocated with the old size, so a safeguard has been put in place which
* will cause the machine to panic if you change it and try to access the data
* within a scattered ABD.
*/
size_t zfs_abd_chunk_size = 4096;
#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
&zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
#endif
kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;
static void
abd_free_chunk(void *c)
{
kmem_cache_free(abd_chunk_cache, c);
}
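/*
 * For example, with the default 4096-byte zfs_abd_chunk_size a 6000-byte
 * request rounds up to 8192 bytes, i.e. two chunks.
 */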
static size_t
abd_chunkcnt_for_bytes(size_t size)
{
return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
}
static inline size_t
abd_scatter_chunkcnt(abd_t *abd)
{
ASSERT(!abd_is_linear(abd));
return (abd_chunkcnt_for_bytes(
ABD_SCATTER(abd).abd_offset + abd->abd_size));
}
boolean_t
abd_size_alloc_linear(size_t size)
{
return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE);
}
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
size_t n = abd_scatter_chunkcnt(abd);
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
if (op == ABDSTAT_INCR) {
ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
n * zfs_abd_chunk_size - abd->abd_size);
} else {
ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
abd->abd_size - n * zfs_abd_chunk_size);
}
}
void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
if (op == ABDSTAT_INCR) {
ABDSTAT_BUMP(abdstat_linear_cnt);
ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
} else {
ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
}
}
void
abd_verify_scatter(abd_t *abd)
{
/*
* There are no scatter linear pages in FreeBSD, so it is an
* error if the ABD has been marked as a linear page.
*/
VERIFY(!abd_is_linear_page(abd));
ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
zfs_abd_chunk_size);
size_t n = abd_scatter_chunkcnt(abd);
for (int i = 0; i < n; i++) {
ASSERT3P(
ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
}
}
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
size_t n = abd_chunkcnt_for_bytes(size);
for (int i = 0; i < n; i++) {
void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
ASSERT3P(c, !=, NULL);
ABD_SCATTER(abd).abd_chunks[i] = c;
}
ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
}
void
abd_free_chunks(abd_t *abd)
{
size_t n = abd_scatter_chunkcnt(abd);
for (int i = 0; i < n; i++) {
abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]);
}
}
abd_t *
abd_alloc_struct(size_t size)
{
size_t chunkcnt = abd_chunkcnt_for_bytes(size);
size_t abd_size = offsetof(abd_t,
abd_u.abd_scatter.abd_chunks[chunkcnt]);
abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
ASSERT3P(abd, !=, NULL);
ABDSTAT_INCR(abdstat_struct_size, abd_size);
return (abd);
}
void
abd_free_struct(abd_t *abd)
{
size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
kmem_free(abd, size);
ABDSTAT_INCR(abdstat_struct_size, -size);
}
void
abd_init(void)
{
abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG);
abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
if (abd_ksp != NULL) {
abd_ksp->ks_data = &abd_stats;
kstat_install(abd_ksp);
}
}
void
abd_fini(void)
{
if (abd_ksp != NULL) {
kstat_delete(abd_ksp);
abd_ksp = NULL;
}
kmem_cache_destroy(abd_chunk_cache);
abd_chunk_cache = NULL;
}
void
abd_free_linear_page(abd_t *abd)
{
/*
* FreeBSD does not have scatter linear pages,
* so calling this function is an error.
*/
VERIFY(0);
}
/*
* If we're going to use this ABD for doing I/O using the block layer, the
* consumer of the ABD data doesn't care if it's scattered or not, and we don't
* plan to store this ABD in memory for a long period of time, we should
* allocate the ABD type that requires the least data copying to do the I/O.
*
* Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
* using a scatter/gather list we should switch to that and replace this call
* with vanilla abd_alloc().
*/
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
return (abd_alloc_linear(size, is_metadata));
}
/*
* This is just a helper function for abd_get_offset_scatter() to allocate a
* scatter ABD using the calculated chunkcnt based on the offset within the
* parent ABD.
*/
static abd_t *
abd_alloc_scatter_offset_chunkcnt(size_t chunkcnt)
{
size_t abd_size = offsetof(abd_t,
abd_u.abd_scatter.abd_chunks[chunkcnt]);
abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
ASSERT3P(abd, !=, NULL);
ABDSTAT_INCR(abdstat_struct_size, abd_size);
return (abd);
}
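/*
 * Worked example for the offset math below: with 4096-byte chunks, a parent
 * abd_offset of 0 and off = 6000, new_offset is 6000, so one whole chunk
 * (6000 / 4096 == 1) is skipped and the child's abd_offset becomes
 * 6000 % 4096 == 1904 within its first chunk.
 */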
abd_t *
abd_get_offset_scatter(abd_t *sabd, size_t off)
{
abd_t *abd = NULL;
abd_verify(sabd);
ASSERT3U(off, <=, sabd->abd_size);
size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
(new_offset / zfs_abd_chunk_size);
abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt);
/*
* Even if this buf is filesystem metadata, we only track that
* if we own the underlying data buffer, which is not true in
* this case. Therefore, we don't ever use ABD_FLAG_META here.
*/
abd->abd_flags = 0;
ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size;
ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
/* Copy the scatterlist starting at the correct offset */
(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
&ABD_SCATTER(sabd).abd_chunks[new_offset /
zfs_abd_chunk_size],
chunkcnt * sizeof (void *));
return (abd);
}
static inline size_t
abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
{
ASSERT(!abd_is_linear(aiter->iter_abd));
return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
aiter->iter_pos) % zfs_abd_chunk_size);
}
static inline size_t
abd_iter_scatter_chunk_index(struct abd_iter *aiter)
{
ASSERT(!abd_is_linear(aiter->iter_abd));
return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
aiter->iter_pos) / zfs_abd_chunk_size);
}
/*
* Initialize the abd_iter.
*/
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
abd_verify(abd);
aiter->iter_abd = abd;
aiter->iter_pos = 0;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
/*
* This is just a helper function to see if we have exhausted the
* abd_iter and reached the end.
*/
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}
/*
* Advance the iterator by a certain amount. Cannot be called when a chunk is
* in use. This can be safely called when the aiter has already been
* exhausted, in which case this does nothing.
*/
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
/* There's nothing left to advance to, so do nothing */
if (abd_iter_at_end(aiter))
return;
aiter->iter_pos += amount;
}
/*
* Map the current chunk into aiter. This can be safely called when the aiter
* has already been exhausted, in which case this does nothing.
*/
void
abd_iter_map(struct abd_iter *aiter)
{
void *paddr;
size_t offset = 0;
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
/* Panic if someone has changed zfs_abd_chunk_size */
IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
ABD_SCATTER(aiter->iter_abd).abd_chunk_size);
/* There's nothing left to iterate over, so do nothing */
if (abd_iter_at_end(aiter))
return;
if (abd_is_linear(aiter->iter_abd)) {
offset = aiter->iter_pos;
aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
paddr = ABD_LINEAR_BUF(aiter->iter_abd);
} else {
size_t index = abd_iter_scatter_chunk_index(aiter);
offset = abd_iter_scatter_chunk_offset(aiter);
aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
aiter->iter_abd->abd_size - aiter->iter_pos);
paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index];
}
aiter->iter_mapaddr = (char *)paddr + offset;
}
/*
* Unmap the current chunk from aiter. This can be safely called when the aiter
* has already been exhausted, in which case this does nothing.
*/
void
abd_iter_unmap(struct abd_iter *aiter)
{
/* There's nothing left to unmap, so do nothing */
if (abd_iter_at_end(aiter))
return;
ASSERT3P(aiter->iter_mapaddr, !=, NULL);
ASSERT3U(aiter->iter_mapsize, >, 0);
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
void
abd_enter_critical(unsigned long flags)
{
critical_enter();
}
void
abd_exit_critical(unsigned long flags)
{
critical_exit();
}


@@ -7,7 +7,7 @@ ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
 ccflags-y += -I@abs_top_srcdir@/module/os/linux/zfs
 
-$(MODULE)-objs += ../os/linux/zfs/abd.o
+$(MODULE)-objs += ../os/linux/zfs/abd_os.o
 $(MODULE)-objs += ../os/linux/zfs/arc_os.o
 $(MODULE)-objs += ../os/linux/zfs/mmp_os.o
 $(MODULE)-objs += ../os/linux/zfs/policy.o

(File diff suppressed because it is too large.)


@@ -0,0 +1,891 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
*/
/*
* See abd.c for a general overview of the ARC buffered data (ABD).
*
* Linear buffers act exactly like normal buffers and are always mapped into the
* kernel's virtual memory space, while scattered ABD data chunks are allocated
* as physical pages and then mapped in only while they are actually being
* accessed through one of the abd_* library functions. Using scattered ABDs
* provides several benefits:
*
* (1) They avoid use of kmem_*, preventing performance problems where running
* kmem_reap on very large memory systems never finishes and causes
* constant TLB shootdowns.
*
* (2) Fragmentation is less of an issue since when we are at the limit of
* allocatable space, we won't have to search around for a long free
* hole in the VA space for large ARC allocations. Each chunk is mapped in
* individually, so even if we are using HIGHMEM (see next point) we
* wouldn't need to worry about finding a contiguous address range.
*
* (3) If we are not using HIGHMEM, then all physical memory is always
* mapped into the kernel's address space, so we also avoid the map /
* unmap costs on each ABD access.
*
* If we are not using HIGHMEM, scattered buffers which have only one chunk
* can be treated as linear buffers, because they are contiguous in the
* kernel's virtual address space. See abd_alloc_chunks() for details.
*/
#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/scatterlist.h>
#else
#define MAX_ORDER 1
#endif
typedef struct abd_stats {
kstat_named_t abdstat_struct_size;
kstat_named_t abdstat_linear_cnt;
kstat_named_t abdstat_linear_data_size;
kstat_named_t abdstat_scatter_cnt;
kstat_named_t abdstat_scatter_data_size;
kstat_named_t abdstat_scatter_chunk_waste;
kstat_named_t abdstat_scatter_orders[MAX_ORDER];
kstat_named_t abdstat_scatter_page_multi_chunk;
kstat_named_t abdstat_scatter_page_multi_zone;
kstat_named_t abdstat_scatter_page_alloc_retry;
kstat_named_t abdstat_scatter_sg_table_retry;
} abd_stats_t;
static abd_stats_t abd_stats = {
/* Amount of memory occupied by all of the abd_t struct allocations */
{ "struct_size", KSTAT_DATA_UINT64 },
/*
* The number of linear ABDs which are currently allocated, excluding
* ABDs which don't own their data (for instance the ones which were
* allocated through abd_get_offset() and abd_get_from_buf()). If an
* ABD takes ownership of its buf then it will become tracked.
*/
{ "linear_cnt", KSTAT_DATA_UINT64 },
/* Amount of data stored in all linear ABDs tracked by linear_cnt */
{ "linear_data_size", KSTAT_DATA_UINT64 },
/*
* The number of scatter ABDs which are currently allocated, excluding
* ABDs which don't own their data (for instance the ones which were
* allocated through abd_get_offset()).
*/
{ "scatter_cnt", KSTAT_DATA_UINT64 },
/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
{ "scatter_data_size", KSTAT_DATA_UINT64 },
/*
* The amount of space wasted at the end of the last chunk across all
* scatter ABDs tracked by scatter_cnt.
*/
{ "scatter_chunk_waste", KSTAT_DATA_UINT64 },
/*
* The number of compound allocations of a given order. These
* allocations are spread over all currently allocated ABDs, and
* act as a measure of memory fragmentation.
*/
{ { "scatter_order_N", KSTAT_DATA_UINT64 } },
/*
* The number of scatter ABDs which contain multiple chunks.
* ABDs are preferentially allocated from the minimum number of
* contiguous multi-page chunks, a single chunk is optimal.
*/
{ "scatter_page_multi_chunk", KSTAT_DATA_UINT64 },
/*
* The number of scatter ABDs which are split across memory zones.
* ABDs are preferentially allocated using pages from a single zone.
*/
{ "scatter_page_multi_zone", KSTAT_DATA_UINT64 },
/*
* The total number of retries encountered when attempting to
* allocate the pages to populate the scatter ABD.
*/
{ "scatter_page_alloc_retry", KSTAT_DATA_UINT64 },
/*
* The total number of retries encountered when attempting to
* allocate the sg table for an ABD.
*/
{ "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
};
#define abd_for_each_sg(abd, sg, n, i) \
for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
/*
* zfs_abd_scatter_min_size is the minimum allocation size to use scatter
* ABD's. Smaller allocations will use linear ABD's which uses
* zio_[data_]buf_alloc().
*
* Scatter ABD's use at least one page each, so sub-page allocations waste
* some space when allocated as scatter (e.g. 2KB scatter allocation wastes
* half of each page). Using linear ABD's for small allocations means that
* they will be put on slabs which contain many allocations. This can
* improve memory efficiency, but it also makes it much harder for ARC
* evictions to actually free pages, because all the buffers on one slab need
* to be freed in order for the slab (and underlying pages) to be freed.
* Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
* possible for them to actually waste more memory than scatter (one page per
* buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
*
* Spill blocks are typically 512B and are heavily used on systems running
* selinux with the default dnode size and the `xattr=sa` property set.
*
* By default we use linear allocations for 512B and 1KB, and scatter
* allocations for larger (1.5KB and up).
*/
int zfs_abd_scatter_min_size = 512 * 3;
static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;
static size_t
abd_chunkcnt_for_bytes(size_t size)
{
return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
}
abd_t *
abd_alloc_struct(size_t size)
{
/*
* In Linux we do not use the size passed in during ABD
* allocation, so we just ignore it.
*/
abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
ASSERT3P(abd, !=, NULL);
ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
return (abd);
}
void
abd_free_struct(abd_t *abd)
{
kmem_cache_free(abd_cache, abd);
ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}
#ifdef _KERNEL
/*
* Mark zfs data pages so they can be excluded from kernel crash dumps
*/
#ifdef _LP64
#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E
static inline void
abd_mark_zfs_page(struct page *page)
{
get_page(page);
SetPagePrivate(page);
set_page_private(page, ABD_FILE_CACHE_PAGE);
}
static inline void
abd_unmark_zfs_page(struct page *page)
{
set_page_private(page, 0UL);
ClearPagePrivate(page);
put_page(page);
}
#else
#define abd_mark_zfs_page(page)
#define abd_unmark_zfs_page(page)
#endif /* _LP64 */
#ifndef CONFIG_HIGHMEM
#ifndef __GFP_RECLAIM
#define __GFP_RECLAIM __GFP_WAIT
#endif
/*
* The goal is to minimize fragmentation by preferentially populating ABDs
* with higher order compound pages from a single zone. Allocation size is
* progressively decreased until it can be satisfied without performing
* reclaim or compaction. When necessary this function will degenerate to
* allocating individual pages and allowing reclaim to satisfy allocations.
*/
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
struct list_head pages;
struct sg_table table;
struct scatterlist *sg;
struct page *page, *tmp_page = NULL;
gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
int nr_pages = abd_chunkcnt_for_bytes(size);
int chunks = 0, zones = 0;
size_t remaining_size;
int nid = NUMA_NO_NODE;
int alloc_pages = 0;
INIT_LIST_HEAD(&pages);
while (alloc_pages < nr_pages) {
unsigned chunk_pages;
int order;
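/*
 * For example, with 10 pages still to allocate,
 * highbit64(10) - 1 == 3, so an order-3 (8-page)
 * compound page is attempted first.
 */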
order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
chunk_pages = (1U << order);
page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
if (page == NULL) {
if (order == 0) {
ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
schedule_timeout_interruptible(1);
} else {
max_order = MAX(0, order - 1);
}
continue;
}
list_add_tail(&page->lru, &pages);
if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
zones++;
nid = page_to_nid(page);
ABDSTAT_BUMP(abdstat_scatter_orders[order]);
chunks++;
alloc_pages += chunk_pages;
}
ASSERT3S(alloc_pages, ==, nr_pages);
while (sg_alloc_table(&table, chunks, gfp)) {
ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
schedule_timeout_interruptible(1);
}
sg = table.sgl;
remaining_size = size;
list_for_each_entry_safe(page, tmp_page, &pages, lru) {
size_t sg_size = MIN(PAGESIZE << compound_order(page),
remaining_size);
sg_set_page(sg, page, sg_size, 0);
abd_mark_zfs_page(page);
remaining_size -= sg_size;
sg = sg_next(sg);
list_del(&page->lru);
}
/*
* These conditions ensure that a possible transformation to a linear
* ABD would be valid.
*/
ASSERT(!PageHighMem(sg_page(table.sgl)));
ASSERT0(ABD_SCATTER(abd).abd_offset);
if (table.nents == 1) {
/*
* Since there is only one entry, this ABD can be represented
* as a linear buffer. All single-page (4K) ABD's can be
* represented this way. Some multi-page ABD's can also be
* represented this way, if we were able to allocate a single
* "chunk" (higher-order "page" which represents a power-of-2
* series of physically-contiguous pages). This is often the
* case for 2-page (8K) ABD's.
*
* Representing a single-entry scatter ABD as a linear ABD
* has the performance advantage of avoiding the copy (and
* allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
* A performance increase of around 5% has been observed for
* ARC-cached reads (of small blocks which can take advantage
* of this).
*
* Note that this optimization is only possible because the
* pages are always mapped into the kernel's address space.
* This is not the case for highmem pages, so the
* optimization can not be made there.
*/
abd->abd_flags |= ABD_FLAG_LINEAR;
abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
abd->abd_u.abd_linear.abd_sgl = table.sgl;
ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
} else if (table.nents > 1) {
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
if (zones) {
ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
}
ABD_SCATTER(abd).abd_sgl = table.sgl;
ABD_SCATTER(abd).abd_nents = table.nents;
}
}
#else
/*
* Allocate N individual pages to construct a scatter ABD. This function
* makes no attempt to request contiguous pages and requires the minimal
* number of kernel interfaces. It's designed for maximum compatibility.
*/
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
struct scatterlist *sg = NULL;
struct sg_table table;
struct page *page;
gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
int nr_pages = abd_chunkcnt_for_bytes(size);
int i = 0;
while (sg_alloc_table(&table, nr_pages, gfp)) {
ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
schedule_timeout_interruptible(1);
}
ASSERT3U(table.nents, ==, nr_pages);
ABD_SCATTER(abd).abd_sgl = table.sgl;
ABD_SCATTER(abd).abd_nents = nr_pages;
abd_for_each_sg(abd, sg, nr_pages, i) {
while ((page = __page_cache_alloc(gfp)) == NULL) {
ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
schedule_timeout_interruptible(1);
}
ABDSTAT_BUMP(abdstat_scatter_orders[0]);
sg_set_page(sg, page, PAGESIZE, 0);
abd_mark_zfs_page(page);
}
if (nr_pages > 1) {
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
}
}
#endif /* !CONFIG_HIGHMEM */
/*
* This must be called if any of the sg_table allocation functions
* are called.
*/
static void
abd_free_sg_table(abd_t *abd)
{
struct sg_table table;
table.sgl = ABD_SCATTER(abd).abd_sgl;
table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
sg_free_table(&table);
}
void
abd_free_chunks(abd_t *abd)
{
struct scatterlist *sg = NULL;
struct page *page;
int nr_pages = ABD_SCATTER(abd).abd_nents;
int order, i = 0;
if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
abd_for_each_sg(abd, sg, nr_pages, i) {
page = sg_page(sg);
abd_unmark_zfs_page(page);
order = compound_order(page);
__free_pages(page, order);
ASSERT3U(sg->length, <=, PAGE_SIZE << order);
ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
}
abd_free_sg_table(abd);
}
#else /* _KERNEL */
#ifndef PAGE_SHIFT
#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif
struct page;
#define zfs_kmap_atomic(chunk, km) ((void *)chunk)
#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
#define local_irq_save(flags) do { (void)(flags); } while (0)
#define local_irq_restore(flags) do { (void)(flags); } while (0)
#define nth_page(pg, i) \
((struct page *)((void *)(pg) + (i) * PAGESIZE))
struct scatterlist {
struct page *page;
int length;
int end;
};
static void
sg_init_table(struct scatterlist *sg, int nr)
{
memset(sg, 0, nr * sizeof (struct scatterlist));
sg[nr - 1].end = 1;
}
/*
* This must be called if any of the sg_table allocation functions
* are called.
*/
static void
abd_free_sg_table(abd_t *abd)
{
int nents = ABD_SCATTER(abd).abd_nents;
vmem_free(ABD_SCATTER(abd).abd_sgl,
nents * sizeof (struct scatterlist));
}
#define for_each_sg(sgl, sg, nr, i) \
for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
unsigned int offset)
{
/* currently we don't use offset */
ASSERT(offset == 0);
sg->page = page;
sg->length = len;
}
static inline struct page *
sg_page(struct scatterlist *sg)
{
return (sg->page);
}
static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
if (sg->end)
return (NULL);
return (sg + 1);
}
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
unsigned nr_pages = abd_chunkcnt_for_bytes(size);
struct scatterlist *sg;
int i;
ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
sizeof (struct scatterlist), KM_SLEEP);
sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
abd_for_each_sg(abd, sg, nr_pages, i) {
struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
sg_set_page(sg, p, PAGESIZE, 0);
}
ABD_SCATTER(abd).abd_nents = nr_pages;
}
void
abd_free_chunks(abd_t *abd)
{
int i, n = ABD_SCATTER(abd).abd_nents;
struct scatterlist *sg;
abd_for_each_sg(abd, sg, n, i) {
for (int j = 0; j < sg->length; j += PAGESIZE) {
struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
umem_free(p, PAGESIZE);
}
}
abd_free_sg_table(abd);
}
#endif /* _KERNEL */
boolean_t
abd_size_alloc_linear(size_t size)
{
return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE);
}
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
if (op == ABDSTAT_INCR) {
ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size);
} else {
ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
(int)abd->abd_size
-(int)P2ROUNDUP(abd->abd_size, PAGESIZE));
}
}
void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
if (op == ABDSTAT_INCR) {
ABDSTAT_BUMP(abdstat_linear_cnt);
ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
} else {
ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
}
}
void
abd_verify_scatter(abd_t *abd)
{
size_t n;
int i = 0;
struct scatterlist *sg = NULL;
ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
ABD_SCATTER(abd).abd_sgl->length);
n = ABD_SCATTER(abd).abd_nents;
abd_for_each_sg(abd, sg, n, i) {
ASSERT3P(sg_page(sg), !=, NULL);
}
}
void
abd_init(void)
{
int i;
abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
0, NULL, NULL, NULL, NULL, NULL, 0);
abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
if (abd_ksp != NULL) {
for (i = 0; i < MAX_ORDER; i++) {
snprintf(abd_stats.abdstat_scatter_orders[i].name,
KSTAT_STRLEN, "scatter_order_%d", i);
abd_stats.abdstat_scatter_orders[i].data_type =
KSTAT_DATA_UINT64;
}
abd_ksp->ks_data = &abd_stats;
kstat_install(abd_ksp);
}
}
void
abd_fini(void)
{
if (abd_ksp != NULL) {
kstat_delete(abd_ksp);
abd_ksp = NULL;
}
if (abd_cache) {
kmem_cache_destroy(abd_cache);
abd_cache = NULL;
}
}
void
abd_free_linear_page(abd_t *abd)
{
/* Transform it back into a scatter ABD for freeing */
struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
abd->abd_flags &= ~ABD_FLAG_LINEAR;
abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
ABD_SCATTER(abd).abd_nents = 1;
ABD_SCATTER(abd).abd_offset = 0;
ABD_SCATTER(abd).abd_sgl = sg;
abd_free_chunks(abd);
zfs_refcount_destroy(&abd->abd_children);
abd_update_scatter_stats(abd, ABDSTAT_DECR);
abd_free_struct(abd);
}
/*
* If we're going to use this ABD for doing I/O using the block layer, the
* consumer of the ABD data doesn't care if it's scattered or not, and we don't
* plan to store this ABD in memory for a long period of time, we should
* allocate the ABD type that requires the least data copying to do the I/O.
*
* On Linux the optimal thing to do would be to use abd_get_offset() and
* construct a new ABD which shares the original pages thereby eliminating
* the copy. But for the moment a new linear ABD is allocated until this
* performance optimization can be implemented.
*/
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
return (abd_alloc(size, is_metadata));
}
abd_t *
abd_get_offset_scatter(abd_t *sabd, size_t off)
{
abd_t *abd = NULL;
int i = 0;
struct scatterlist *sg = NULL;
abd_verify(sabd);
ASSERT3U(off, <=, sabd->abd_size);
size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
abd = abd_alloc_struct(0);
/*
* Even if this buf is filesystem metadata, we only track that
* if we own the underlying data buffer, which is not true in
* this case. Therefore, we don't ever use ABD_FLAG_META here.
*/
abd->abd_flags = 0;
abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
if (new_offset < sg->length)
break;
new_offset -= sg->length;
}
ABD_SCATTER(abd).abd_sgl = sg;
ABD_SCATTER(abd).abd_offset = new_offset;
ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
return (abd);
}
/*
* Initialize the abd_iter.
*/
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
abd_verify(abd);
aiter->iter_abd = abd;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
aiter->iter_pos = 0;
if (abd_is_linear(abd)) {
aiter->iter_offset = 0;
aiter->iter_sg = NULL;
} else {
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
}
}
/*
* This is just a helper function to see if we have exhausted the
* abd_iter and reached the end.
*/
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}
/*
* Advance the iterator by a certain amount. Cannot be called when a chunk is
* in use. This can be safely called when the aiter has already been
* exhausted, in which case this does nothing.
*/
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
/* There's nothing left to advance to, so do nothing */
if (abd_iter_at_end(aiter))
return;
aiter->iter_pos += amount;
aiter->iter_offset += amount;
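/*
 * For a scatter ABD the walk below re-anchors the offset: e.g.
 * advancing 6000 bytes over a 4096-byte sg entry leaves
 * iter_offset == 1904 within the next entry.
 */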
if (!abd_is_linear(aiter->iter_abd)) {
while (aiter->iter_offset >= aiter->iter_sg->length) {
aiter->iter_offset -= aiter->iter_sg->length;
aiter->iter_sg = sg_next(aiter->iter_sg);
if (aiter->iter_sg == NULL) {
ASSERT0(aiter->iter_offset);
break;
}
}
}
}
/*
* Map the current chunk into aiter. This can be safely called when the aiter
* has already been exhausted, in which case this does nothing.
*/
void
abd_iter_map(struct abd_iter *aiter)
{
void *paddr;
size_t offset = 0;
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
/* There's nothing left to iterate over, so do nothing */
if (abd_iter_at_end(aiter))
return;
if (abd_is_linear(aiter->iter_abd)) {
ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
offset = aiter->iter_offset;
aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
paddr = ABD_LINEAR_BUF(aiter->iter_abd);
} else {
offset = aiter->iter_offset;
aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
aiter->iter_abd->abd_size - aiter->iter_pos);
paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
km_table[aiter->iter_km]);
}
aiter->iter_mapaddr = (char *)paddr + offset;
}
/*
* Unmap the current chunk from aiter. This can be safely called when the aiter
* has already been exhausted, in which case this does nothing.
*/
void
abd_iter_unmap(struct abd_iter *aiter)
{
/* There's nothing left to unmap, so do nothing */
if (abd_iter_at_end(aiter))
return;
if (!abd_is_linear(aiter->iter_abd)) {
/* LINTED E_FUNC_SET_NOT_USED */
zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
km_table[aiter->iter_km]);
}
ASSERT3P(aiter->iter_mapaddr, !=, NULL);
ASSERT3U(aiter->iter_mapsize, >, 0);
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
void
abd_enter_critical(unsigned long flags)
{
local_irq_save(flags);
}
void
abd_exit_critical(unsigned long flags)
{
local_irq_restore(flags);
}
#if defined(_KERNEL)
/*
* bio_nr_pages for ABD.
* @off is the offset in @abd
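* For example, pos = 1000 and size = 5000 with 4 KiB pages touches
* pages 0 and 1, so the result is 2.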
*/
unsigned long
abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
{
unsigned long pos;
if (abd_is_linear(abd))
pos = (unsigned long)abd_to_buf(abd) + off;
else
pos = ABD_SCATTER(abd).abd_offset + off;
return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
(pos >> PAGE_SHIFT);
}
/*
* bio_map for scatter ABD.
* @off is the offset in @abd
* Remaining IO size is returned
*/
unsigned int
abd_scatter_bio_map_off(struct bio *bio, abd_t *abd,
unsigned int io_size, size_t off)
{
int i;
struct abd_iter aiter;
ASSERT(!abd_is_linear(abd));
ASSERT3U(io_size, <=, abd->abd_size - off);
abd_iter_init(&aiter, abd);
abd_iter_advance(&aiter, off);
for (i = 0; i < bio->bi_max_vecs; i++) {
struct page *pg;
size_t len, sgoff, pgoff;
struct scatterlist *sg;
if (io_size <= 0)
break;
sg = aiter.iter_sg;
sgoff = aiter.iter_offset;
pgoff = sgoff & (PAGESIZE - 1);
len = MIN(io_size, PAGESIZE - pgoff);
ASSERT(len > 0);
pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
if (bio_add_page(bio, pg, len, pgoff) != len)
break;
io_size -= len;
abd_iter_advance(&aiter, len);
}
return (io_size);
}
/* Tunable Parameters */
module_param(zfs_abd_scatter_enabled, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_enabled,
"Toggle whether ABD allocations must be linear.");
module_param(zfs_abd_scatter_min_size, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_min_size,
"Minimum size of scatter allocations.");
/* CSTYLED */
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD.");
#endif


@@ -14,6 +14,7 @@ ccflags-y += $(NO_UNUSED_BUT_SET_VARIABLE)
 # Suppress unused-value warnings in sparc64 architecture headers
 ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
 
+$(MODULE)-objs += abd.o
 $(MODULE)-objs += aggsum.o
 $(MODULE)-objs += arc.o
 $(MODULE)-objs += blkptr.o


@ -1,17 +1,26 @@
/* /*
* This file and its contents are supplied under the terms of the * CDDL HEADER START
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
* *
* A full copy of the text of the CDDL should have accompanied this * The contents of this file are subject to the terms of the
* source. A copy of the CDDL is also available via the Internet at * Common Development and Distribution License (the "License").
* http://www.illumos.org/license/CDDL. * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/ */
/* /*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved.
*/ */
/* /*
@ -50,11 +59,6 @@
* +----------------->| chunk N-1 | * +----------------->| chunk N-1 |
* +-----------+ * +-----------+
* *
* Using a large proportion of scattered ABDs decreases ARC fragmentation since
* when we are at the limit of allocatable space, using equal-size chunks will
* allow us to quickly reclaim enough space for a new large allocation (assuming
* it is also scattered).
*
* In addition to directly allocating a linear or scattered ABD, it is also * In addition to directly allocating a linear or scattered ABD, it is also
* possible to create an ABD by requesting the "sub-ABD" starting at an offset * possible to create an ABD by requesting the "sub-ABD" starting at an offset
* within an existing ABD. In linear buffers this is simple (set abd_buf of * within an existing ABD. In linear buffers this is simple (set abd_buf of
@ -83,186 +87,55 @@
* compare, copy, read, write, and fill with zeroes. If you need a custom * compare, copy, read, write, and fill with zeroes. If you need a custom
* function which progressively accesses the whole ABD, use the abd_iterate_* * function which progressively accesses the whole ABD, use the abd_iterate_*
* functions. * functions.
*
* It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
* B_FALSE.
*/ */
#include <sys/abd.h> #include <sys/abd_impl.h>
#include <sys/param.h> #include <sys/param.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/zfs_znode.h> #include <sys/zfs_znode.h>
typedef struct abd_stats { /* see block comment above for description */
kstat_named_t abdstat_struct_size; int zfs_abd_scatter_enabled = B_TRUE;
kstat_named_t abdstat_scatter_cnt;
kstat_named_t abdstat_scatter_data_size;
kstat_named_t abdstat_scatter_chunk_waste;
kstat_named_t abdstat_linear_cnt;
kstat_named_t abdstat_linear_data_size;
} abd_stats_t;
static abd_stats_t abd_stats = { boolean_t
/* Amount of memory occupied by all of the abd_t struct allocations */ abd_is_linear(abd_t *abd)
{ "struct_size", KSTAT_DATA_UINT64 },
/*
* The number of scatter ABDs which are currently allocated, excluding
* ABDs which don't own their data (for instance the ones which were
* allocated through abd_get_offset()).
*/
{ "scatter_cnt", KSTAT_DATA_UINT64 },
/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
{ "scatter_data_size", KSTAT_DATA_UINT64 },
/*
* The amount of space wasted at the end of the last chunk across all
* scatter ABDs tracked by scatter_cnt.
*/
{ "scatter_chunk_waste", KSTAT_DATA_UINT64 },
/*
* The number of linear ABDs which are currently allocated, excluding
* ABDs which don't own their data (for instance the ones which were
* allocated through abd_get_offset() and abd_get_from_buf()). If an
* ABD takes ownership of its buf then it will become tracked.
*/
{ "linear_cnt", KSTAT_DATA_UINT64 },
/* Amount of data stored in all linear ABDs tracked by linear_cnt */
{ "linear_data_size", KSTAT_DATA_UINT64 },
};
#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
#define ABDSTAT_INCR(stat, val) \
atomic_add_64(&abd_stats.stat.value.ui64, (val))
#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
/*
* It is possible to make all future ABDs be linear by setting this to B_FALSE.
* Otherwise, ABDs are allocated scattered by default unless the caller uses
* abd_alloc_linear().
*/
boolean_t zfs_abd_scatter_enabled = B_TRUE;
/*
* The size of the chunks ABD allocates. Because the sizes allocated from the
* kmem_cache can't change, this tunable can only be modified at boot. Changing
* it at runtime would cause ABD iteration to work incorrectly for ABDs which
* were allocated with the old size, so a safeguard has been put in place which
* will cause the machine to panic if you change it and try to access the data
* within a scattered ABD.
*/
size_t zfs_abd_chunk_size = 4096;
#if defined(_KERNEL)
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
&zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
#endif
kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;
extern inline boolean_t abd_is_linear(abd_t *abd);
extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size);
extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size);
extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size);
extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size);
extern inline void abd_zero(abd_t *abd, size_t size);
static void *
abd_alloc_chunk()
{ {
void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
ASSERT3P(c, !=, NULL);
return (c);
} }
static void boolean_t
abd_free_chunk(void *c) abd_is_linear_page(abd_t *abd)
{ {
kmem_cache_free(abd_chunk_cache, c); return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ?
B_TRUE : B_FALSE);
} }
void void
abd_init(void)
{
abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG);
abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
if (abd_ksp != NULL) {
abd_ksp->ks_data = &abd_stats;
kstat_install(abd_ksp);
}
}
void
abd_fini(void)
{
if (abd_ksp != NULL) {
kstat_delete(abd_ksp);
abd_ksp = NULL;
}
kmem_cache_destroy(abd_chunk_cache);
abd_chunk_cache = NULL;
}
static inline size_t
abd_chunkcnt_for_bytes(size_t size)
{
return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
}
static inline size_t
abd_scatter_chunkcnt(abd_t *abd)
{
ASSERT(!abd_is_linear(abd));
return (abd_chunkcnt_for_bytes(
abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
}
static inline void
abd_verify(abd_t *abd) abd_verify(abd_t *abd)
{ {
ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, >, 0);
ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META)); ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) { if (abd_is_linear(abd)) {
ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);
} else { } else {
ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, abd_verify_scatter(abd);
zfs_abd_chunk_size);
size_t n = abd_scatter_chunkcnt(abd);
for (int i = 0; i < n; i++) {
ASSERT3P(
abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
}
} }
} }
static inline abd_t * uint_t
abd_alloc_struct(size_t chunkcnt) abd_get_size(abd_t *abd)
{ {
size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); abd_verify(abd);
abd_t *abd = kmem_alloc(size, KM_PUSHPAGE); return (abd->abd_size);
ASSERT3P(abd, !=, NULL);
ABDSTAT_INCR(abdstat_struct_size, size);
return (abd);
}
static inline void
abd_free_struct(abd_t *abd)
{
size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
kmem_free(abd, size);
ABDSTAT_INCR(abdstat_struct_size, -size);
} }
/* /*
@ -272,15 +145,16 @@ abd_free_struct(abd_t *abd)
abd_t * abd_t *
abd_alloc(size_t size, boolean_t is_metadata) abd_alloc(size_t size, boolean_t is_metadata)
{ {
if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size) if (!zfs_abd_scatter_enabled || abd_size_alloc_linear(size))
return (abd_alloc_linear(size, is_metadata)); return (abd_alloc_linear(size, is_metadata));
VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
size_t n = abd_chunkcnt_for_bytes(size); abd_t *abd = abd_alloc_struct(size);
abd_t *abd = abd_alloc_struct(n);
abd->abd_flags = ABD_FLAG_OWNER; abd->abd_flags = ABD_FLAG_OWNER;
abd->abd_u.abd_scatter.abd_offset = 0;
abd_alloc_chunks(abd, size);
if (is_metadata) { if (is_metadata) {
abd->abd_flags |= ABD_FLAG_META; abd->abd_flags |= ABD_FLAG_META;
} }
@ -288,19 +162,7 @@ abd_alloc(size_t size, boolean_t is_metadata)
abd->abd_parent = NULL; abd->abd_parent = NULL;
zfs_refcount_create(&abd->abd_children); zfs_refcount_create(&abd->abd_children);
abd->abd_u.abd_scatter.abd_offset = 0; abd_update_scatter_stats(abd, ABDSTAT_INCR);
abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
for (int i = 0; i < n; i++) {
void *c = abd_alloc_chunk();
ASSERT3P(c, !=, NULL);
abd->abd_u.abd_scatter.abd_chunks[i] = c;
}
ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
n * zfs_abd_chunk_size - size);
return (abd); return (abd);
} }
@ -308,17 +170,32 @@ abd_alloc(size_t size, boolean_t is_metadata)
static void static void
abd_free_scatter(abd_t *abd) abd_free_scatter(abd_t *abd)
{ {
size_t n = abd_scatter_chunkcnt(abd); abd_free_chunks(abd);
for (int i = 0; i < n; i++) {
abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); zfs_refcount_destroy(&abd->abd_children);
abd_update_scatter_stats(abd, ABDSTAT_DECR);
abd_free_struct(abd);
}
/*
* Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
* free the underlying scatterlist or buffer.
*/
void
abd_put(abd_t *abd)
{
if (abd == NULL)
return;
abd_verify(abd);
ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
if (abd->abd_parent != NULL) {
(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
abd->abd_size, abd);
} }
zfs_refcount_destroy(&abd->abd_children); zfs_refcount_destroy(&abd->abd_children);
ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
abd->abd_size - n * zfs_abd_chunk_size);
abd_free_struct(abd); abd_free_struct(abd);
} }
@@ -343,13 +220,12 @@ abd_alloc_linear(size_t size, boolean_t is_metadata)
     zfs_refcount_create(&abd->abd_children);
 
     if (is_metadata) {
-        abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
+        ABD_LINEAR_BUF(abd) = zio_buf_alloc(size);
     } else {
-        abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
+        ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size);
     }
 
-    ABDSTAT_BUMP(abdstat_linear_cnt);
-    ABDSTAT_INCR(abdstat_linear_data_size, size);
+    abd_update_linear_stats(abd, ABDSTAT_INCR);
 
     return (abd);
 }
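
ABD_LINEAR_BUF() is one of the accessor macros added in the new abd_impl.h. Its likely shape, sketched here as an assumption (the authoritative definition is in that header, which this excerpt does not show):

    /* assumed: wraps the old direct union access */
    #define ABD_LINEAR_BUF(abd) ((abd)->abd_u.abd_linear.abd_buf)
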
@@ -357,15 +233,18 @@ abd_alloc_linear(size_t size, boolean_t is_metadata)
 static void
 abd_free_linear(abd_t *abd)
 {
+    if (abd_is_linear_page(abd)) {
+        abd_free_linear_page(abd);
+        return;
+    }
     if (abd->abd_flags & ABD_FLAG_META) {
-        zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+        zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
     } else {
-        zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+        zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
     }
 
     zfs_refcount_destroy(&abd->abd_children);
-    ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
-    ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+    abd_update_linear_stats(abd, ABDSTAT_DECR);
 
     abd_free_struct(abd);
 }
@@ -397,39 +276,23 @@ abd_t *
 abd_alloc_sametype(abd_t *sabd, size_t size)
 {
     boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
-    if (abd_is_linear(sabd)) {
+    if (abd_is_linear(sabd) &&
+        !abd_is_linear_page(sabd)) {
         return (abd_alloc_linear(size, is_metadata));
     } else {
         return (abd_alloc(size, is_metadata));
     }
 }
 
-/*
- * If we're going to use this ABD for doing I/O using the block layer, the
- * consumer of the ABD data doesn't care if it's scattered or not, and we don't
- * plan to store this ABD in memory for a long period of time, we should
- * allocate the ABD type that requires the least data copying to do the I/O.
- *
- * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
- * using a scatter/gather list we should switch to that and replace this call
- * with vanilla abd_alloc().
- */
-abd_t *
-abd_alloc_for_io(size_t size, boolean_t is_metadata)
-{
-    return (abd_alloc_linear(size, is_metadata));
-}
 /*
  * Allocate a new ABD to point to offset off of sabd. It shares the underlying
  * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
  * any derived ABDs exist.
  */
-/* ARGSUSED */
-static inline abd_t *
+static abd_t *
 abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
 {
-    abd_t *abd;
+    abd_t *abd = NULL;
 
     abd_verify(sabd);
     ASSERT3U(off, <=, sabd->abd_size);
@@ -444,60 +307,33 @@ abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
          */
         abd->abd_flags = ABD_FLAG_LINEAR;
 
-        abd->abd_u.abd_linear.abd_buf =
-            (char *)sabd->abd_u.abd_linear.abd_buf + off;
+        ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
     } else {
-        size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
-        size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
-            (new_offset / zfs_abd_chunk_size);
-
-        abd = abd_alloc_struct(chunkcnt);
-
-        /*
-         * Even if this buf is filesystem metadata, we only track that
-         * if we own the underlying data buffer, which is not true in
-         * this case. Therefore, we don't ever use ABD_FLAG_META here.
-         */
-        abd->abd_flags = 0;
-
-        abd->abd_u.abd_scatter.abd_offset =
-            new_offset % zfs_abd_chunk_size;
-        abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
-
-        /* Copy the scatterlist starting at the correct offset */
-        (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
-            &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
-            zfs_abd_chunk_size],
-            chunkcnt * sizeof (void *));
+        abd = abd_get_offset_scatter(sabd, off);
     }
 
-    if (size == 0)
-        abd->abd_size = sabd->abd_size - off;
-    else
-        abd->abd_size = size;
+    abd->abd_size = size;
     abd->abd_parent = sabd;
     zfs_refcount_create(&abd->abd_children);
     (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
 
     return (abd);
 }
 abd_t *
 abd_get_offset(abd_t *sabd, size_t off)
 {
-    return (abd_get_offset_impl(sabd, off, 0));
+    size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
+
+    VERIFY3U(size, >, 0);
+    return (abd_get_offset_impl(sabd, off, size));
 }
 
 abd_t *
 abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
 {
     ASSERT3U(off + size, <=, sabd->abd_size);
     return (abd_get_offset_impl(sabd, off, size));
 }
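
abd_get_offset() now computes the remaining length itself and VERIFYs that it is nonzero, rather than passing 0 as a sentinel for abd_get_offset_impl() to decode. Both entry points, sketched (src stands for any source ABD):

    abd_t *rest = abd_get_offset(src, 512);           /* size = abd_get_size(src) - 512 */
    abd_t *blk = abd_get_offset_size(src, 512, 4096); /* exactly 4 KiB window */
    /* ... */
    abd_put(blk);
    abd_put(rest);
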
 /*
  * Allocate a linear ABD structure for buf. You must free this with abd_put()
  * since the resulting ABD doesn't own its own buffer.
@@ -519,32 +355,11 @@ abd_get_from_buf(void *buf, size_t size)
     abd->abd_parent = NULL;
     zfs_refcount_create(&abd->abd_children);
 
-    abd->abd_u.abd_linear.abd_buf = buf;
+    ABD_LINEAR_BUF(abd) = buf;
 
     return (abd);
 }
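
abd_get_from_buf() wraps memory the ABD does not own: the wrapper is released with abd_put() and the buffer by whoever allocated it. A sketch:

    void *buf = zio_buf_alloc(4096);
    abd_t *abd = abd_get_from_buf(buf, 4096);
    /* ... hand abd to an interface that only takes abd_t ... */
    abd_put(abd);            /* frees the wrapper only */
    zio_buf_free(buf, 4096); /* caller still owns buf */
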
 
-/*
- * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
- * free the underlying scatterlist or buffer.
- */
-void
-abd_put(abd_t *abd)
-{
-    if (abd == NULL)
-        return;
-
-    abd_verify(abd);
-    ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
-
-    if (abd->abd_parent != NULL) {
-        (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
-            abd->abd_size, abd);
-    }
-
-    zfs_refcount_destroy(&abd->abd_children);
-    abd_free_struct(abd);
-}
 /*
  * Get the raw buffer associated with a linear ABD.
  */
@@ -553,7 +368,7 @@ abd_to_buf(abd_t *abd)
 {
     ASSERT(abd_is_linear(abd));
     abd_verify(abd);
-    return (abd->abd_u.abd_linear.abd_buf);
+    return (ABD_LINEAR_BUF(abd));
 }
 
 /*
@@ -574,7 +389,6 @@ abd_borrow_buf(abd_t *abd, size_t n)
         buf = zio_buf_alloc(n);
     }
     (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
-
     return (buf);
 }
 
@@ -617,6 +431,31 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
     abd_return_buf(abd, buf, n);
 }
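
The borrow/return pair stays in the common file since it only adjusts abd_children accounting. The usual pattern, sketched:

    void *tmp = abd_borrow_buf_copy(abd, size); /* linear copy of the contents */
    /* ... modify tmp in place ... */
    abd_return_buf_copy(abd, tmp, size);        /* copy back and release */
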
 
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+    ASSERT(abd_is_linear(abd));
+    ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+    /*
+     * abd_free() needs to handle LINEAR_PAGE ABD's specially.
+     * Since that flag does not survive the
+     * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+     * abd_take_ownership_of_buf() sequence, we don't allow releasing
+     * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
+     */
+    ASSERT(!abd_is_linear_page(abd));
+
+    abd_verify(abd);
+
+    abd->abd_flags &= ~ABD_FLAG_OWNER;
+    /* Disable this flag since we no longer own the data buffer */
+    abd->abd_flags &= ~ABD_FLAG_META;
+
+    abd_update_linear_stats(abd, ABDSTAT_DECR);
+}
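
The relocated abd_release_ownership_of_buf() gains the LINEAR_PAGE assertion its comment explains. The ownership hand-off it permits, sketched (size stands for the buffer's length):

    void *buf = abd_to_buf(abd);       /* linear ABDs only */
    abd_release_ownership_of_buf(abd); /* abd keeps pointing at buf, stops owning it */
    abd_t *heir = abd_get_from_buf(buf, size);
    abd_take_ownership_of_buf(heir, B_FALSE); /* heir now frees buf */
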
 
 /*
  * Give this ABD ownership of the buffer that it's storing. Can only be used on
  * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
@@ -635,130 +474,7 @@ abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
         abd->abd_flags |= ABD_FLAG_META;
     }
 
-    ABDSTAT_BUMP(abdstat_linear_cnt);
-    ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
-}
+    abd_update_linear_stats(abd, ABDSTAT_INCR);
-
-void
-abd_release_ownership_of_buf(abd_t *abd)
-{
-    ASSERT(abd_is_linear(abd));
-    ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
-    abd_verify(abd);
-
-    abd->abd_flags &= ~ABD_FLAG_OWNER;
-    /* Disable this flag since we no longer own the data buffer */
-    abd->abd_flags &= ~ABD_FLAG_META;
-
-    ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
-    ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
-}
-
-struct abd_iter {
-    abd_t  *iter_abd;     /* ABD being iterated through */
-    size_t iter_pos;      /* position (relative to abd_offset) */
-    void   *iter_mapaddr; /* addr corresponding to iter_pos */
-    size_t iter_mapsize;  /* length of data valid at mapaddr */
-};
-
-static inline size_t
-abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
-{
-    ASSERT(!abd_is_linear(aiter->iter_abd));
-    return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
-        aiter->iter_pos) % zfs_abd_chunk_size);
-}
-
-static inline size_t
-abd_iter_scatter_chunk_index(struct abd_iter *aiter)
-{
-    ASSERT(!abd_is_linear(aiter->iter_abd));
-    return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
-        aiter->iter_pos) / zfs_abd_chunk_size);
-}
-
-/*
- * Initialize the abd_iter.
- */
-static void
-abd_iter_init(struct abd_iter *aiter, abd_t *abd)
-{
-    abd_verify(abd);
-    aiter->iter_abd = abd;
-    aiter->iter_pos = 0;
-    aiter->iter_mapaddr = NULL;
-    aiter->iter_mapsize = 0;
-}
-
-/*
- * Advance the iterator by a certain amount. Cannot be called when a chunk is
- * in use. This can be safely called when the aiter has already exhausted, in
- * which case this does nothing.
- */
-static void
-abd_iter_advance(struct abd_iter *aiter, size_t amount)
-{
-    ASSERT3P(aiter->iter_mapaddr, ==, NULL);
-    ASSERT0(aiter->iter_mapsize);
-
-    /* There's nothing left to advance to, so do nothing */
-    if (aiter->iter_pos == aiter->iter_abd->abd_size)
-        return;
-
-    aiter->iter_pos += amount;
-}
-
-/*
- * Map the current chunk into aiter. This can be safely called when the aiter
- * has already exhausted, in which case this does nothing.
- */
-static void
-abd_iter_map(struct abd_iter *aiter)
-{
-    void *paddr;
-    size_t offset = 0;
-
-    ASSERT3P(aiter->iter_mapaddr, ==, NULL);
-    ASSERT0(aiter->iter_mapsize);
-
-    /* Panic if someone has changed zfs_abd_chunk_size */
-    IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
-        aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size);
-
-    /* There's nothing left to iterate over, so do nothing */
-    if (aiter->iter_pos == aiter->iter_abd->abd_size)
-        return;
-
-    if (abd_is_linear(aiter->iter_abd)) {
-        offset = aiter->iter_pos;
-        aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
-        paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
-    } else {
-        size_t index = abd_iter_scatter_chunk_index(aiter);
-        offset = abd_iter_scatter_chunk_offset(aiter);
-        aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
-            aiter->iter_abd->abd_size - aiter->iter_pos);
-        paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
-    }
-    aiter->iter_mapaddr = (char *)paddr + offset;
-}
-
-/*
- * Unmap the current chunk from aiter. This can be safely called when the aiter
- * has already exhausted, in which case this does nothing.
- */
-static void
-abd_iter_unmap(struct abd_iter *aiter)
-{
-    /* There's nothing left to unmap, so do nothing */
-    if (aiter->iter_pos == aiter->iter_abd->abd_size)
-        return;
-
-    ASSERT3P(aiter->iter_mapaddr, !=, NULL);
-    ASSERT3U(aiter->iter_mapsize, >, 0);
-
-    aiter->iter_mapaddr = NULL;
-    aiter->iter_mapsize = 0;
 }
 
 int
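
struct abd_iter and the chunk-math helpers move to each abd_os.c, but the iteration entry points stay common, so consumers are untouched. A sketch with an illustrative callback (checksum_cb is not from this commit):

    static int
    checksum_cb(void *buf, size_t len, void *priv)
    {
        uint64_t *sum = priv;
        for (size_t i = 0; i < len; i++)
            *sum += ((uint8_t *)buf)[i];
        return (0); /* nonzero would abort the walk */
    }

    uint64_t sum = 0;
    (void) abd_iterate_func(abd, 0, abd_get_size(abd), checksum_cb, &sum);
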
@@ -987,6 +703,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
     struct abd_iter caiters[3];
     struct abd_iter daiter = {0};
     void *caddrs[3];
+    unsigned long flags = 0;
 
     ASSERT3U(parity, <=, 3);
 
@@ -998,7 +715,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
     ASSERT3S(dsize, >=, 0);
 
-    critical_enter();
+    abd_enter_critical(flags);
     while (csize > 0) {
         len = csize;
@@ -1010,11 +727,14 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
             caddrs[i] = caiters[i].iter_mapaddr;
         }
 
         switch (parity) {
             case 3:
                 len = MIN(caiters[2].iter_mapsize, len);
+                /* falls through */
             case 2:
                 len = MIN(caiters[1].iter_mapsize, len);
+                /* falls through */
             case 1:
                 len = MIN(caiters[0].iter_mapsize, len);
         }
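
The new fall-through annotations document that the switch cascades on purpose: len ends up as the minimum mapsize across all parity columns, equivalent to:

    for (int j = parity - 1; j >= 0; j--)
        len = MIN(caiters[j].iter_mapsize, len);
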
@@ -1055,7 +775,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
         ASSERT3S(dsize, >=, 0);
         ASSERT3S(csize, >=, 0);
     }
-    critical_exit();
+    abd_exit_critical(flags);
 }
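
critical_enter()/critical_exit() are FreeBSD primitives, so the raidz loops switch to abd_enter_critical()/abd_exit_critical(), which each abd_os.c maps onto its platform. A plausible shape, stated as an assumption (the real definitions live in the per-OS files, outside this excerpt):

    /* FreeBSD abd_os.c (assumed): */
    #define abd_enter_critical(flags) critical_enter()
    #define abd_exit_critical(flags)  critical_exit()
    /* Linux abd_os.c (assumed): */
    #define abd_enter_critical(flags) local_irq_save(flags)
    #define abd_exit_critical(flags)  local_irq_restore(flags)
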
 /*
@@ -1080,6 +800,7 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
     struct abd_iter citers[3];
     struct abd_iter xiters[3];
     void *caddrs[3], *xaddrs[3];
+    unsigned long flags = 0;
 
     ASSERT3U(parity, <=, 3);
 
@@ -1088,7 +809,7 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
         abd_iter_init(&xiters[i], tabds[i]);
     }
 
-    critical_enter();
+    abd_enter_critical(flags);
     while (tsize > 0) {
         for (i = 0; i < parity; i++) {
@@ -1103,9 +824,11 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
             case 3:
                 len = MIN(xiters[2].iter_mapsize, len);
                 len = MIN(citers[2].iter_mapsize, len);
+                /* falls through */
             case 2:
                 len = MIN(xiters[1].iter_mapsize, len);
                 len = MIN(citers[1].iter_mapsize, len);
+                /* falls through */
             case 1:
                 len = MIN(xiters[0].iter_mapsize, len);
                 len = MIN(citers[0].iter_mapsize, len);
@@ -1130,5 +853,5 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
         tsize -= len;
         ASSERT3S(tsize, >=, 0);
     }
-    critical_exit();
+    abd_exit_critical(flags);
 }

View File

@@ -1638,7 +1638,7 @@ vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
         if (ic->ic_data == NULL)
             continue;
 
-        abd_zero(ic->ic_data, ic->ic_data->abd_size);
+        abd_zero(ic->ic_data, abd_get_size(ic->ic_data));
     }
 
     iv->iv_attempts_max *= 2;