Merge commit 'refs/top-bases/linux-debug-zerocopy' into linux-debug-zerocopy

This commit is contained in:
Brian Behlendorf 2009-02-18 14:35:05 -08:00
commit 8241395b5a
39 changed files with 1092 additions and 740 deletions

View File

@ -1 +1 @@
http://dlc.sun.com/osol/on/downloads/b105/on-src.tar.bz2 http://dlc.sun.com/osol/on/downloads/b108/on-src.tar.bz2

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -1747,7 +1747,7 @@ zfs_do_list(int argc, char **argv)
boolean_t scripted = B_FALSE; boolean_t scripted = B_FALSE;
static char default_fields[] = static char default_fields[] =
"name,used,available,referenced,mountpoint"; "name,used,available,referenced,mountpoint";
int types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; int types = ZFS_TYPE_DATASET;
boolean_t types_specified = B_FALSE; boolean_t types_specified = B_FALSE;
char *fields = NULL; char *fields = NULL;
list_cbdata_t cb = { 0 }; list_cbdata_t cb = { 0 };
@ -2440,7 +2440,7 @@ zfs_do_receive(int argc, char **argv)
bzero(&flags, sizeof (recvflags_t)); bzero(&flags, sizeof (recvflags_t));
/* check options */ /* check options */
while ((c = getopt(argc, argv, ":dnvF")) != -1) { while ((c = getopt(argc, argv, ":dnuvF")) != -1) {
switch (c) { switch (c) {
case 'd': case 'd':
flags.isprefix = B_TRUE; flags.isprefix = B_TRUE;
@ -2448,6 +2448,9 @@ zfs_do_receive(int argc, char **argv)
case 'n': case 'n':
flags.dryrun = B_TRUE; flags.dryrun = B_TRUE;
break; break;
case 'u':
flags.nomount = B_TRUE;
break;
case 'v': case 'v':
flags.verbose = B_TRUE; flags.verbose = B_TRUE;
break; break;

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -1323,7 +1323,8 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
verify((zhp = zpool_open_canfail(g_zfs, name)) != NULL); verify((zhp = zpool_open_canfail(g_zfs, name)) != NULL);
if (zpool_enable_datasets(zhp, mntopts, 0) != 0) { if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
zpool_enable_datasets(zhp, mntopts, 0) != 0) {
zpool_close(zhp); zpool_close(zhp);
return (1); return (1);
} }

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -264,9 +264,15 @@ typedef enum {
ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */
ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */
ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */
ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */
/*
* These faults have no corresponding message ID. At the time we are
* checking the status, the original reason for the FMA fault (I/O or
* checksum errors) has been lost.
*/
ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */
ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */
ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */
/* /*
* The following are not faults per se, but still an error possibly * The following are not faults per se, but still an error possibly
@ -466,6 +472,9 @@ typedef struct recvflags {
/* byteswap flag is used internally; callers need not specify */ /* byteswap flag is used internally; callers need not specify */
int byteswap : 1; int byteswap : 1;
/* do not mount file systems as they are extracted (private) */
int nomount : 1;
} recvflags_t; } recvflags_t;
extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t, extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t,

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -1190,10 +1190,12 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
/* /*
* And mount all the datasets, keeping track of which ones * And mount all the datasets, keeping track of which ones
* succeeded or failed. By using zfs_alloc(), the good pointer * succeeded or failed.
* will always be non-NULL.
*/ */
good = zfs_alloc(zhp->zpool_hdl, cb.cb_used * sizeof (int)); if ((good = zfs_alloc(zhp->zpool_hdl,
cb.cb_used * sizeof (int))) == NULL)
goto out;
ret = 0; ret = 0;
for (i = 0; i < cb.cb_used; i++) { for (i = 0; i < cb.cb_used; i++) {
if (zfs_mount(cb.cb_datasets[i], mntopts, flags) != 0) if (zfs_mount(cb.cb_datasets[i], mntopts, flags) != 0)

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -217,12 +217,39 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
uint_t vsc; uint_t vsc;
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
if (prop == ZPOOL_PROP_NAME) switch (prop) {
case ZPOOL_PROP_NAME:
(void) strlcpy(buf, zpool_get_name(zhp), len); (void) strlcpy(buf, zpool_get_name(zhp), len);
else if (prop == ZPOOL_PROP_HEALTH) break;
case ZPOOL_PROP_HEALTH:
(void) strlcpy(buf, "FAULTED", len); (void) strlcpy(buf, "FAULTED", len);
else break;
case ZPOOL_PROP_GUID:
intval = zpool_get_prop_int(zhp, prop, &src);
(void) snprintf(buf, len, "%llu", intval);
break;
case ZPOOL_PROP_ALTROOT:
case ZPOOL_PROP_CACHEFILE:
if (zhp->zpool_props != NULL ||
zpool_get_all_props(zhp) == 0) {
(void) strlcpy(buf,
zpool_get_prop_string(zhp, prop, &src),
len);
if (srctype != NULL)
*srctype = src;
return (0);
}
/* FALLTHROUGH */
default:
(void) strlcpy(buf, "-", len); (void) strlcpy(buf, "-", len);
break;
}
if (srctype != NULL)
*srctype = src;
return (0); return (0);
} }
@ -532,9 +559,6 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
zhp->zpool_name); zhp->zpool_name);
if (zhp->zpool_props == NULL && zpool_get_all_props(zhp))
return (zfs_error(zhp->zpool_hdl, EZFS_POOLPROPS, errbuf));
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
return (no_memory(zhp->zpool_hdl)); return (no_memory(zhp->zpool_hdl));

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -2078,7 +2078,7 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
err = zfs_receive_impl(hdl, tosnap, flags, infd, stream_avl, &top_zfs); err = zfs_receive_impl(hdl, tosnap, flags, infd, stream_avl, &top_zfs);
if (err == 0 && top_zfs) { if (err == 0 && !flags.nomount && top_zfs) {
zfs_handle_t *zhp; zfs_handle_t *zhp;
prop_changelist_t *clp; prop_changelist_t *clp;

View File

@ -480,7 +480,6 @@ zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize)
if ((ret = realloc(ptr, newsize)) == NULL) { if ((ret = realloc(ptr, newsize)) == NULL) {
(void) no_memory(hdl); (void) no_memory(hdl);
free(ptr);
return (NULL); return (NULL);
} }
@ -1025,9 +1024,9 @@ zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
return (-1); return (-1);
} }
/* Rely on strtoll() to process the numeric portion. */ /* Rely on strtoull() to process the numeric portion. */
errno = 0; errno = 0;
*num = strtoll(value, &end, 10); *num = strtoull(value, &end, 10);
/* /*
* Check for ERANGE, which indicates that the value is too large to fit * Check for ERANGE, which indicates that the value is too large to fit

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -498,7 +498,7 @@ typedef struct callb_cpr {
/* /*
* Hostname information * Hostname information
*/ */
extern char hw_serial[]; extern char hw_serial[]; /* for userland-emulated hostid access */
extern int ddi_strtoul(const char *str, char **nptr, int base, extern int ddi_strtoul(const char *str, char **nptr, int base,
unsigned long *result); unsigned long *result);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -36,6 +36,7 @@
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/utsname.h> #include <sys/utsname.h>
#include <sys/time.h> #include <sys/time.h>
#include <sys/systeminfo.h>
/* /*
* Emulation of kernel services in userland. * Emulation of kernel services in userland.
@ -43,7 +44,7 @@
uint64_t physmem; uint64_t physmem;
vnode_t *rootdir = (vnode_t *)0xabcd1234; vnode_t *rootdir = (vnode_t *)0xabcd1234;
char hw_serial[11]; char hw_serial[HW_HOSTID_LEN];
struct utsname utsname = { struct utsname utsname = {
"userland", "libzpool", "1", "1", "na" "userland", "libzpool", "1", "1", "na"
@ -843,7 +844,7 @@ kernel_init(int mode)
dprintf("physmem = %llu pages (%.2f GB)\n", physmem, dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
(double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
snprintf(hw_serial, sizeof (hw_serial), "%ld", gethostid()); (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", gethostid());
VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1); VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1); VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -152,6 +152,12 @@ typedef enum arc_reclaim_strategy {
/* number of seconds before growing cache again */ /* number of seconds before growing cache again */
static int arc_grow_retry = 60; static int arc_grow_retry = 60;
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 5;
/* /*
* minimum lifespan of a prefetch block in clock ticks * minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init()) * (initialized in arc_init())
@ -172,6 +178,9 @@ unsigned long zfs_arc_max = 0;
unsigned long zfs_arc_min = 0; unsigned long zfs_arc_min = 0;
unsigned long zfs_arc_meta_limit = 0; unsigned long zfs_arc_meta_limit = 0;
int zfs_mdcomp_disable = 0; int zfs_mdcomp_disable = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
/* /*
* Note that buffers can be in one of 6 states: * Note that buffers can be in one of 6 states:
@ -250,10 +259,14 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_max; kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size; kstat_named_t arcstat_size;
kstat_named_t arcstat_hdr_size; kstat_named_t arcstat_hdr_size;
kstat_named_t arcstat_data_size;
kstat_named_t arcstat_other_size;
kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_misses;
kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_feeds;
kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_rw_clash;
kstat_named_t arcstat_l2_read_bytes;
kstat_named_t arcstat_l2_write_bytes;
kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_sent;
kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_done;
kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_error;
@ -299,10 +312,14 @@ static arc_stats_t arc_stats = {
{ "c_max", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 },
{ "size", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 },
{ "hdr_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
{ "other_size", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 },
{ "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 },
{ "l2_read_bytes", KSTAT_DATA_UINT64 },
{ "l2_write_bytes", KSTAT_DATA_UINT64 },
{ "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 },
{ "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 },
{ "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 },
@ -425,7 +442,7 @@ struct arc_buf_hdr {
/* immutable */ /* immutable */
arc_buf_contents_t b_type; arc_buf_contents_t b_type;
uint64_t b_size; uint64_t b_size;
spa_t *b_spa; uint64_t b_spa;
/* protected by arc state mutex */ /* protected by arc state mutex */
arc_state_t *b_state; arc_state_t *b_state;
@ -447,7 +464,7 @@ static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf); static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type); static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
#define GHOST_STATE(state) \ #define GHOST_STATE(state) \
((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
@ -476,6 +493,7 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
@ -529,8 +547,9 @@ uint64_t zfs_crc64_table[256];
*/ */
#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
#define L2ARC_HEADROOM 4 /* num of writes */ #define L2ARC_HEADROOM 2 /* num of writes */
#define L2ARC_FEED_SECS 1 /* caching interval */ #define L2ARC_FEED_SECS 1 /* caching interval secs */
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
@ -542,7 +561,10 @@ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
/* /*
* L2ARC Internals * L2ARC Internals
@ -557,6 +579,7 @@ typedef struct l2arc_dev {
uint64_t l2ad_end; /* last addr on device */ uint64_t l2ad_end; /* last addr on device */
uint64_t l2ad_evict; /* last addr eviction reached */ uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
list_t *l2ad_buflist; /* buffer list */ list_t *l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */ list_node_t l2ad_node; /* device list node */
} l2arc_dev_t; } l2arc_dev_t;
@ -607,9 +630,8 @@ static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void); static void l2arc_hdr_stat_remove(void);
static uint64_t static uint64_t
buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{ {
uintptr_t spav = (uintptr_t)spa;
uint8_t *vdva = (uint8_t *)dva; uint8_t *vdva = (uint8_t *)dva;
uint64_t crc = -1ULL; uint64_t crc = -1ULL;
int i; int i;
@ -619,7 +641,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
for (i = 0; i < sizeof (dva_t); i++) for (i = 0; i < sizeof (dva_t); i++)
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
crc ^= (spav>>8) ^ birth; crc ^= (spa>>8) ^ birth;
return (crc); return (crc);
} }
@ -635,7 +657,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
((buf)->b_birth == birth) && ((buf)->b_spa == spa) ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
static arc_buf_hdr_t * static arc_buf_hdr_t *
buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
{ {
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx); kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
@ -757,8 +779,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
list_link_init(&buf->b_arc_node); list_link_init(&buf->b_arc_node);
list_link_init(&buf->b_l2node); list_link_init(&buf->b_l2node);
arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
return (0); return (0);
} }
@ -770,6 +792,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
bzero(buf, sizeof (arc_buf_t)); bzero(buf, sizeof (arc_buf_t));
rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
return (0); return (0);
} }
@ -786,8 +810,7 @@ hdr_dest(void *vbuf, void *unused)
refcount_destroy(&buf->b_refcnt); refcount_destroy(&buf->b_refcnt);
cv_destroy(&buf->b_cv); cv_destroy(&buf->b_cv);
mutex_destroy(&buf->b_freeze_lock); mutex_destroy(&buf->b_freeze_lock);
arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
} }
/* ARGSUSED */ /* ARGSUSED */
@ -797,6 +820,7 @@ buf_dest(void *vbuf, void *unused)
arc_buf_t *buf = vbuf; arc_buf_t *buf = vbuf;
rw_destroy(&buf->b_lock); rw_destroy(&buf->b_lock);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
} }
/* /*
@ -1083,15 +1107,49 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
} }
void void
arc_space_consume(uint64_t space) arc_space_consume(uint64_t space, arc_space_type_t type)
{ {
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
switch (type) {
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, space);
break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, space);
break;
case ARC_SPACE_HDRS:
ARCSTAT_INCR(arcstat_hdr_size, space);
break;
case ARC_SPACE_L2HDRS:
ARCSTAT_INCR(arcstat_l2_hdr_size, space);
break;
}
atomic_add_64(&arc_meta_used, space); atomic_add_64(&arc_meta_used, space);
atomic_add_64(&arc_size, space); atomic_add_64(&arc_size, space);
} }
void void
arc_space_return(uint64_t space) arc_space_return(uint64_t space, arc_space_type_t type)
{ {
ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
switch (type) {
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, -space);
break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, -space);
break;
case ARC_SPACE_HDRS:
ARCSTAT_INCR(arcstat_hdr_size, -space);
break;
case ARC_SPACE_L2HDRS:
ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
break;
}
ASSERT(arc_meta_used >= space); ASSERT(arc_meta_used >= space);
if (arc_meta_max < arc_meta_used) if (arc_meta_max < arc_meta_used)
arc_meta_max = arc_meta_used; arc_meta_max = arc_meta_used;
@ -1128,7 +1186,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
ASSERT(BUF_EMPTY(hdr)); ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size; hdr->b_size = size;
hdr->b_type = type; hdr->b_type = type;
hdr->b_spa = spa; hdr->b_spa = spa_guid(spa);
hdr->b_state = arc_anon; hdr->b_state = arc_anon;
hdr->b_arc_access = 0; hdr->b_arc_access = 0;
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
@ -1191,6 +1249,7 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
add_reference(hdr, hash_lock, tag); add_reference(hdr, hash_lock, tag);
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock); arc_access(hdr, hash_lock);
mutex_exit(hash_lock); mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits); ARCSTAT_BUMP(arcstat_hits);
@ -1238,11 +1297,12 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
if (type == ARC_BUFC_METADATA) { if (type == ARC_BUFC_METADATA) {
arc_buf_data_free(buf->b_hdr, zio_buf_free, arc_buf_data_free(buf->b_hdr, zio_buf_free,
buf->b_data, size); buf->b_data, size);
arc_space_return(size); arc_space_return(size, ARC_SPACE_DATA);
} else { } else {
ASSERT(type == ARC_BUFC_DATA); ASSERT(type == ARC_BUFC_DATA);
arc_buf_data_free(buf->b_hdr, arc_buf_data_free(buf->b_hdr,
zio_data_buf_free, buf->b_data, size); zio_data_buf_free, buf->b_data, size);
ARCSTAT_INCR(arcstat_data_size, -size);
atomic_add_64(&arc_size, -size); atomic_add_64(&arc_size, -size);
} }
} }
@ -1442,7 +1502,7 @@ arc_buf_size(arc_buf_t *buf)
* It may also return without evicting as much space as requested. * It may also return without evicting as much space as requested.
*/ */
static void * static void *
arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
arc_buf_contents_t type) arc_buf_contents_t type)
{ {
arc_state_t *evicted_state; arc_state_t *evicted_state;
@ -1568,7 +1628,7 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
* bytes. Destroy the buffers that are removed. * bytes. Destroy the buffers that are removed.
*/ */
static void static void
arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
{ {
arc_buf_hdr_t *ab, *ab_prev; arc_buf_hdr_t *ab, *ab_prev;
list_t *list = &state->arcs_list[ARC_BUFC_DATA]; list_t *list = &state->arcs_list[ARC_BUFC_DATA];
@ -1637,61 +1697,63 @@ top:
static void static void
arc_adjust(void) arc_adjust(void)
{ {
int64_t top_sz, mru_over, arc_over, todelete; int64_t adjustment, delta;
top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; /*
* Adjust MRU size
*/
if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { adjustment = MIN(arc_size - arc_c,
int64_t toevict = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
(void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
top_sz = arc_anon->arcs_size + arc_mru->arcs_size; delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
(void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
adjustment -= delta;
} }
if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
int64_t toevict = delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); (void) arc_evict(arc_mru, NULL, delta, FALSE,
(void) arc_evict(arc_mru, NULL, toevict, FALSE,
ARC_BUFC_METADATA); ARC_BUFC_METADATA);
top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
} }
mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; /*
* Adjust MFU size
*/
if (mru_over > 0) { adjustment = arc_size - arc_c;
if (arc_mru_ghost->arcs_size > 0) {
todelete = MIN(arc_mru_ghost->arcs_size, mru_over); if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
arc_evict_ghost(arc_mru_ghost, NULL, todelete); delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
} (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
adjustment -= delta;
} }
if ((arc_over = arc_size - arc_c) > 0) { if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
int64_t tbl_over; int64_t delta = MIN(adjustment,
arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
(void) arc_evict(arc_mfu, NULL, delta, FALSE,
ARC_BUFC_METADATA);
}
if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { /*
int64_t toevict = * Adjust ghost lists
MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); */
(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
ARC_BUFC_DATA);
arc_over = arc_size - arc_c;
}
if (arc_over > 0 && adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
int64_t toevict =
MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
arc_over);
(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
ARC_BUFC_METADATA);
}
tbl_over = arc_size + arc_mru_ghost->arcs_size + if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
arc_mfu_ghost->arcs_size - arc_c * 2; delta = MIN(arc_mru_ghost->arcs_size, adjustment);
arc_evict_ghost(arc_mru_ghost, NULL, delta);
}
if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { adjustment =
todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
} if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
arc_evict_ghost(arc_mfu_ghost, NULL, delta);
} }
} }
@ -1725,29 +1787,34 @@ arc_do_user_evicts(void)
void void
arc_flush(spa_t *spa) arc_flush(spa_t *spa)
{ {
uint64_t guid = 0;
if (spa)
guid = spa_guid(spa);
while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
if (spa) if (spa)
break; break;
} }
while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
if (spa) if (spa)
break; break;
} }
while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
if (spa) if (spa)
break; break;
} }
while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
if (spa) if (spa)
break; break;
} }
arc_evict_ghost(arc_mru_ghost, spa, -1); arc_evict_ghost(arc_mru_ghost, guid, -1);
arc_evict_ghost(arc_mfu_ghost, spa, -1); arc_evict_ghost(arc_mfu_ghost, guid, -1);
mutex_enter(&arc_reclaim_thr_lock); mutex_enter(&arc_reclaim_thr_lock);
arc_do_user_evicts(); arc_do_user_evicts();
@ -1755,8 +1822,6 @@ arc_flush(spa_t *spa)
ASSERT(spa || arc_eviction_list == NULL); ASSERT(spa || arc_eviction_list == NULL);
} }
int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
void void
arc_shrink(void) arc_shrink(void)
{ {
@ -1955,6 +2020,7 @@ static void
arc_adapt(int bytes, arc_state_t *state) arc_adapt(int bytes, arc_state_t *state)
{ {
int mult; int mult;
uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
if (state == arc_l2c_only) if (state == arc_l2c_only)
return; return;
@ -1972,12 +2038,15 @@ arc_adapt(int bytes, arc_state_t *state)
mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
arc_p = MIN(arc_c, arc_p + bytes * mult); arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
} else if (state == arc_mfu_ghost) { } else if (state == arc_mfu_ghost) {
uint64_t delta;
mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
arc_p = MAX(0, (int64_t)arc_p - bytes * mult); delta = MIN(bytes * mult, arc_p);
arc_p = MAX(arc_p_min, arc_p - delta);
} }
ASSERT((int64_t)arc_p >= 0); ASSERT((int64_t)arc_p >= 0);
@ -2075,10 +2144,11 @@ arc_get_data_buf(arc_buf_t *buf)
if (!arc_evict_needed(type)) { if (!arc_evict_needed(type)) {
if (type == ARC_BUFC_METADATA) { if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size); buf->b_data = zio_buf_alloc(size);
arc_space_consume(size); arc_space_consume(size, ARC_SPACE_DATA);
} else { } else {
ASSERT(type == ARC_BUFC_DATA); ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size); buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size); atomic_add_64(&arc_size, size);
} }
goto out; goto out;
@ -2095,21 +2165,22 @@ arc_get_data_buf(arc_buf_t *buf)
if (state == arc_mru || state == arc_anon) { if (state == arc_mru || state == arc_anon) {
uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
state = (arc_mfu->arcs_lsize[type] > 0 && state = (arc_mfu->arcs_lsize[type] >= size &&
arc_p > mru_used) ? arc_mfu : arc_mru; arc_p > mru_used) ? arc_mfu : arc_mru;
} else { } else {
/* MFU cases */ /* MFU cases */
uint64_t mfu_space = arc_c - arc_p; uint64_t mfu_space = arc_c - arc_p;
state = (arc_mru->arcs_lsize[type] > 0 && state = (arc_mru->arcs_lsize[type] >= size &&
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
} }
if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) { if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size); buf->b_data = zio_buf_alloc(size);
arc_space_consume(size); arc_space_consume(size, ARC_SPACE_DATA);
} else { } else {
ASSERT(type == ARC_BUFC_DATA); ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size); buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size); atomic_add_64(&arc_size, size);
} }
ARCSTAT_BUMP(arcstat_recycle_miss); ARCSTAT_BUMP(arcstat_recycle_miss);
@ -2311,7 +2382,7 @@ arc_read_done(zio_t *zio)
* reason for it not to be found is if we were freed during the * reason for it not to be found is if we were freed during the
* read. * read.
*/ */
found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
&hash_lock); &hash_lock);
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
@ -2458,9 +2529,10 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
arc_buf_t *buf = NULL; arc_buf_t *buf = NULL;
kmutex_t *hash_lock; kmutex_t *hash_lock;
zio_t *rzio; zio_t *rzio;
uint64_t guid = spa_guid(spa);
top: top:
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
if (hdr && hdr->b_datacnt > 0) { if (hdr && hdr->b_datacnt > 0) {
*arc_flags |= ARC_CACHED; *arc_flags |= ARC_CACHED;
@ -2483,7 +2555,7 @@ top:
acb->acb_private = private; acb->acb_private = private;
if (pio != NULL) if (pio != NULL)
acb->acb_zio_dummy = zio_null(pio, acb->acb_zio_dummy = zio_null(pio,
spa, NULL, NULL, zio_flags); spa, NULL, NULL, NULL, zio_flags);
ASSERT(acb->acb_done != NULL); ASSERT(acb->acb_done != NULL);
acb->acb_next = hdr->b_acb; acb->acb_next = hdr->b_acb;
@ -2535,6 +2607,7 @@ top:
arc_callback_t *acb; arc_callback_t *acb;
vdev_t *vd = NULL; vdev_t *vd = NULL;
daddr_t addr; daddr_t addr;
boolean_t devw = B_FALSE;
if (hdr == NULL) { if (hdr == NULL) {
/* this block is not in the cache */ /* this block is not in the cache */
@ -2613,6 +2686,7 @@ top:
if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
(vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
devw = hdr->b_l2hdr->b_dev->l2ad_writing;
addr = hdr->b_l2hdr->b_daddr; addr = hdr->b_l2hdr->b_daddr;
/* /*
* Lock out device removal. * Lock out device removal.
@ -2632,7 +2706,7 @@ top:
demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
data, metadata, misses); data, metadata, misses);
if (vd != NULL) { if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
/* /*
* Read from the L2ARC if the following are true: * Read from the L2ARC if the following are true:
* 1. The L2ARC vdev was previously cached. * 1. The L2ARC vdev was previously cached.
@ -2640,9 +2714,11 @@ top:
* 3. This buffer isn't currently writing to the L2ARC. * 3. This buffer isn't currently writing to the L2ARC.
* 4. The L2ARC entry wasn't evicted, which may * 4. The L2ARC entry wasn't evicted, which may
* also have invalidated the vdev. * also have invalidated the vdev.
* 5. This is not a prefetch read issued while l2arc_noprefetch is set.
*/ */
if (hdr->b_l2hdr != NULL && if (hdr->b_l2hdr != NULL &&
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
!(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
l2arc_read_callback_t *cb; l2arc_read_callback_t *cb;
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
@ -2668,6 +2744,7 @@ top:
ZIO_FLAG_DONT_RETRY, B_FALSE); ZIO_FLAG_DONT_RETRY, B_FALSE);
DTRACE_PROBE2(l2arc__read, vdev_t *, vd, DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio); zio_t *, rzio);
ARCSTAT_INCR(arcstat_l2_read_bytes, size);
if (*arc_flags & ARC_NOWAIT) { if (*arc_flags & ARC_NOWAIT) {
zio_nowait(rzio); zio_nowait(rzio);
@ -2687,6 +2764,14 @@ top:
ARCSTAT_BUMP(arcstat_l2_rw_clash); ARCSTAT_BUMP(arcstat_l2_rw_clash);
spa_config_exit(spa, SCL_L2ARC, vd); spa_config_exit(spa, SCL_L2ARC, vd);
} }
} else {
if (vd != NULL)
spa_config_exit(spa, SCL_L2ARC, vd);
if (l2arc_ndev != 0) {
DTRACE_PROBE1(l2arc__miss,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_misses);
}
} }
rzio = zio_read(pio, spa, bp, buf->b_data, size, rzio = zio_read(pio, spa, bp, buf->b_data, size,
@ -2712,9 +2797,10 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
{ {
arc_buf_hdr_t *hdr; arc_buf_hdr_t *hdr;
kmutex_t *hash_mtx; kmutex_t *hash_mtx;
uint64_t guid = spa_guid(spa);
int rc = 0; int rc = 0;
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
arc_buf_t *buf = hdr->b_buf; arc_buf_t *buf = hdr->b_buf;
@ -2874,7 +2960,7 @@ arc_release(arc_buf_t *buf, void *tag)
arc_buf_hdr_t *nhdr; arc_buf_hdr_t *nhdr;
arc_buf_t **bufp; arc_buf_t **bufp;
uint64_t blksz = hdr->b_size; uint64_t blksz = hdr->b_size;
spa_t *spa = hdr->b_spa; uint64_t spa = hdr->b_spa;
arc_buf_contents_t type = hdr->b_type; arc_buf_contents_t type = hdr->b_type;
uint32_t flags = hdr->b_flags; uint32_t flags = hdr->b_flags;
@ -3158,12 +3244,13 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
arc_buf_hdr_t *ab; arc_buf_hdr_t *ab;
kmutex_t *hash_lock; kmutex_t *hash_lock;
zio_t *zio; zio_t *zio;
uint64_t guid = spa_guid(spa);
/* /*
* If this buffer is in the cache, release it, so it * If this buffer is in the cache, release it, so it
* can be re-used. * can be re-used.
*/ */
ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
if (ab != NULL) { if (ab != NULL) {
/* /*
* The checksum of blocks to free is not always * The checksum of blocks to free is not always
@ -3387,6 +3474,15 @@ arc_init(void)
if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
arc_c_min = arc_meta_limit / 2; arc_c_min = arc_meta_limit / 2;
if (zfs_arc_grow_retry > 0)
arc_grow_retry = zfs_arc_grow_retry;
if (zfs_arc_shrink_shift > 0)
arc_shrink_shift = zfs_arc_shrink_shift;
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
/* if kmem_flags are set, lets try to use less memory */ /* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging()) if (kmem_debugging())
arc_c = arc_c / 2; arc_c = arc_c / 2;
@ -3625,8 +3721,70 @@ arc_fini(void)
* *
* Tunables may be removed or added as future performance improvements are * Tunables may be removed or added as future performance improvements are
* integrated, and also may become zpool properties. * integrated, and also may become zpool properties.
*
* There are three key functions that control how the L2ARC warms up:
*
* l2arc_write_eligible() check if a buffer is eligible to cache
* l2arc_write_size() calculate how much to write
* l2arc_write_interval() calculate sleep delay between writes
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
*/ */
static boolean_t
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
{
	/*
	 * Decide whether header 'ab' may be written to the L2ARC on behalf
	 * of the pool identified by 'spa_guid'.  Each test below rejects the
	 * header outright; B_TRUE is returned only if none of them fire.
	 */

	/* 1. The header belongs to a different spa. */
	if (ab->b_spa != spa_guid)
		return (B_FALSE);

	/* 2. There is no buffer attached to the header. */
	if (ab->b_buf == NULL)
		return (B_FALSE);

	/* 3. The header is already cached on the L2ARC. */
	if (ab->b_l2hdr != NULL)
		return (B_FALSE);

	/* 4. An I/O is in progress (it may be an incomplete read). */
	if (HDR_IO_IN_PROGRESS(ab))
		return (B_FALSE);

	/* 5. The header is flagged as not eligible (zfs property). */
	if (!HDR_L2CACHE(ab))
		return (B_FALSE);

	return (B_TRUE);
}
static uint64_t
l2arc_write_size(l2arc_dev_t *dev)
{
	/*
	 * The write size is the device's l2ad_write value; while arc_warm
	 * is still B_FALSE the device's l2ad_boost value is added on top.
	 */
	if (arc_warm == B_FALSE)
		return (dev->l2ad_write + dev->l2ad_boost);

	return (dev->l2ad_write);
}
/*
 * Compute the lbolt value at which the next L2ARC write should be issued.
 *
 *	began	lbolt value recorded when the current write pass started
 *	wanted	number of bytes the pass was asked to write
 *	wrote	number of bytes the pass actually wrote
 *
 * Returns a tick value that is never earlier than the current lbolt.
 */
static clock_t
l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
{
	clock_t interval, next, now;

	/*
	 * If the ARC lists are busy, increase our write rate; if the
	 * lists are stale, idle back.  This is achieved by checking
	 * how much we previously wrote - if it was more than half of
	 * what we wanted, schedule the next write much sooner.
	 */
	if (l2arc_feed_again && wrote > (wanted / 2))
		interval = (hz * l2arc_feed_min_ms) / 1000;
	else
		interval = hz * l2arc_feed_secs;

	/*
	 * Take a single snapshot of lbolt.  The expression below uses the
	 * current tick in two places (and MAX/MIN, if they are macros, may
	 * evaluate an argument more than once), so reading lbolt directly
	 * could combine different tick values in one result.
	 */
	now = lbolt;
	next = MAX(now, MIN(now + interval, began + interval));

	return (next);
}
static void static void
l2arc_hdr_stat_add(void) l2arc_hdr_stat_add(void)
{ {
@ -3859,11 +4017,15 @@ l2arc_read_done(zio_t *zio)
* storage now. If there *is* a waiter, the caller must * storage now. If there *is* a waiter, the caller must
* issue the i/o in a context where it's OK to block. * issue the i/o in a context where it's OK to block.
*/ */
if (zio->io_waiter == NULL) if (zio->io_waiter == NULL) {
zio_nowait(zio_read(zio->io_parent, zio_t *pio = zio_unique_parent(zio);
cb->l2rcb_spa, &cb->l2rcb_bp,
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
buf->b_data, zio->io_size, arc_read_done, buf, buf->b_data, zio->io_size, arc_read_done, buf,
zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
}
} }
kmem_free(cb, sizeof (l2arc_read_callback_t)); kmem_free(cb, sizeof (l2arc_read_callback_t));
@ -4047,7 +4209,7 @@ top:
* An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
* for reading until they have completed writing. * for reading until they have completed writing.
*/ */
static void static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{ {
arc_buf_hdr_t *ab, *ab_prev, *head; arc_buf_hdr_t *ab, *ab_prev, *head;
@ -4059,6 +4221,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
boolean_t have_lock, full; boolean_t have_lock, full;
l2arc_write_callback_t *cb; l2arc_write_callback_t *cb;
zio_t *pio, *wzio; zio_t *pio, *wzio;
uint64_t guid = spa_guid(spa);
int try; int try;
ASSERT(dev->l2ad_vdev != NULL); ASSERT(dev->l2ad_vdev != NULL);
@ -4113,20 +4276,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
break; break;
} }
if (ab->b_spa != spa) { if (!l2arc_write_eligible(guid, ab)) {
mutex_exit(hash_lock);
continue;
}
if (ab->b_l2hdr != NULL) {
/*
* Already in L2ARC.
*/
mutex_exit(hash_lock);
continue;
}
if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
mutex_exit(hash_lock); mutex_exit(hash_lock);
continue; continue;
} }
@ -4137,12 +4287,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
break; break;
} }
if (ab->b_buf == NULL) {
DTRACE_PROBE1(l2arc__buf__null, void *, ab);
mutex_exit(hash_lock);
continue;
}
if (pio == NULL) { if (pio == NULL) {
/* /*
* Insert a dummy header on the buflist so * Insert a dummy header on the buflist so
@ -4209,11 +4353,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
if (pio == NULL) { if (pio == NULL) {
ASSERT3U(write_sz, ==, 0); ASSERT3U(write_sz, ==, 0);
kmem_cache_free(hdr_cache, head); kmem_cache_free(hdr_cache, head);
return; return (0);
} }
ASSERT3U(write_sz, <=, target_sz); ASSERT3U(write_sz, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
ARCSTAT_INCR(arcstat_l2_size, write_sz); ARCSTAT_INCR(arcstat_l2_size, write_sz);
spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
@ -4229,7 +4374,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
dev->l2ad_first = B_FALSE; dev->l2ad_first = B_FALSE;
} }
dev->l2ad_writing = B_TRUE;
(void) zio_wait(pio); (void) zio_wait(pio);
dev->l2ad_writing = B_FALSE;
return (write_sz);
} }
/* /*
@ -4242,20 +4391,19 @@ l2arc_feed_thread(void)
callb_cpr_t cpr; callb_cpr_t cpr;
l2arc_dev_t *dev; l2arc_dev_t *dev;
spa_t *spa; spa_t *spa;
uint64_t size; uint64_t size, wrote;
clock_t begin, next = lbolt;
CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
mutex_enter(&l2arc_feed_thr_lock); mutex_enter(&l2arc_feed_thr_lock);
while (l2arc_thread_exit == 0) { while (l2arc_thread_exit == 0) {
/*
* Pause for l2arc_feed_secs seconds between writes.
*/
CALLB_CPR_SAFE_BEGIN(&cpr); CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
lbolt + (hz * l2arc_feed_secs)); next);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
next = lbolt + hz;
/* /*
* Quick check for L2ARC devices. * Quick check for L2ARC devices.
@ -4266,6 +4414,7 @@ l2arc_feed_thread(void)
continue; continue;
} }
mutex_exit(&l2arc_dev_mtx); mutex_exit(&l2arc_dev_mtx);
begin = lbolt;
/* /*
* This selects the next l2arc device to write to, and in * This selects the next l2arc device to write to, and in
@ -4294,9 +4443,7 @@ l2arc_feed_thread(void)
ARCSTAT_BUMP(arcstat_l2_feeds); ARCSTAT_BUMP(arcstat_l2_feeds);
size = dev->l2ad_write; size = l2arc_write_size(dev);
if (arc_warm == B_FALSE)
size += dev->l2ad_boost;
/* /*
* Evict L2ARC buffers that will be overwritten. * Evict L2ARC buffers that will be overwritten.
@ -4306,7 +4453,12 @@ l2arc_feed_thread(void)
/* /*
* Write ARC buffers. * Write ARC buffers.
*/ */
l2arc_write_buffers(spa, dev, size); wrote = l2arc_write_buffers(spa, dev, size);
/*
* Calculate interval between writes.
*/
next = l2arc_write_interval(begin, size, wrote);
spa_config_exit(spa, SCL_L2ARC, dev); spa_config_exit(spa, SCL_L2ARC, dev);
} }
@ -4356,6 +4508,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE; adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
list_link_init(&adddev->l2ad_node); list_link_init(&adddev->l2ad_node);
ASSERT3U(adddev->l2ad_write, >, 0); ASSERT3U(adddev->l2ad_write, >, 0);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -477,7 +477,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
ASSERT3U(bonuslen, <=, db->db.db_size); ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
if (bonuslen < DN_MAX_BONUSLEN) if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN); bzero(db->db.db_data, DN_MAX_BONUSLEN);
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
@ -673,7 +673,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
if (db->db_blkid == DB_BONUS_BLKID) { if (db->db_blkid == DB_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */ /* Note that the data bufs here are zio_bufs */
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size; int size = db->db.db_size;
@ -1350,7 +1350,7 @@ dbuf_clear(dmu_buf_impl_t *db)
ASSERT(db->db.db_data != NULL); ASSERT(db->db.db_data != NULL);
if (db->db_blkid == DB_BONUS_BLKID) { if (db->db_blkid == DB_BONUS_BLKID) {
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
} }
db->db.db_data = NULL; db->db.db_data = NULL;
db->db_state = DB_UNCACHED; db->db_state = DB_UNCACHED;
@ -1472,7 +1472,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db.db_offset = DB_BONUS_BLKID; db->db.db_offset = DB_BONUS_BLKID;
db->db_state = DB_UNCACHED; db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */ /* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t)); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
return (db); return (db);
} else { } else {
int blocksize = int blocksize =
@ -1499,7 +1499,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
list_insert_head(&dn->dn_dbufs, db); list_insert_head(&dn->dn_dbufs, db);
db->db_state = DB_UNCACHED; db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx); mutex_exit(&dn->dn_dbufs_mtx);
arc_space_consume(sizeof (dmu_buf_impl_t)); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
if (parent && parent != dn->dn_dbuf) if (parent && parent != dn->dn_dbuf)
dbuf_add_ref(parent, db); dbuf_add_ref(parent, db);
@ -1568,7 +1568,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT(db->db_data_pending == NULL); ASSERT(db->db_data_pending == NULL);
kmem_cache_free(dbuf_cache, db); kmem_cache_free(dbuf_cache, db);
arc_space_return(sizeof (dmu_buf_impl_t)); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
} }
void void
@ -1996,7 +1996,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
if (*datap != db->db.db_data) { if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN); zio_buf_free(*datap, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
} }
db->db_data_pending = NULL; db->db_data_pending = NULL;
drp = &db->db_last_dirty; drp = &db->db_last_dirty;

View File

@ -1217,6 +1217,39 @@ dmu_objset_find_spa(spa_t *spa, const char *name,
return (err); return (err);
} }
/* ARGSUSED */
int
dmu_objset_prefetch(char *name, void *arg)
{
	dsl_dataset_t *ds;

	/*
	 * Issue a no-wait prefetch read of the named dataset's ds_bp block.
	 * Always returns 0: a dataset that cannot be held is skipped, and
	 * the result of the read itself is ignored.
	 */
	if (dsl_dataset_hold(name, FTAG, &ds) != 0)
		return (0);

	/* Nothing to read when the dataset's block pointer is a hole. */
	if (BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	/* The user-pointer check and the read happen under ds_opening_lock. */
	mutex_enter(&ds->ds_opening_lock);
	if (dsl_dataset_get_user_ptr(ds) == NULL) {
		uint32_t read_flags = ARC_NOWAIT | ARC_PREFETCH;
		zbookmark_t bookmark;

		bookmark.zb_objset = ds->ds_object;
		bookmark.zb_object = 0;
		bookmark.zb_level = -1;
		bookmark.zb_blkid = 0;

		/* No parent zio, no done callback, no private data. */
		(void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds),
		    &ds->ds_phys->ds_bp, NULL, NULL,
		    ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
		    &read_flags, &bookmark);
	}
	mutex_exit(&ds->ds_opening_lock);

	dsl_dataset_rele(ds, FTAG);
	return (0);
}
void void
dmu_objset_set_user(objset_t *os, void *user_ptr) dmu_objset_set_user(objset_t *os, void *user_ptr)
{ {

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -816,10 +816,11 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
/* currently allocated, want to be allocated */ /* currently allocated, want to be allocated */
dmu_tx_hold_bonus(tx, drro->drr_object); dmu_tx_hold_bonus(tx, drro->drr_object);
/* /*
* We may change blocksize, so need to * We may change blocksize and delete old content,
* hold_write * so need to hold_write and hold_free.
*/ */
dmu_tx_hold_write(tx, drro->drr_object, 0, 1); dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
dmu_tx_hold_free(tx, drro->drr_object, 0, DMU_OBJECT_END);
err = dmu_tx_assign(tx, TXG_WAIT); err = dmu_tx_assign(tx, TXG_WAIT);
if (err) { if (err) {
dmu_tx_abort(tx); dmu_tx_abort(tx);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -309,7 +309,7 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
list_insert_head(&os->os_dnodes, dn); list_insert_head(&os->os_dnodes, dn);
mutex_exit(&os->os_lock); mutex_exit(&os->os_lock);
arc_space_consume(sizeof (dnode_t)); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
return (dn); return (dn);
} }
@ -344,7 +344,7 @@ dnode_destroy(dnode_t *dn)
dn->dn_bonus = NULL; dn->dn_bonus = NULL;
} }
kmem_cache_free(dnode_cache, dn); kmem_cache_free(dnode_cache, dn);
arc_space_return(sizeof (dnode_t)); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
} }
void void
@ -424,7 +424,7 @@ void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{ {
int i, old_nblkptr; int i, nblkptr;
dmu_buf_impl_t *db = NULL; dmu_buf_impl_t *db = NULL;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
@ -454,6 +454,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dnode_free_range(dn, 0, -1ULL, tx); dnode_free_range(dn, 0, -1ULL, tx);
} }
nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
/* change blocksize */ /* change blocksize */
rw_enter(&dn->dn_struct_rwlock, RW_WRITER); rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (blocksize != dn->dn_datablksz && if (blocksize != dn->dn_datablksz &&
@ -466,6 +468,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dnode_setdirty(dn, tx); dnode_setdirty(dn, tx);
dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
if (dn->dn_nblkptr != nblkptr)
dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
if (db) if (db)
dbuf_rele(db, FTAG); dbuf_rele(db, FTAG);
@ -475,19 +479,15 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
/* change bonus size and type */ /* change bonus size and type */
mutex_enter(&dn->dn_mtx); mutex_enter(&dn->dn_mtx);
old_nblkptr = dn->dn_nblkptr;
dn->dn_bonustype = bonustype; dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen; dn->dn_bonuslen = bonuslen;
dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); dn->dn_nblkptr = nblkptr;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
dn->dn_compress = ZIO_COMPRESS_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT;
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
/* XXX - for now, we can't make nblkptr smaller */ /* fix up the bonus db_size */
ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr); if (dn->dn_bonus) {
/* fix up the bonus db_size if dn_nblkptr has changed */
if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) {
dn->dn_bonus->db.db_size = dn->dn_bonus->db.db_size =
DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);

View File

@ -19,12 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/dbuf.h> #include <sys/dbuf.h>
#include <sys/dnode.h> #include <sys/dnode.h>
@ -542,18 +540,12 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
/* XXX shouldn't the phys already be zeroed? */ /* XXX shouldn't the phys already be zeroed? */
bzero(dnp, DNODE_CORE_SIZE); bzero(dnp, DNODE_CORE_SIZE);
dnp->dn_nlevels = 1; dnp->dn_nlevels = 1;
dnp->dn_nblkptr = dn->dn_nblkptr;
} }
if (dn->dn_nblkptr > dnp->dn_nblkptr) {
/* zero the new blkptrs we are gaining */
bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
sizeof (blkptr_t) *
(dn->dn_nblkptr - dnp->dn_nblkptr));
}
dnp->dn_type = dn->dn_type; dnp->dn_type = dn->dn_type;
dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonustype = dn->dn_bonustype;
dnp->dn_bonuslen = dn->dn_bonuslen; dnp->dn_bonuslen = dn->dn_bonuslen;
dnp->dn_nblkptr = dn->dn_nblkptr;
} }
ASSERT(dnp->dn_nlevels > 1 || ASSERT(dnp->dn_nlevels > 1 ||
@ -613,6 +605,30 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
return; return;
} }
if (dn->dn_next_nblkptr[txgoff]) {
/* this should only happen on a realloc */
ASSERT(dn->dn_allocated_txg == tx->tx_txg);
if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
/* zero the new blkptrs we are gaining */
bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
sizeof (blkptr_t) *
(dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
#ifdef ZFS_DEBUG
} else {
int i;
ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
/* the blkptrs we are losing better be unallocated */
for (i = dn->dn_next_nblkptr[txgoff];
i < dnp->dn_nblkptr; i++)
ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
#endif
}
mutex_enter(&dn->dn_mtx);
dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
dn->dn_next_nblkptr[txgoff] = 0;
mutex_exit(&dn->dn_mtx);
}
if (dn->dn_next_nlevels[txgoff]) { if (dn->dn_next_nlevels[txgoff]) {
dnode_increase_indirection(dn, tx); dnode_increase_indirection(dn, tx);
dn->dn_next_nlevels[txgoff] = 0; dn->dn_next_nlevels[txgoff] = 0;

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -2208,6 +2208,12 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
err = dsl_dir_open(oldname, FTAG, &dd, &tail); err = dsl_dir_open(oldname, FTAG, &dd, &tail);
if (err) if (err)
return (err); return (err);
/*
* If there are more than 2 references there may be holds
* hanging around that haven't been cleared out yet.
*/
if (dmu_buf_refcount(dd->dd_dbuf) > 2)
txg_wait_synced(dd->dd_pool, 0);
if (tail == NULL) { if (tail == NULL) {
int delta = strlen(newname) - strlen(oldname); int delta = strlen(newname) - strlen(oldname);
@ -3028,12 +3034,8 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
dsl_dataset_t *ds = arg1; dsl_dataset_t *ds = arg1;
uint64_t *reservationp = arg2; uint64_t *reservationp = arg2;
uint64_t new_reservation = *reservationp; uint64_t new_reservation = *reservationp;
int64_t delta;
uint64_t unique; uint64_t unique;
if (new_reservation > INT64_MAX)
return (EOVERFLOW);
if (spa_version(ds->ds_dir->dd_pool->dp_spa) < if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
SPA_VERSION_REFRESERVATION) SPA_VERSION_REFRESERVATION)
return (ENOTSUP); return (ENOTSUP);
@ -3050,15 +3052,18 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
mutex_enter(&ds->ds_lock); mutex_enter(&ds->ds_lock);
unique = dsl_dataset_unique(ds); unique = dsl_dataset_unique(ds);
delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved);
mutex_exit(&ds->ds_lock); mutex_exit(&ds->ds_lock);
if (delta > 0 && if (MAX(unique, new_reservation) > MAX(unique, ds->ds_reserved)) {
delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) uint64_t delta = MAX(unique, new_reservation) -
return (ENOSPC); MAX(unique, ds->ds_reserved);
if (delta > 0 && ds->ds_quota > 0 &&
new_reservation > ds->ds_quota) if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
return (ENOSPC); return (ENOSPC);
if (ds->ds_quota > 0 &&
new_reservation > ds->ds_quota)
return (ENOSPC);
}
return (0); return (0);
} }

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -1078,10 +1078,6 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
uint64_t *reservationp = arg2; uint64_t *reservationp = arg2;
uint64_t new_reservation = *reservationp; uint64_t new_reservation = *reservationp;
uint64_t used, avail; uint64_t used, avail;
int64_t delta;
if (new_reservation > INT64_MAX)
return (EOVERFLOW);
/* /*
* If we are doing the preliminary check in open context, the * If we are doing the preliminary check in open context, the
@ -1092,8 +1088,6 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
mutex_enter(&dd->dd_lock); mutex_enter(&dd->dd_lock);
used = dd->dd_phys->dd_used_bytes; used = dd->dd_phys->dd_used_bytes;
delta = MAX(used, new_reservation) -
MAX(used, dd->dd_phys->dd_reserved);
mutex_exit(&dd->dd_lock); mutex_exit(&dd->dd_lock);
if (dd->dd_parent) { if (dd->dd_parent) {
@ -1103,11 +1097,17 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
} }
if (delta > 0 && delta > avail) if (MAX(used, new_reservation) > MAX(used, dd->dd_phys->dd_reserved)) {
return (ENOSPC); uint64_t delta = MAX(used, new_reservation) -
if (delta > 0 && dd->dd_phys->dd_quota > 0 && MAX(used, dd->dd_phys->dd_reserved);
new_reservation > dd->dd_phys->dd_quota)
return (ENOSPC); if (delta > avail)
return (ENOSPC);
if (dd->dd_phys->dd_quota > 0 &&
new_reservation > dd->dd_phys->dd_quota)
return (ENOSPC);
}
return (0); return (0);
} }

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -95,6 +95,9 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ESC_ZFS_RESILVER_START); ESC_ZFS_RESILVER_START);
dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
tx->tx_txg); tx->tx_txg);
} else {
spa_event_notify(dp->dp_spa, NULL,
ESC_ZFS_SCRUB_START);
} }
/* zero out the scrub stats in all vdev_stat_t's */ /* zero out the scrub stats in all vdev_stat_t's */
@ -212,8 +215,9 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
*/ */
vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
*completep ? dp->dp_scrub_max_txg : 0, B_TRUE); *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
if (dp->dp_scrub_min_txg && *completep) if (*completep)
spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH); spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
spa_errlog_rotate(dp->dp_spa); spa_errlog_rotate(dp->dp_spa);
/* /*

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -68,8 +68,19 @@ typedef enum arc_buf_contents {
#define ARC_CACHED (1 << 4) /* I/O was already in cache */ #define ARC_CACHED (1 << 4) /* I/O was already in cache */
#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ #define ARC_L2CACHE (1 << 5) /* cache in L2ARC */
void arc_space_consume(uint64_t space); /*
void arc_space_return(uint64_t space); * The following breakdowns of arc_size exist for kstat only.
*/
/*
 * Category tag taken as the 'type' argument of arc_space_consume() and
 * arc_space_return().
 */
typedef enum arc_space_type {
ARC_SPACE_DATA,		/* first category (implicit value 0) */
ARC_SPACE_HDRS,		/* NOTE(review): presumably ARC header memory - confirm */
ARC_SPACE_L2HDRS,	/* NOTE(review): presumably L2ARC header memory - confirm */
ARC_SPACE_OTHER,	/* used by the dbuf and dnode callers for their own structures */
ARC_SPACE_NUMTYPES	/* not a category: equals the number of categories above */
} arc_space_type_t;
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
void *arc_data_buf_alloc(uint64_t space); void *arc_data_buf_alloc(uint64_t space);
void arc_data_buf_free(void *buf, uint64_t space); void arc_data_buf_free(void *buf, uint64_t space);
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,

View File

@ -26,8 +26,6 @@
#ifndef _SYS_DMU_OBJSET_H #ifndef _SYS_DMU_OBJSET_H
#define _SYS_DMU_OBJSET_H #define _SYS_DMU_OBJSET_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/arc.h> #include <sys/arc.h>
#include <sys/txg.h> #include <sys/txg.h>
@ -118,6 +116,7 @@ int dmu_objset_find(char *name, int func(char *, void *), void *arg,
int flags); int flags);
int dmu_objset_find_spa(spa_t *spa, const char *name, int dmu_objset_find_spa(spa_t *spa, const char *name,
int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags);
int dmu_objset_prefetch(char *name, void *arg);
void dmu_objset_byteswap(void *buf, size_t size); void dmu_objset_byteswap(void *buf, size_t size);
int dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_evict_dbufs(objset_t *os);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -160,6 +160,7 @@ typedef struct dnode {
uint16_t dn_datablkszsec; /* in 512b sectors */ uint16_t dn_datablkszsec; /* in 512b sectors */
uint32_t dn_datablksz; /* in bytes */ uint32_t dn_datablksz; /* in bytes */
uint64_t dn_maxblkid; uint64_t dn_maxblkid;
uint8_t dn_next_nblkptr[TXG_SIZE];
uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE];
uint16_t dn_next_bonuslen[TXG_SIZE]; uint16_t dn_next_bonuslen[TXG_SIZE];

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -532,6 +532,7 @@ extern void spa_boot_init(void);
extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
/* asynchronous event notification */ /* asynchronous event notification */
extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -182,7 +182,6 @@ typedef struct znode {
vnode_t *z_vnode; vnode_t *z_vnode;
uint64_t z_id; /* object ID for this znode */ uint64_t z_id; /* object ID for this znode */
kmutex_t z_lock; /* znode modification lock */ kmutex_t z_lock; /* znode modification lock */
krwlock_t z_map_lock; /* page map lock */
krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_parent_lock; /* parent lock for directories */
krwlock_t z_name_lock; /* "master" lock for dirent locks */ krwlock_t z_name_lock; /* "master" lock for dirent locks */
zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -265,6 +265,13 @@ typedef int zio_pipe_stage_t(zio_t *zio);
#define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_NOW 0x01
#define ZIO_REEXECUTE_SUSPEND 0x02 #define ZIO_REEXECUTE_SUSPEND 0x02
/*
 * Describes one parent/child relationship between two zios.  It pairs a
 * parent pointer and a child pointer with two list nodes so that a single
 * link object can sit on two lists at the same time.
 * NOTE(review): presumably the two nodes are the linkage for the
 * io_parent_list and io_child_list members of struct zio; which node
 * belongs to which list is set by the list_create() calls, which are not
 * visible here -- confirm in zio.c.
 */
typedef struct zio_link {
zio_t *zl_parent;		/* the parent zio of this relationship */
zio_t *zl_child;		/* the child zio of this relationship */
list_node_t zl_parent_node;	/* list linkage (see NOTE above) */
list_node_t zl_child_node;	/* list linkage (see NOTE above) */
} zio_link_t;
struct zio { struct zio {
/* Core information about this I/O */ /* Core information about this I/O */
zbookmark_t io_bookmark; zbookmark_t io_bookmark;
@ -275,14 +282,14 @@ struct zio {
uint8_t io_priority; uint8_t io_priority;
uint8_t io_reexecute; uint8_t io_reexecute;
uint8_t io_async_root; uint8_t io_async_root;
uint8_t io_state[ZIO_WAIT_TYPES];
uint64_t io_txg; uint64_t io_txg;
spa_t *io_spa; spa_t *io_spa;
blkptr_t *io_bp; blkptr_t *io_bp;
blkptr_t io_bp_copy; blkptr_t io_bp_copy;
zio_t *io_parent; list_t io_parent_list;
zio_t *io_child; list_t io_child_list;
zio_t *io_sibling_prev; zio_link_t *io_walk_link;
zio_t *io_sibling_next;
zio_t *io_logical; zio_t *io_logical;
zio_transform_t *io_transform_stack; zio_transform_t *io_transform_stack;
@ -305,8 +312,6 @@ struct zio {
avl_node_t io_offset_node; avl_node_t io_offset_node;
avl_node_t io_deadline_node; avl_node_t io_deadline_node;
avl_tree_t *io_vdev_tree; avl_tree_t *io_vdev_tree;
zio_t *io_delegate_list;
zio_t *io_delegate_next;
/* Internal pipeline state */ /* Internal pipeline state */
int io_flags; int io_flags;
@ -329,7 +334,7 @@ struct zio {
uint64_t io_ena; uint64_t io_ena;
}; };
extern zio_t *zio_null(zio_t *pio, spa_t *spa, extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
zio_done_func_t *done, void *private, int flags); zio_done_func_t *done, void *private, int flags);
extern zio_t *zio_root(spa_t *spa, extern zio_t *zio_root(spa_t *spa,
@ -379,6 +384,11 @@ extern void zio_nowait(zio_t *zio);
extern void zio_execute(zio_t *zio); extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio); extern void zio_interrupt(zio_t *zio);
extern zio_t *zio_walk_parents(zio_t *cio);
extern zio_t *zio_walk_children(zio_t *pio);
extern zio_t *zio_unique_parent(zio_t *cio);
extern void zio_add_child(zio_t *pio, zio_t *cio);
extern void *zio_buf_alloc(size_t size); extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size); extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size); extern void *zio_data_buf_alloc(size_t size);

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -61,6 +61,10 @@
#include <sys/sunddi.h> #include <sys/sunddi.h>
#include <sys/spa_boot.h> #include <sys/spa_boot.h>
#ifdef _KERNEL
#include <sys/zone.h>
#endif /* _KERNEL */
#include "zfs_prop.h" #include "zfs_prop.h"
#include "zfs_comutil.h" #include "zfs_comutil.h"
@ -111,38 +115,38 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
static void static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{ {
uint64_t size = spa_get_space(spa); uint64_t size;
uint64_t used = spa_get_alloc(spa); uint64_t used;
uint64_t cap, version; uint64_t cap, version;
zprop_source_t src = ZPROP_SRC_NONE; zprop_source_t src = ZPROP_SRC_NONE;
spa_config_dirent_t *dp; spa_config_dirent_t *dp;
ASSERT(MUTEX_HELD(&spa->spa_props_lock)); ASSERT(MUTEX_HELD(&spa->spa_props_lock));
/* if (spa->spa_root_vdev != NULL) {
* readonly properties size = spa_get_space(spa);
*/ used = spa_get_alloc(spa);
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
size - used, src);
cap = (size == 0) ? 0 : (used * 100 / size); cap = (size == 0) ? 0 : (used * 100 / size);
spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
spa->spa_root_vdev->vdev_state, src);
version = spa_version(spa);
if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
src = ZPROP_SRC_DEFAULT;
else
src = ZPROP_SRC_LOCAL;
spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
}
spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
spa->spa_root_vdev->vdev_state, src);
/*
* settable properties that are not stored in the pool property object.
*/
version = spa_version(spa);
if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
src = ZPROP_SRC_DEFAULT;
else
src = ZPROP_SRC_LOCAL;
spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
if (spa->spa_root != NULL) if (spa->spa_root != NULL)
spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
@ -417,16 +421,60 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
return (error); return (error);
} }
/*
 * Apply a 'cachefile' property found in 'nvp' to the pool 'spa'.
 *
 * If 'nvp' carries no string value under the cachefile property name,
 * nothing is done.  Otherwise a new spa_config_dirent_t is allocated and
 * placed at the head of spa->spa_config_list, with its scd_path chosen
 * from the property value:
 *
 *	""	-> a copy of spa_config_path
 *	"none"	-> NULL
 *	other	-> a copy of the value itself
 *
 * When 'need_sync' is set, an SPA_ASYNC_CONFIG_UPDATE request is posted
 * after the entry has been inserted.
 */
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	const char *propname = zpool_prop_to_name(ZPOOL_PROP_CACHEFILE);
	spa_config_dirent_t *dirent;
	char *value;

	/* Nothing to record when no cachefile string was supplied. */
	if (nvlist_lookup_string(nvp, propname, &value) != 0)
		return;

	dirent = kmem_alloc(sizeof (*dirent), KM_SLEEP);

	if (strcmp(value, "none") == 0) {
		dirent->scd_path = NULL;
	} else if (value[0] == '\0') {
		/* An empty value selects the default path. */
		dirent->scd_path = spa_strdup(spa_config_path);
	} else {
		dirent->scd_path = spa_strdup(value);
	}

	list_insert_head(&spa->spa_config_list, dirent);

	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
int int
spa_prop_set(spa_t *spa, nvlist_t *nvp) spa_prop_set(spa_t *spa, nvlist_t *nvp)
{ {
int error; int error;
nvpair_t *elem;
boolean_t need_sync = B_FALSE;
zpool_prop_t prop;
if ((error = spa_prop_validate(spa, nvp)) != 0) if ((error = spa_prop_validate(spa, nvp)) != 0)
return (error); return (error);
return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, elem = NULL;
spa, nvp, 3)); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
if ((prop = zpool_name_to_prop(
nvpair_name(elem))) == ZPROP_INVAL)
return (EINVAL);
if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
continue;
need_sync = B_TRUE;
break;
}
if (need_sync)
return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
spa, nvp, 3));
else
return (0);
} }
/* /*
@ -1187,9 +1235,17 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
VERIFY(nvlist_lookup_string(newconfig, VERIFY(nvlist_lookup_string(newconfig,
ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
#ifdef _KERNEL
myhostid = zone_get_hostid(NULL);
#else /* _KERNEL */
/*
* We're emulating the system's hostid in userland, so
* we can't use zone_get_hostid().
*/
(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
#endif /* _KERNEL */
if (hostid != 0 && myhostid != 0 && if (hostid != 0 && myhostid != 0 &&
(unsigned long)hostid != myhostid) { hostid != myhostid) {
cmn_err(CE_WARN, "pool '%s' could not be " cmn_err(CE_WARN, "pool '%s' could not be "
"loaded as it was last accessed by " "loaded as it was last accessed by "
"another system (host: %s hostid: 0x%lx). " "another system (host: %s hostid: 0x%lx). "
@ -2081,8 +2137,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
if (props) if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
spa_sync_props(spa, props, CRED(), tx); spa_sync_props(spa, props, CRED(), tx);
}
dmu_tx_commit(tx); dmu_tx_commit(tx);
@ -2100,10 +2158,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
mutex_exit(&spa_namespace_lock);
spa->spa_minref = refcount_count(&spa->spa_refcount); spa->spa_minref = refcount_count(&spa->spa_refcount);
mutex_exit(&spa_namespace_lock);
return (0); return (0);
} }
@ -2186,6 +2244,9 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
VDEV_ALLOC_L2CACHE); VDEV_ALLOC_L2CACHE);
spa_config_exit(spa, SCL_ALL, FTAG); spa_config_exit(spa, SCL_ALL, FTAG);
if (props != NULL)
spa_configfile_set(spa, props, B_FALSE);
if (error != 0 || (props && spa_writeable(spa) && if (error != 0 || (props && spa_writeable(spa) &&
(error = spa_prop_set(spa, props)))) { (error = spa_prop_set(spa, props)))) {
if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { if (loaderr != 0 && loaderr != EINVAL && allowfaulted) {
@ -2502,6 +2563,7 @@ spa_tryimport(nvlist_t *tryconfig)
char *poolname; char *poolname;
spa_t *spa; spa_t *spa;
uint64_t state; uint64_t state;
int error;
if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
return (NULL); return (NULL);
@ -2521,7 +2583,7 @@ spa_tryimport(nvlist_t *tryconfig)
* Pass TRUE for mosconfig because the user-supplied config * Pass TRUE for mosconfig because the user-supplied config
* is actually the one to trust when doing an import. * is actually the one to trust when doing an import.
*/ */
(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
/* /*
* If 'tryconfig' was at least parsable, return the current config. * If 'tryconfig' was at least parsable, return the current config.
@ -2540,7 +2602,7 @@ spa_tryimport(nvlist_t *tryconfig)
* copy it out so that external consumers can tell which * copy it out so that external consumers can tell which
* pools are bootable. * pools are bootable.
*/ */
if (spa->spa_bootfs) { if ((!error || error == EEXIST) && spa->spa_bootfs) {
char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
/* /*
@ -3811,7 +3873,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
zpool_prop_t prop; zpool_prop_t prop;
const char *propname; const char *propname;
zprop_type_t proptype; zprop_type_t proptype;
spa_config_dirent_t *dp;
mutex_enter(&spa->spa_props_lock); mutex_enter(&spa->spa_props_lock);
@ -3844,23 +3905,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
case ZPOOL_PROP_CACHEFILE: case ZPOOL_PROP_CACHEFILE:
/* /*
* 'cachefile' is a non-persistent property, but note * 'cachefile' is also a non-persistent property.
* an async request that the config cache needs to be
* updated.
*/ */
VERIFY(nvpair_value_string(elem, &strval) == 0);
dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);
if (strval[0] == '\0')
dp->scd_path = spa_strdup(spa_config_path);
else if (strcmp(strval, "none") == 0)
dp->scd_path = NULL;
else
dp->scd_path = spa_strdup(strval);
list_insert_head(&spa->spa_config_list, dp);
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
break; break;
default: default:
/* /*

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -36,6 +36,7 @@
#include <sys/sunddi.h> #include <sys/sunddi.h>
#ifdef _KERNEL #ifdef _KERNEL
#include <sys/kobj.h> #include <sys/kobj.h>
#include <sys/zone.h>
#endif #endif
/* /*
@ -352,7 +353,15 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
txg) == 0); txg) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
spa_guid(spa)) == 0); spa_guid(spa)) == 0);
#ifdef _KERNEL
hostid = zone_get_hostid(NULL);
#else /* _KERNEL */
/*
* We're emulating the system's hostid in userland, so we can't use
* zone_get_hostid().
*/
(void) ddi_strtoul(hw_serial, NULL, 10, &hostid); (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
#endif /* _KERNEL */
if (hostid != 0) { if (hostid != 0) {
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
hostid) == 0); hostid) == 0);

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -819,23 +819,22 @@ typedef struct vdev_probe_stats {
boolean_t vps_readable; boolean_t vps_readable;
boolean_t vps_writeable; boolean_t vps_writeable;
int vps_flags; int vps_flags;
zio_t *vps_root;
vdev_t *vps_vd;
} vdev_probe_stats_t; } vdev_probe_stats_t;
static void static void
vdev_probe_done(zio_t *zio) vdev_probe_done(zio_t *zio)
{ {
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
vdev_t *vd = zio->io_vd;
vdev_probe_stats_t *vps = zio->io_private; vdev_probe_stats_t *vps = zio->io_private;
vdev_t *vd = vps->vps_vd;
ASSERT(vd->vdev_probe_zio != NULL);
if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_type == ZIO_TYPE_READ) {
ASSERT(zio->io_vd == vd);
if (zio->io_error == 0) if (zio->io_error == 0)
vps->vps_readable = 1; vps->vps_readable = 1;
if (zio->io_error == 0 && spa_writeable(spa)) { if (zio->io_error == 0 && spa_writeable(spa)) {
zio_nowait(zio_write_phys(vps->vps_root, vd, zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
zio->io_offset, zio->io_size, zio->io_data, zio->io_offset, zio->io_size, zio->io_data,
ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
@ -843,13 +842,11 @@ vdev_probe_done(zio_t *zio)
zio_buf_free(zio->io_data, zio->io_size); zio_buf_free(zio->io_data, zio->io_size);
} }
} else if (zio->io_type == ZIO_TYPE_WRITE) { } else if (zio->io_type == ZIO_TYPE_WRITE) {
ASSERT(zio->io_vd == vd);
if (zio->io_error == 0) if (zio->io_error == 0)
vps->vps_writeable = 1; vps->vps_writeable = 1;
zio_buf_free(zio->io_data, zio->io_size); zio_buf_free(zio->io_data, zio->io_size);
} else if (zio->io_type == ZIO_TYPE_NULL) { } else if (zio->io_type == ZIO_TYPE_NULL) {
ASSERT(zio->io_vd == NULL); zio_t *pio;
ASSERT(zio == vps->vps_root);
vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_read |= !vps->vps_readable;
vd->vdev_cant_write |= !vps->vps_writeable; vd->vdev_cant_write |= !vps->vps_writeable;
@ -863,6 +860,16 @@ vdev_probe_done(zio_t *zio)
spa, vd, NULL, 0, 0); spa, vd, NULL, 0, 0);
zio->io_error = ENXIO; zio->io_error = ENXIO;
} }
mutex_enter(&vd->vdev_probe_lock);
ASSERT(vd->vdev_probe_zio == zio);
vd->vdev_probe_zio = NULL;
mutex_exit(&vd->vdev_probe_lock);
while ((pio = zio_walk_parents(zio)) != NULL)
if (!vdev_accessible(vd, pio))
pio->io_error = ENXIO;
kmem_free(vps, sizeof (*vps)); kmem_free(vps, sizeof (*vps));
} }
} }
@ -873,46 +880,79 @@ vdev_probe_done(zio_t *zio)
* but the first (which we leave alone in case it contains a VTOC). * but the first (which we leave alone in case it contains a VTOC).
*/ */
zio_t * zio_t *
vdev_probe(vdev_t *vd, zio_t *pio) vdev_probe(vdev_t *vd, zio_t *zio)
{ {
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
vdev_probe_stats_t *vps; vdev_probe_stats_t *vps = NULL;
zio_t *zio; zio_t *pio;
int l; int l;
vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY;
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
* vdev_cant_read and vdev_cant_write can only transition
* from TRUE to FALSE when we have the SCL_ZIO lock as writer;
* otherwise they can only transition from FALSE to TRUE.
* This ensures that any zio looking at these values can
* assume that failures persist for the life of the I/O.
* That's important because when a device has intermittent
* connectivity problems, we want to ensure that they're
* ascribed to the device (ENXIO) and not the zio (EIO).
*
* Since we hold SCL_ZIO as writer here, clear both values
* so the probe can reevaluate from first principles.
*/
vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
}
ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vd->vdev_ops->vdev_op_leaf);
zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags); /*
* Don't probe the probe.
*/
if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
return (NULL);
vps->vps_root = zio; /*
vps->vps_vd = vd; * To prevent 'probe storms' when a device fails, we create
* just one probe i/o at a time. All zios that want to probe
* this vdev will become parents of the probe io.
*/
mutex_enter(&vd->vdev_probe_lock);
if ((pio = vd->vdev_probe_zio) == NULL) {
vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
ZIO_FLAG_DONT_RETRY;
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
* vdev_cant_read and vdev_cant_write can only
* transition from TRUE to FALSE when we have the
* SCL_ZIO lock as writer; otherwise they can only
* transition from FALSE to TRUE. This ensures that
* any zio looking at these values can assume that
* failures persist for the life of the I/O. That's
* important because when a device has intermittent
* connectivity problems, we want to ensure that
* they're ascribed to the device (ENXIO) and not
* the zio (EIO).
*
* Since we hold SCL_ZIO as writer here, clear both
* values so the probe can reevaluate from first
* principles.
*/
vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
}
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
}
}
if (zio != NULL)
zio_add_child(zio, pio);
mutex_exit(&vd->vdev_probe_lock);
if (vps == NULL) {
ASSERT(zio != NULL);
return (NULL);
}
for (l = 1; l < VDEV_LABELS; l++) { for (l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(zio, vd, zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l, vdev_label_offset(vd->vdev_psize, l,
offsetof(vdev_label_t, vl_pad)), offsetof(vdev_label_t, vl_pad)),
VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE),
@ -920,7 +960,11 @@ vdev_probe(vdev_t *vd, zio_t *pio)
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
} }
return (zio); if (zio == NULL)
return (pio);
zio_nowait(pio);
return (NULL);
} }
/* /*

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -203,23 +203,23 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
* Fill a previously allocated cache entry with data. * Fill a previously allocated cache entry with data.
*/ */
static void static void
vdev_cache_fill(zio_t *zio) vdev_cache_fill(zio_t *fio)
{ {
vdev_t *vd = zio->io_vd; vdev_t *vd = fio->io_vd;
vdev_cache_t *vc = &vd->vdev_cache; vdev_cache_t *vc = &vd->vdev_cache;
vdev_cache_entry_t *ve = zio->io_private; vdev_cache_entry_t *ve = fio->io_private;
zio_t *dio; zio_t *pio;
ASSERT(zio->io_size == VCBS); ASSERT(fio->io_size == VCBS);
/* /*
* Add data to the cache. * Add data to the cache.
*/ */
mutex_enter(&vc->vc_lock); mutex_enter(&vc->vc_lock);
ASSERT(ve->ve_fill_io == zio); ASSERT(ve->ve_fill_io == fio);
ASSERT(ve->ve_offset == zio->io_offset); ASSERT(ve->ve_offset == fio->io_offset);
ASSERT(ve->ve_data == zio->io_data); ASSERT(ve->ve_data == fio->io_data);
ve->ve_fill_io = NULL; ve->ve_fill_io = NULL;
@ -228,20 +228,13 @@ vdev_cache_fill(zio_t *zio)
* any reads that were queued up before the missed update are still * any reads that were queued up before the missed update are still
* valid, so we can satisfy them from this line before we evict it. * valid, so we can satisfy them from this line before we evict it.
*/ */
for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) while ((pio = zio_walk_parents(fio)) != NULL)
vdev_cache_hit(vc, ve, dio); vdev_cache_hit(vc, ve, pio);
if (zio->io_error || ve->ve_missed_update) if (fio->io_error || ve->ve_missed_update)
vdev_cache_evict(vc, ve); vdev_cache_evict(vc, ve);
mutex_exit(&vc->vc_lock); mutex_exit(&vc->vc_lock);
while ((dio = zio->io_delegate_list) != NULL) {
zio->io_delegate_list = dio->io_delegate_next;
dio->io_delegate_next = NULL;
dio->io_error = zio->io_error;
zio_execute(dio);
}
} }
/* /*
@ -283,9 +276,8 @@ vdev_cache_read(zio_t *zio)
} }
if ((fio = ve->ve_fill_io) != NULL) { if ((fio = ve->ve_fill_io) != NULL) {
zio->io_delegate_next = fio->io_delegate_list;
fio->io_delegate_list = zio;
zio_vdev_io_bypass(zio); zio_vdev_io_bypass(zio);
zio_add_child(zio, fio);
mutex_exit(&vc->vc_lock); mutex_exit(&vc->vc_lock);
VDCSTAT_BUMP(vdc_stat_delegations); VDCSTAT_BUMP(vdc_stat_delegations);
return (0); return (0);
@ -295,7 +287,6 @@ vdev_cache_read(zio_t *zio)
zio_vdev_io_bypass(zio); zio_vdev_io_bypass(zio);
mutex_exit(&vc->vc_lock); mutex_exit(&vc->vc_lock);
zio_execute(zio);
VDCSTAT_BUMP(vdc_stat_hits); VDCSTAT_BUMP(vdc_stat_hits);
return (0); return (0);
} }
@ -312,8 +303,8 @@ vdev_cache_read(zio_t *zio)
ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
ve->ve_fill_io = fio; ve->ve_fill_io = fio;
fio->io_delegate_list = zio;
zio_vdev_io_bypass(zio); zio_vdev_io_bypass(zio);
zio_add_child(zio, fio);
mutex_exit(&vc->vc_lock); mutex_exit(&vc->vc_lock);
zio_nowait(fio); zio_nowait(fio);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -968,7 +968,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
KM_SLEEP); KM_SLEEP);
zio_t *vio = zio_null(zio, spa, zio_t *vio = zio_null(zio, spa, NULL,
(vd->vdev_islog || vd->vdev_aux != NULL) ? (vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done, vdev_label_sync_ignore_done : vdev_label_sync_top_done,
good_writes, flags); good_writes, flags);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -180,11 +180,16 @@ vdev_mirror_scrub_done(zio_t *zio)
mirror_child_t *mc = zio->io_private; mirror_child_t *mc = zio->io_private;
if (zio->io_error == 0) { if (zio->io_error == 0) {
zio_t *pio = zio->io_parent; zio_t *pio;
mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size); mutex_enter(&zio->io_lock);
bcopy(zio->io_data, pio->io_data, pio->io_size); while ((pio = zio_walk_parents(zio)) != NULL) {
mutex_exit(&pio->io_lock); mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size);
bcopy(zio->io_data, pio->io_data, pio->io_size);
mutex_exit(&pio->io_lock);
}
mutex_exit(&zio->io_lock);
} }
zio_buf_free(zio->io_data, zio->io_size); zio_buf_free(zio->io_data, zio->io_size);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -149,20 +149,12 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
static void static void
vdev_queue_agg_io_done(zio_t *aio) vdev_queue_agg_io_done(zio_t *aio)
{ {
zio_t *dio; zio_t *pio;
uint64_t offset = 0;
while ((dio = aio->io_delegate_list) != NULL) { while ((pio = zio_walk_parents(aio)) != NULL)
if (aio->io_type == ZIO_TYPE_READ) if (aio->io_type == ZIO_TYPE_READ)
bcopy((char *)aio->io_data + offset, dio->io_data, bcopy((char *)aio->io_data + (pio->io_offset -
dio->io_size); aio->io_offset), pio->io_data, pio->io_size);
offset += dio->io_size;
aio->io_delegate_list = dio->io_delegate_next;
dio->io_delegate_next = NULL;
dio->io_error = aio->io_error;
zio_execute(dio);
}
ASSERT3U(offset, ==, aio->io_size);
zio_buf_free(aio->io_data, aio->io_size); zio_buf_free(aio->io_data, aio->io_size);
} }
@ -173,8 +165,8 @@ vdev_queue_agg_io_done(zio_t *aio)
static zio_t * static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{ {
zio_t *fio, *lio, *aio, *dio; zio_t *fio, *lio, *aio, *dio, *nio;
avl_tree_t *tree; avl_tree_t *t;
uint64_t size; uint64_t size;
int flags; int flags;
@ -186,7 +178,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
fio = lio = avl_first(&vq->vq_deadline_tree); fio = lio = avl_first(&vq->vq_deadline_tree);
tree = fio->io_vdev_tree; t = fio->io_vdev_tree;
size = fio->io_size; size = fio->io_size;
flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
@ -198,55 +190,54 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
* of the I/O, such as whether it's a normal I/O or a * of the I/O, such as whether it's a normal I/O or a
* scrub/resilver, can be preserved in the aggregate. * scrub/resilver, can be preserved in the aggregate.
*/ */
while ((dio = AVL_PREV(tree, fio)) != NULL && while ((dio = AVL_PREV(t, fio)) != NULL &&
IS_ADJACENT(dio, fio) && IS_ADJACENT(dio, fio) &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
size + dio->io_size <= zfs_vdev_aggregation_limit) { size + dio->io_size <= zfs_vdev_aggregation_limit) {
dio->io_delegate_next = fio;
fio = dio; fio = dio;
size += dio->io_size; size += dio->io_size;
} }
while ((dio = AVL_NEXT(tree, lio)) != NULL && while ((dio = AVL_NEXT(t, lio)) != NULL &&
IS_ADJACENT(lio, dio) && IS_ADJACENT(lio, dio) &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
size + dio->io_size <= zfs_vdev_aggregation_limit) { size + dio->io_size <= zfs_vdev_aggregation_limit) {
lio->io_delegate_next = dio;
lio = dio; lio = dio;
size += dio->io_size; size += dio->io_size;
} }
} }
if (fio != lio) { if (fio != lio) {
char *buf = zio_buf_alloc(size);
uint64_t offset = 0;
ASSERT(size <= zfs_vdev_aggregation_limit); ASSERT(size <= zfs_vdev_aggregation_limit);
aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
buf, size, fio->io_type, ZIO_PRIORITY_NOW, zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL); vdev_queue_agg_io_done, NULL);
aio->io_delegate_list = fio; /* We want to process lio, then stop */
lio = AVL_NEXT(t, lio);
for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { for (dio = fio; dio != lio; dio = nio) {
ASSERT(dio->io_type == aio->io_type); ASSERT(dio->io_type == aio->io_type);
ASSERT(dio->io_vdev_tree == tree); ASSERT(dio->io_vdev_tree == t);
if (dio->io_type == ZIO_TYPE_WRITE) if (dio->io_type == ZIO_TYPE_WRITE)
bcopy(dio->io_data, buf + offset, dio->io_size); bcopy(dio->io_data, (char *)aio->io_data +
offset += dio->io_size; (dio->io_offset - aio->io_offset),
dio->io_size);
nio = AVL_NEXT(t, dio);
zio_add_child(dio, aio);
vdev_queue_io_remove(vq, dio); vdev_queue_io_remove(vq, dio);
zio_vdev_io_bypass(dio); zio_vdev_io_bypass(dio);
zio_execute(dio);
} }
ASSERT(offset == size);
avl_add(&vq->vq_pending_tree, aio); avl_add(&vq->vq_pending_tree, aio);
return (aio); return (aio);
} }
ASSERT(fio->io_vdev_tree == tree); ASSERT(fio->io_vdev_tree == t);
vdev_queue_io_remove(vq, fio); vdev_queue_io_remove(vq, fio);
avl_add(&vq->vq_pending_tree, fio); avl_add(&vq->vq_pending_tree, fio);

View File

@ -19,12 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
/* /*
* ZFS control directory (a.k.a. ".zfs") * ZFS control directory (a.k.a. ".zfs")
* *
@ -275,8 +273,13 @@ static int
zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr, zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
caller_context_t *ct) caller_context_t *ct)
{ {
if (mode & VWRITE) if (flags & V_ACE_MASK) {
return (EACCES); if (mode & ACE_ALL_WRITE_PERMS)
return (EACCES);
} else {
if (mode & VWRITE)
return (EACCES);
}
return (0); return (0);
} }
@ -411,6 +414,22 @@ zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
return (err); return (err);
} }
/*
 * VOP_PATHCONF handler installed in the ".zfs" root vnode operations.
 *
 * Only the _PC_ACL_ENABLED query is answered here: it stores
 * _ACL_ACE_ENABLED in *valp and returns 0, so that libsec can display
 * ACLs correctly and not default to POSIX draft.  Every other query is
 * forwarded untouched to fs_pathconf(), whose result is returned as is.
 */
static int
zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	/* Anything that is not the ACL query goes to the generic handler. */
	if (cmd != _PC_ACL_ENABLED)
		return (fs_pathconf(vp, cmd, valp, cr, ct));

	*valp = _ACL_ACE_ENABLED;
	return (0);
}
static const fs_operation_def_t zfsctl_tops_root[] = { static const fs_operation_def_t zfsctl_tops_root[] = {
{ VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
{ VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
@ -421,6 +440,7 @@ static const fs_operation_def_t zfsctl_tops_root[] = {
{ VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } },
{ VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } },
{ VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } },
{ VOPNAME_PATHCONF, { .vop_pathconf = zfsctl_pathconf } },
{ VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } },
{ NULL } { NULL }
}; };

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -90,7 +90,7 @@ typedef struct zfs_ioc_vec {
boolean_t zvec_his_log; boolean_t zvec_his_log;
} zfs_ioc_vec_t; } zfs_ioc_vec_t;
static void clear_props(char *dataset, nvlist_t *props); static void clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops);
static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
boolean_t *); boolean_t *);
int zfs_set_prop_nvlist(const char *, nvlist_t *); int zfs_set_prop_nvlist(const char *, nvlist_t *);
@ -1322,6 +1322,14 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
p = zc->zc_name + strlen(zc->zc_name); p = zc->zc_name + strlen(zc->zc_name);
if (zc->zc_cookie == 0) {
uint64_t cookie = 0;
int len = sizeof (zc->zc_name) - (p - zc->zc_name);
while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0)
dmu_objset_prefetch(p, NULL);
}
do { do {
error = dmu_dir_list_next(os, error = dmu_dir_list_next(os,
sizeof (zc->zc_name) - (p - zc->zc_name), p, sizeof (zc->zc_name) - (p - zc->zc_name), p,
@ -1365,6 +1373,9 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
if (error) if (error)
return (error == ENOENT ? ESRCH : error); return (error == ENOENT ? ESRCH : error);
if (zc->zc_cookie == 0)
dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
NULL, DS_FIND_SNAPSHOTS);
/* /*
* A dataset name of maximum length cannot have any snapshots, * A dataset name of maximum length cannot have any snapshots,
* so exit immediately. * so exit immediately.
@ -1606,7 +1617,7 @@ zfs_ioc_set_prop(zfs_cmd_t *zc)
if (dmu_objset_open(zc->zc_name, DMU_OST_ANY, if (dmu_objset_open(zc->zc_name, DMU_OST_ANY,
DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
if (dsl_prop_get_all(os, &origprops, TRUE) == 0) { if (dsl_prop_get_all(os, &origprops, TRUE) == 0) {
clear_props(zc->zc_name, origprops); clear_props(zc->zc_name, origprops, nvl);
nvlist_free(origprops); nvlist_free(origprops);
} }
dmu_objset_close(os); dmu_objset_close(os);
@ -1640,11 +1651,30 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
nvlist_t *props; nvlist_t *props;
spa_t *spa; spa_t *spa;
int error; int error;
nvpair_t *elem;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
&props))) &props)))
return (error); return (error);
/*
* If the only property is the configfile, then just do a spa_lookup()
* to handle the faulted case.
*/
elem = nvlist_next_nvpair(props, NULL);
if (elem != NULL && strcmp(nvpair_name(elem),
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
nvlist_next_nvpair(props, elem) == NULL) {
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(zc->zc_name)) != NULL) {
spa_configfile_set(spa, props, B_FALSE);
spa_config_sync(spa, B_FALSE, B_TRUE);
}
mutex_exit(&spa_namespace_lock);
if (spa != NULL)
return (0);
}
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
nvlist_free(props); nvlist_free(props);
return (error); return (error);
@ -1665,20 +1695,27 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc)
int error; int error;
nvlist_t *nvp = NULL; nvlist_t *nvp = NULL;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
return (error); /*
* If the pool is faulted, there may be properties we can still
error = spa_prop_get(spa, &nvp); * get (such as altroot and cachefile), so attempt to get them
* anyway.
*/
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(zc->zc_name)) != NULL)
error = spa_prop_get(spa, &nvp);
mutex_exit(&spa_namespace_lock);
} else {
error = spa_prop_get(spa, &nvp);
spa_close(spa, FTAG);
}
if (error == 0 && zc->zc_nvlist_dst != 0) if (error == 0 && zc->zc_nvlist_dst != 0)
error = put_nvlist(zc, nvp); error = put_nvlist(zc, nvp);
else else
error = EFAULT; error = EFAULT;
spa_close(spa, FTAG); nvlist_free(nvp);
if (nvp)
nvlist_free(nvp);
return (error); return (error);
} }
@ -2385,7 +2422,7 @@ zfs_ioc_rename(zfs_cmd_t *zc)
} }
static void static void
clear_props(char *dataset, nvlist_t *props) clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops)
{ {
zfs_cmd_t *zc; zfs_cmd_t *zc;
nvpair_t *prop; nvpair_t *prop;
@ -2396,6 +2433,9 @@ clear_props(char *dataset, nvlist_t *props)
(void) strcpy(zc->zc_name, dataset); (void) strcpy(zc->zc_name, dataset);
for (prop = nvlist_next_nvpair(props, NULL); prop; for (prop = nvlist_next_nvpair(props, NULL); prop;
prop = nvlist_next_nvpair(props, prop)) { prop = nvlist_next_nvpair(props, prop)) {
if (newprops != NULL &&
nvlist_exists(newprops, nvpair_name(prop)))
continue;
(void) strcpy(zc->zc_value, nvpair_name(prop)); (void) strcpy(zc->zc_value, nvpair_name(prop));
if (zfs_secpolicy_inherit(zc, CRED()) == 0) if (zfs_secpolicy_inherit(zc, CRED()) == 0)
(void) zfs_ioc_inherit_prop(zc); (void) zfs_ioc_inherit_prop(zc);
@ -2503,7 +2543,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
* so that the properties are applied to the new data. * so that the properties are applied to the new data.
*/ */
if (props) { if (props) {
clear_props(tofs, origprops); clear_props(tofs, origprops, props);
/* /*
* XXX - Note, this is all-or-nothing; should be best-effort. * XXX - Note, this is all-or-nothing; should be best-effort.
*/ */
@ -2542,7 +2582,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
* On error, restore the original props. * On error, restore the original props.
*/ */
if (error && props) { if (error && props) {
clear_props(tofs, props); clear_props(tofs, props, NULL);
(void) zfs_set_prop_nvlist(tofs, origprops); (void) zfs_set_prop_nvlist(tofs, origprops);
} }
out: out:

View File

@ -19,12 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
/* /*
* This file contains the code to implement file range locking in * This file contains the code to implement file range locking in
* ZFS, although there isn't much specific to ZFS (all that comes to mind * ZFS, although there isn't much specific to ZFS (all that comes to mind
@ -431,6 +429,8 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
new = kmem_alloc(sizeof (rl_t), KM_SLEEP); new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
new->r_zp = zp; new->r_zp = zp;
new->r_off = off; new->r_off = off;
if (len + off < off) /* overflow */
len = UINT64_MAX - off;
new->r_len = len; new->r_len = len;
new->r_cnt = 1; /* assume it's going to be in the tree */ new->r_cnt = 1; /* assume it's going to be in the tree */
new->r_type = type; new->r_type = type;

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -348,56 +348,28 @@ zfs_unmap_page(page_t *pp, caddr_t addr)
* *
* On Write: If we find a memory mapped page, we write to *both* * On Write: If we find a memory mapped page, we write to *both*
* the page and the dmu buffer. * the page and the dmu buffer.
*
* NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
* the file is memory mapped.
*/ */
static int static void
mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{ {
znode_t *zp = VTOZ(vp); int64_t off;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int64_t start, off;
int len = nbytes;
int error = 0;
start = uio->uio_loffset;
off = start & PAGEOFFSET; off = start & PAGEOFFSET;
for (start &= PAGEMASK; len > 0; start += PAGESIZE) { for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
page_t *pp; page_t *pp;
uint64_t bytes = MIN(PAGESIZE - off, len); uint64_t nbytes = MIN(PAGESIZE - off, len);
uint64_t woff = uio->uio_loffset;
/*
* We don't want a new page to "appear" in the middle of
* the file update (because it may not get the write
* update data), so we grab a lock to block
* zfs_getpage().
*/
rw_enter(&zp->z_map_lock, RW_WRITER);
if (pp = page_lookup(vp, start, SE_SHARED)) { if (pp = page_lookup(vp, start, SE_SHARED)) {
caddr_t va; caddr_t va;
rw_exit(&zp->z_map_lock);
va = zfs_map_page(pp, S_WRITE); va = zfs_map_page(pp, S_WRITE);
error = uiomove(va+off, bytes, UIO_WRITE, uio); (void) dmu_read(os, oid, start+off, nbytes, va+off);
if (error == 0) {
dmu_write(zfsvfs->z_os, zp->z_id,
woff, bytes, va+off, tx);
}
zfs_unmap_page(pp, va); zfs_unmap_page(pp, va);
page_unlock(pp); page_unlock(pp);
} else {
error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
uio, bytes, tx);
rw_exit(&zp->z_map_lock);
} }
len -= bytes; len -= nbytes;
off = 0; off = 0;
if (error)
break;
} }
return (error);
} }
/* /*
@ -733,18 +705,13 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
* Perhaps we should use SPA_MAXBLOCKSIZE chunks? * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
*/ */
nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
rw_enter(&zp->z_map_lock, RW_READER);
tx_bytes = uio->uio_resid; tx_bytes = uio->uio_resid;
if (vn_has_cached_data(vp)) { error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, nbytes, tx);
rw_exit(&zp->z_map_lock);
error = mappedwrite(vp, nbytes, uio, tx);
} else {
error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
uio, nbytes, tx);
rw_exit(&zp->z_map_lock);
}
tx_bytes -= uio->uio_resid; tx_bytes -= uio->uio_resid;
if (tx_bytes && vn_has_cached_data(vp))
update_pages(vp, woff,
tx_bytes, zfsvfs->z_os, zp->z_id);
/* /*
* If we made no progress, we're done. If we made even * If we made no progress, we're done. If we made even
@ -3610,9 +3577,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
{ {
znode_t *zp = VTOZ(vp); znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfsvfs_t *zfsvfs = zp->z_zfsvfs;
zilog_t *zilog = zfsvfs->z_log;
dmu_tx_t *tx; dmu_tx_t *tx;
rl_t *rl;
u_offset_t off, koff; u_offset_t off, koff;
size_t len, klen; size_t len, klen;
uint64_t filesz; uint64_t filesz;
@ -3627,26 +3592,18 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
* a read-modify-write). * a read-modify-write).
*/ */
if (off < filesz && zp->z_blksz > PAGESIZE) { if (off < filesz && zp->z_blksz > PAGESIZE) {
if (!ISP2(zp->z_blksz)) { klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
/* Only one block in the file. */ koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
koff = 0;
} else {
klen = zp->z_blksz;
koff = P2ALIGN(off, (u_offset_t)klen);
}
ASSERT(koff <= filesz); ASSERT(koff <= filesz);
if (koff + klen > filesz) if (koff + klen > filesz)
klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE); klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE);
pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
} }
ASSERT3U(btop(len), ==, btopr(len)); ASSERT3U(btop(len), ==, btopr(len));
top:
rl = zfs_range_lock(zp, off, len, RL_WRITER);
/* /*
* Can't push pages past end-of-file. * Can't push pages past end-of-file.
*/ */
filesz = zp->z_phys->zp_size;
if (off >= filesz) { if (off >= filesz) {
/* ignore all pages */ /* ignore all pages */
err = 0; err = 0;
@ -3661,17 +3618,15 @@ top:
pvn_write_done(trunc, flags); pvn_write_done(trunc, flags);
len = filesz - off; len = filesz - off;
} }
top:
tx = dmu_tx_create(zfsvfs->z_os); tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_write(tx, zp->z_id, off, len);
dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_bonus(tx, zp->z_id);
err = dmu_tx_assign(tx, TXG_NOWAIT); err = dmu_tx_assign(tx, TXG_NOWAIT);
if (err != 0) { if (err != 0) {
if (err == ERESTART) { if (err == ERESTART) {
zfs_range_unlock(rl);
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
err = 0;
goto top; goto top;
} }
dmu_tx_abort(tx); dmu_tx_abort(tx);
@ -3689,12 +3644,11 @@ top:
if (err == 0) { if (err == 0) {
zfs_time_stamper(zp, CONTENT_MODIFIED, tx); zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
dmu_tx_commit(tx); dmu_tx_commit(tx);
} }
out: out:
zfs_range_unlock(rl);
pvn_write_done(pp, (err ? B_ERROR : 0) | flags); pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
if (offp) if (offp)
*offp = off; *offp = off;
@ -3731,31 +3685,50 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
page_t *pp; page_t *pp;
size_t io_len; size_t io_len;
u_offset_t io_off; u_offset_t io_off;
uint64_t filesz; uint_t blksz;
rl_t *rl;
int error = 0; int error = 0;
ZFS_ENTER(zfsvfs); ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp); ZFS_VERIFY_ZP(zp);
if (len == 0) { /*
* Align this request to the file block size in case we kluster.
* XXX - this can result in pretty aggressive locking, which can
* impact simultaneous read/write access. One option might be
* to break up long requests (len == 0) into block-by-block
* operations to get narrower locking.
*/
blksz = zp->z_blksz;
if (ISP2(blksz))
io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
else
io_off = 0;
if (len > 0 && ISP2(blksz))
io_len = P2ROUNDUP_TYPED(len + (io_off - off), blksz, size_t);
else
io_len = 0;
if (io_len == 0) {
/* /*
* Search the entire vp list for pages >= off. * Search the entire vp list for pages >= io_off.
*/ */
error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage, rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
flags, cr); error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
goto out; goto out;
} }
rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ if (off > zp->z_phys->zp_size) {
if (off > filesz) {
/* past end of file */ /* past end of file */
zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs); ZFS_EXIT(zfsvfs);
return (0); return (0);
} }
len = MIN(len, filesz - off); len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off);
for (io_off = off; io_off < off + len; io_off += io_len) { for (off = io_off; io_off < off + len; io_off += io_len) {
if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
pp = page_lookup(vp, io_off, pp = page_lookup(vp, io_off,
(flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
@ -3778,6 +3751,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
} }
} }
out: out:
zfs_range_unlock(rl);
if ((flags & B_ASYNC) == 0) if ((flags & B_ASYNC) == 0)
zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id);
ZFS_EXIT(zfsvfs); ZFS_EXIT(zfsvfs);
@ -3894,7 +3868,8 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
/* /*
* If we can't find a page in the cache, we will create a new page * If we can't find a page in the cache, we will create a new page
* and fill it with file data. For efficiency, we may try to fill * and fill it with file data. For efficiency, we may try to fill
* multiple pages at once (klustering). * multiple pages at once (klustering) to fill up the supplied page
* list.
*/ */
static int static int
zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
@ -3903,57 +3878,27 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
znode_t *zp = VTOZ(vp); znode_t *zp = VTOZ(vp);
page_t *pp, *cur_pp; page_t *pp, *cur_pp;
objset_t *os = zp->z_zfsvfs->z_os; objset_t *os = zp->z_zfsvfs->z_os;
caddr_t va;
u_offset_t io_off, total; u_offset_t io_off, total;
uint64_t oid = zp->z_id;
size_t io_len; size_t io_len;
uint64_t filesz;
int err; int err;
/*
* If we are only asking for a single page don't bother klustering.
*/
filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
if (off >= filesz)
return (EFAULT);
if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
/*
* We only have a single page, don't bother klustering
*/
io_off = off; io_off = off;
io_len = PAGESIZE; io_len = PAGESIZE;
pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
} else { } else {
/* /*
* Try to fill a kluster of pages (a blocks worth). * Try to find enough pages to fill the page list
*/ */
size_t klen;
u_offset_t koff;
if (!ISP2(zp->z_blksz)) {
/* Only one block in the file. */
klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
koff = 0;
} else {
/*
* It would be ideal to align our offset to the
* blocksize but doing so has resulted in some
* strange application crashes. For now, we
* leave the offset as is and only adjust the
* length if we are off the end of the file.
*/
koff = off;
klen = plsz;
}
ASSERT(koff <= filesz);
if (koff + klen > filesz)
klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff;
ASSERT3U(off, >=, koff);
ASSERT3U(off, <, koff + klen);
pp = pvn_read_kluster(vp, off, seg, addr, &io_off, pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
&io_len, koff, klen, 0); &io_len, off, plsz, 0);
} }
if (pp == NULL) { if (pp == NULL) {
/* /*
* Some other thread entered the page before us. * The page already exists, nothing to do here.
* Return to zfs_getpage to retry the lookup.
*/ */
*pl = NULL; *pl = NULL;
return (0); return (0);
@ -3964,9 +3909,11 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
*/ */
cur_pp = pp; cur_pp = pp;
for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
caddr_t va;
ASSERT3U(io_off, ==, cur_pp->p_offset); ASSERT3U(io_off, ==, cur_pp->p_offset);
va = zfs_map_page(cur_pp, S_WRITE); va = zfs_map_page(cur_pp, S_WRITE);
err = dmu_read(os, oid, io_off, PAGESIZE, va); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va);
zfs_unmap_page(cur_pp, va); zfs_unmap_page(cur_pp, va);
if (err) { if (err) {
/* On error, toss the entire kluster */ /* On error, toss the entire kluster */
@ -3978,15 +3925,14 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
} }
cur_pp = cur_pp->p_next; cur_pp = cur_pp->p_next;
} }
out:
/* /*
* Fill in the page list array from the kluster. If * Fill in the page list array from the kluster starting
* there are too many pages in the kluster, return * from the desired offset `off'.
* as many pages as possible starting from the desired
* offset `off'.
* NOTE: the page list will always be null terminated. * NOTE: the page list will always be null terminated.
*/ */
pvn_plist_init(pp, pl, plsz, off, io_len, rw); pvn_plist_init(pp, pl, plsz, off, io_len, rw);
ASSERT(pl == NULL || (*pl)->p_offset == off);
return (0); return (0);
} }
@ -3994,10 +3940,10 @@ out:
/* /*
* Return pointers to the pages for the file region [off, off + len] * Return pointers to the pages for the file region [off, off + len]
* in the pl array. If plsz is greater than len, this function may * in the pl array. If plsz is greater than len, this function may
* also return page pointers from before or after the specified * also return page pointers from after the specified region
* region (i.e. some region [off', off' + plsz]). These additional * (i.e. the region [off, off + plsz]). These additional pages are
* pages are only returned if they are already in the cache, or were * only returned if they are already in the cache, or were created as
* created as part of a klustered read. * part of a klustered read.
* *
* IN: vp - vnode of file to get data from. * IN: vp - vnode of file to get data from.
* off - position in file to get data from. * off - position in file to get data from.
@ -4026,9 +3972,17 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
{ {
znode_t *zp = VTOZ(vp); znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfsvfs_t *zfsvfs = zp->z_zfsvfs;
page_t *pp, **pl0 = pl; page_t **pl0 = pl;
int need_unlock = 0, err = 0; int err = 0;
offset_t orig_off;
/* we do our own caching, faultahead is unnecessary */
if (pl == NULL)
return (0);
else if (len > plsz)
len = plsz;
else
len = P2ROUNDUP(len, PAGESIZE);
ASSERT(plsz >= len);
ZFS_ENTER(zfsvfs); ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp); ZFS_VERIFY_ZP(zp);
@ -4036,104 +3990,51 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
if (protp) if (protp)
*protp = PROT_ALL; *protp = PROT_ALL;
/* no faultahead (for now) */
if (pl == NULL) {
ZFS_EXIT(zfsvfs);
return (0);
}
/* can't fault past EOF */
if (off >= zp->z_phys->zp_size) {
ZFS_EXIT(zfsvfs);
return (EFAULT);
}
orig_off = off;
/*
* If we already own the lock, then we must be page faulting
* in the middle of a write to this file (i.e., we are writing
* to this file using data from a mapped region of the file).
*/
if (rw_owner(&zp->z_map_lock) != curthread) {
rw_enter(&zp->z_map_lock, RW_WRITER);
need_unlock = TRUE;
}
/* /*
* Loop through the requested range [off, off + len] looking * Loop through the requested range [off, off + len] looking
* for pages. If we don't find a page, we will need to create * for pages. If we don't find a page, we will need to create
* a new page and fill it with data from the file. * a new page and fill it with data from the file.
*/ */
while (len > 0) { while (len > 0) {
if (plsz < PAGESIZE) if (*pl = page_lookup(vp, off, SE_SHARED))
break; *(pl+1) = NULL;
if (pp = page_lookup(vp, off, SE_SHARED)) { else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
*pl++ = pp; goto out;
while (*pl) {
ASSERT3U((*pl)->p_offset, ==, off);
off += PAGESIZE; off += PAGESIZE;
addr += PAGESIZE; addr += PAGESIZE;
len -= PAGESIZE; if (len > 0) {
ASSERT3U(len, >=, PAGESIZE);
len -= PAGESIZE;
}
ASSERT3U(plsz, >=, PAGESIZE);
plsz -= PAGESIZE; plsz -= PAGESIZE;
} else { pl++;
err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw);
if (err)
goto out;
/*
* klustering may have changed our region
* to be block aligned.
*/
if (((pp = *pl) != 0) && (off != pp->p_offset)) {
int delta = off - pp->p_offset;
len += delta;
off -= delta;
addr -= delta;
}
while (*pl) {
pl++;
off += PAGESIZE;
addr += PAGESIZE;
plsz -= PAGESIZE;
if (len > PAGESIZE)
len -= PAGESIZE;
else
len = 0;
}
} }
} }
/* /*
* Fill out the page array with any pages already in the cache. * Fill out the page array with any pages already in the cache.
*/ */
while (plsz > 0) { while (plsz > 0 &&
pp = page_lookup_nowait(vp, off, SE_SHARED); (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
if (pp == NULL) off += PAGESIZE;
break; plsz -= PAGESIZE;
*pl++ = pp;
off += PAGESIZE;
plsz -= PAGESIZE;
} }
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
out: out:
/*
* We can't grab the range lock for the page as reader which would
* stop truncation as this leads to deadlock. So we need to recheck
* the file size.
*/
if (orig_off >= zp->z_phys->zp_size)
err = EFAULT;
if (err) { if (err) {
/* /*
* Release any pages we have previously locked. * Release any pages we have previously locked.
*/ */
while (pl > pl0) while (pl > pl0)
page_unlock(*--pl); page_unlock(*--pl);
} else {
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
} }
*pl = NULL; *pl = NULL;
if (need_unlock)
rw_exit(&zp->z_map_lock);
ZFS_EXIT(zfsvfs); ZFS_EXIT(zfsvfs);
return (err); return (err);
} }

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -117,7 +117,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
list_link_init(&zp->z_link_node); list_link_init(&zp->z_link_node);
mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
@ -142,7 +141,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
vn_free(ZTOV(zp)); vn_free(ZTOV(zp));
ASSERT(!list_link_active(&zp->z_link_node)); ASSERT(!list_link_active(&zp->z_link_node));
mutex_destroy(&zp->z_lock); mutex_destroy(&zp->z_lock);
rw_destroy(&zp->z_map_lock);
rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_parent_lock);
rw_destroy(&zp->z_name_lock); rw_destroy(&zp->z_name_lock);
mutex_destroy(&zp->z_acl_lock); mutex_destroy(&zp->z_acl_lock);
@ -1375,15 +1373,12 @@ top:
dmu_tx_commit(tx); dmu_tx_commit(tx);
zfs_range_unlock(rl);
/* /*
* Clear any mapped pages in the truncated region. This has to * Clear any mapped pages in the truncated region. This has to
* happen outside of the transaction to avoid the possibility of * happen outside of the transaction to avoid the possibility of
* a deadlock with someone trying to push a page that we are * a deadlock with someone trying to push a page that we are
* about to invalidate. * about to invalidate.
*/ */
rw_enter(&zp->z_map_lock, RW_WRITER);
if (vn_has_cached_data(vp)) { if (vn_has_cached_data(vp)) {
page_t *pp; page_t *pp;
uint64_t start = end & PAGEMASK; uint64_t start = end & PAGEMASK;
@ -1401,7 +1396,8 @@ top:
B_INVAL | B_TRUNC, NULL); B_INVAL | B_TRUNC, NULL);
ASSERT(error == 0); ASSERT(error == 0);
} }
rw_exit(&zp->z_map_lock);
zfs_range_unlock(rl);
return (0); return (0);
} }

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -69,6 +69,7 @@ char *zio_type_name[ZIO_TYPES] = {
* ========================================================================== * ==========================================================================
*/ */
kmem_cache_t *zio_cache; kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
int zio_bulk_flags = 0; int zio_bulk_flags = 0;
@ -93,8 +94,10 @@ zio_init(void)
#ifdef _KERNEL #ifdef _KERNEL
data_alloc_arena = zio_alloc_arena; data_alloc_arena = zio_alloc_arena;
#endif #endif
zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, zio_cache = kmem_cache_create("zio_cache",
NULL, NULL, NULL, NULL, NULL, 0); sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
zio_link_cache = kmem_cache_create("zio_link_cache",
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
/* /*
* For small buffers, we want a cache for each multiple of * For small buffers, we want a cache for each multiple of
@ -166,6 +169,7 @@ zio_fini(void)
zio_data_buf_cache[c] = NULL; zio_data_buf_cache[c] = NULL;
} }
kmem_cache_destroy(zio_link_cache);
kmem_cache_destroy(zio_cache); kmem_cache_destroy(zio_cache);
zio_inject_fini(); zio_inject_fini();
@ -300,41 +304,102 @@ zio_decompress(zio_t *zio, void *data, uint64_t size)
* I/O parent/child relationships and pipeline interlocks * I/O parent/child relationships and pipeline interlocks
* ========================================================================== * ==========================================================================
*/ */
/*
static void * NOTE - Callers to zio_walk_parents() and zio_walk_children must
zio_add_child(zio_t *pio, zio_t *zio) * continue calling these functions until they return NULL.
 * Otherwise, the next caller will pick up the list walk in
 * some indeterminate state. (The alternative would be for every
 * caller to pass in a cookie holding the state that io_walk_link
 * represents, which gets annoying.)
*/
/*
 * Step cio's parent-list walk forward by one link and return the parent
 * zio found there, or NULL when the walk has passed the last link.
 *
 * The walk position is kept in cio->io_walk_link.  A NULL position means
 * "not started", so the walk begins at the head of io_parent_list.  When
 * the end of the list is reached, NULL is stored back, which rewinds the
 * walk for the next caller.
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	list_t *parents = &cio->io_parent_list;
	zio_link_t *cursor = cio->io_walk_link;
	zio_link_t *link;

	if (cursor == NULL)
		link = list_head(parents);
	else
		link = list_next(parents, cursor);

	/* Remember where we are (or rewind, if we fell off the end). */
	cio->io_walk_link = link;

	if (link != NULL) {
		/* Every link on a zio's parent list must name it as child. */
		ASSERT(link->zl_child == cio);
		return (link->zl_parent);
	}

	return (NULL);
}
/*
 * Step pio's child-list walk forward by one link and return the child
 * zio found there, or NULL when the walk has passed the last link.
 *
 * The walk position is kept in pio->io_walk_link.  A NULL position means
 * "not started", so the walk begins at the head of io_child_list.  When
 * the end of the list is reached, NULL is stored back, which rewinds the
 * walk for the next caller.
 */
zio_t *
zio_walk_children(zio_t *pio)
{
	list_t *children = &pio->io_child_list;
	zio_link_t *cursor = pio->io_walk_link;
	zio_link_t *link;

	if (cursor == NULL)
		link = list_head(children);
	else
		link = list_next(children, cursor);

	/* Remember where we are (or rewind, if we fell off the end). */
	pio->io_walk_link = link;

	if (link != NULL) {
		/* Every link on a zio's child list must name it as parent. */
		ASSERT(link->zl_parent == pio);
		return (link->zl_child);
	}

	return (NULL);
}
/*
 * Return the one parent of cio, or NULL if cio has no parent.
 *
 * The first walk step yields the parent (if any).  The second step is
 * required to return NULL: it checks that there is no further parent,
 * and it also runs the walk to its end so that io_walk_link is left
 * rewound.  The second call is deliberately made inside VERIFY rather
 * than ASSERT -- presumably so that it is executed in every build;
 * confirm against the VERIFY definition.
 */
zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *parent;

	parent = zio_walk_parents(cio);
	VERIFY(zio_walk_parents(cio) == NULL);

	return (parent);
}
void
zio_add_child(zio_t *pio, zio_t *cio)
{
zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
/*
* Logical I/Os can have logical, gang, or vdev children.
* Gang I/Os can have gang or vdev children.
* Vdev I/Os can only have vdev children.
* The following ASSERT captures all of these constraints.
*/
ASSERT(cio->io_child_type <= pio->io_child_type);
zl->zl_parent = pio;
zl->zl_child = cio;
mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock); mutex_enter(&pio->io_lock);
if (zio->io_stage < ZIO_STAGE_READY)
pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
if (zio->io_stage < ZIO_STAGE_DONE)
pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; for (int w = 0; w < ZIO_WAIT_TYPES; w++)
zio->io_sibling_prev = NULL; pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
zio->io_sibling_next = pio->io_child;
if (pio->io_child != NULL) list_insert_head(&pio->io_child_list, zl);
pio->io_child->io_sibling_prev = zio; list_insert_head(&cio->io_parent_list, zl);
pio->io_child = zio;
zio->io_parent = pio;
mutex_exit(&pio->io_lock); mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
} }
static void static void
zio_remove_child(zio_t *pio, zio_t *zio) zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{ {
zio_t *next, *prev; ASSERT(zl->zl_parent == pio);
ASSERT(zl->zl_child == cio);
ASSERT(zio->io_parent == pio);
mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock); mutex_enter(&pio->io_lock);
next = zio->io_sibling_next;
prev = zio->io_sibling_prev; list_remove(&pio->io_child_list, zl);
if (next != NULL) list_remove(&cio->io_parent_list, zl);
next->io_sibling_prev = prev;
if (prev != NULL)
prev->io_sibling_next = next;
if (pio->io_child == zio)
pio->io_child = next;
mutex_exit(&pio->io_lock); mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
kmem_cache_free(zio_link_cache, zl);
} }
static boolean_t static boolean_t
@ -409,6 +474,11 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
list_create(&zio->io_parent_list, sizeof (zio_link_t),
offsetof(zio_link_t, zl_parent_node));
list_create(&zio->io_child_list, sizeof (zio_link_t),
offsetof(zio_link_t, zl_child_node));
if (vd != NULL) if (vd != NULL)
zio->io_child_type = ZIO_CHILD_VDEV; zio->io_child_type = ZIO_CHILD_VDEV;
else if (flags & ZIO_FLAG_GANG_CHILD) else if (flags & ZIO_FLAG_GANG_CHILD)
@ -443,17 +513,13 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_orig_pipeline = zio->io_pipeline = pipeline;
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
if (zb != NULL) if (zb != NULL)
zio->io_bookmark = *zb; zio->io_bookmark = *zb;
if (pio != NULL) { if (pio != NULL) {
/*
* Logical I/Os can have logical, gang, or vdev children.
* Gang I/Os can have gang or vdev children.
* Vdev I/Os can only have vdev children.
* The following ASSERT captures all of these constraints.
*/
ASSERT(zio->io_child_type <= pio->io_child_type);
if (zio->io_logical == NULL) if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical; zio->io_logical = pio->io_logical;
zio_add_child(pio, zio); zio_add_child(pio, zio);
@ -468,6 +534,8 @@ zio_destroy(zio_t *zio)
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
uint8_t async_root = zio->io_async_root; uint8_t async_root = zio->io_async_root;
list_destroy(&zio->io_parent_list);
list_destroy(&zio->io_child_list);
mutex_destroy(&zio->io_lock); mutex_destroy(&zio->io_lock);
cv_destroy(&zio->io_cv); cv_destroy(&zio->io_cv);
kmem_cache_free(zio_cache, zio); kmem_cache_free(zio_cache, zio);
@ -481,13 +549,13 @@ zio_destroy(zio_t *zio)
} }
zio_t * zio_t *
zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
int flags) void *private, int flags)
{ {
zio_t *zio; zio_t *zio;
zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
return (zio); return (zio);
@ -496,7 +564,7 @@ zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
zio_t * zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{ {
return (zio_null(NULL, spa, done, private, flags)); return (zio_null(NULL, spa, NULL, done, private, flags));
} }
zio_t * zio_t *
@ -575,12 +643,12 @@ zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_HOLE(bp));
if (bp->blk_fill == BLK_FILL_ALREADY_FREED) if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
return (zio_null(pio, spa, NULL, NULL, flags)); return (zio_null(pio, spa, NULL, NULL, NULL, flags));
if (txg == spa->spa_syncing_txg && if (txg == spa->spa_syncing_txg &&
spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) { spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
return (zio_null(pio, spa, NULL, NULL, flags)); return (zio_null(pio, spa, NULL, NULL, NULL, flags));
} }
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
@ -631,7 +699,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio->io_cmd = cmd; zio->io_cmd = cmd;
} else { } else {
zio = zio_null(pio, spa, NULL, NULL, flags); zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
for (c = 0; c < vd->vdev_children; c++) for (c = 0; c < vd->vdev_children; c++)
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
@ -1023,11 +1091,12 @@ zio_nowait(zio_t *zio)
{ {
ASSERT(zio->io_executor == NULL); ASSERT(zio->io_executor == NULL);
if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) { if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
zio_unique_parent(zio) == NULL) {
/* /*
* This is a logical async I/O with no parent to wait for it. * This is a logical async I/O with no parent to wait for it.
* Attach it to the pool's global async root zio so that * Track how many outstanding I/Os of this type exist so
* spa_unload() has a way of waiting for async I/O to finish. * that spa_unload() knows when they are all done.
*/ */
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
zio->io_async_root = B_TRUE; zio->io_async_root = B_TRUE;
@ -1048,14 +1117,19 @@ zio_nowait(zio_t *zio)
static void static void
zio_reexecute(zio_t *pio) zio_reexecute(zio_t *pio)
{ {
zio_t *zio, *zio_next; zio_t *cio, *cio_next;
int c; int c, w;
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
pio->io_flags = pio->io_orig_flags; pio->io_flags = pio->io_orig_flags;
pio->io_stage = pio->io_orig_stage; pio->io_stage = pio->io_orig_stage;
pio->io_pipeline = pio->io_orig_pipeline; pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0; pio->io_reexecute = 0;
pio->io_error = 0; pio->io_error = 0;
for (w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
for (c = 0; c < ZIO_CHILD_TYPES; c++) for (c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0; pio->io_child_error[c] = 0;
@ -1075,18 +1149,18 @@ zio_reexecute(zio_t *pio)
/* /*
* As we reexecute pio's children, new children could be created. * As we reexecute pio's children, new children could be created.
* New children go to the head of the io_child list, however, * New children go to the head of pio's io_child_list, however,
* so we will (correctly) not reexecute them. The key is that * so we will (correctly) not reexecute them. The key is that
* the remainder of the io_child list, from 'zio_next' onward, * the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'zio'. * cannot be affected by any side effects of reexecuting 'cio'.
*/ */
for (zio = pio->io_child; zio != NULL; zio = zio_next) { for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
zio_next = zio->io_sibling_next; cio_next = zio_walk_children(pio);
mutex_enter(&pio->io_lock); mutex_enter(&pio->io_lock);
pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; pio->io_children[cio->io_child_type][w]++;
mutex_exit(&pio->io_lock); mutex_exit(&pio->io_lock);
zio_reexecute(zio); zio_reexecute(cio);
} }
/* /*
@ -1115,7 +1189,7 @@ zio_suspend(spa_t *spa, zio_t *zio)
if (zio != NULL) { if (zio != NULL) {
ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio != spa->spa_suspend_zio_root);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(zio->io_parent == NULL); ASSERT(zio_unique_parent(zio) == NULL);
ASSERT(zio->io_stage == ZIO_STAGE_DONE); ASSERT(zio->io_stage == ZIO_STAGE_DONE);
zio_add_child(spa->spa_suspend_zio_root, zio); zio_add_child(spa->spa_suspend_zio_root, zio);
} }
@ -1126,7 +1200,7 @@ zio_suspend(spa_t *spa, zio_t *zio)
void void
zio_resume(spa_t *spa) zio_resume(spa_t *spa)
{ {
zio_t *pio, *zio; zio_t *pio, *cio, *cio_next;
/* /*
* Reexecute all previously suspended i/o. * Reexecute all previously suspended i/o.
@ -1141,10 +1215,11 @@ zio_resume(spa_t *spa)
if (pio == NULL) if (pio == NULL)
return; return;
while ((zio = pio->io_child) != NULL) { for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
zio_remove_child(pio, zio); zio_link_t *zl = pio->io_walk_link;
zio->io_parent = NULL; cio_next = zio_walk_children(pio);
zio_reexecute(zio); zio_remove_child(pio, cio, zl);
zio_reexecute(cio);
} }
ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0); ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0);
@ -1358,10 +1433,11 @@ zio_gang_tree_assemble_done(zio_t *zio)
zio_t *lio = zio->io_logical; zio_t *lio = zio->io_logical;
zio_gang_node_t *gn = zio->io_private; zio_gang_node_t *gn = zio->io_private;
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
zio_t *pio = zio_unique_parent(zio);
int g; int g;
ASSERT(zio->io_parent == lio); ASSERT(pio == lio);
ASSERT(zio->io_child == NULL); ASSERT(zio_walk_children(zio) == NULL);
if (zio->io_error) if (zio->io_error)
return; return;
@ -1453,7 +1529,7 @@ zio_gang_issue(zio_t *zio)
static void static void
zio_write_gang_member_ready(zio_t *zio) zio_write_gang_member_ready(zio_t *zio)
{ {
zio_t *pio = zio->io_parent; zio_t *pio = zio_unique_parent(zio);
zio_t *lio = zio->io_logical; zio_t *lio = zio->io_logical;
dva_t *cdva = zio->io_bp->blk_dva; dva_t *cdva = zio->io_bp->blk_dva;
dva_t *pdva = pio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva;
@ -1700,72 +1776,6 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
* Read and write to physical devices * Read and write to physical devices
* ========================================================================== * ==========================================================================
*/ */
/*
 * Done callback for a vdev's probe zio.  zio->io_private is the vdev
 * that was probed.
 *
 * First detach the probe zio from the vdev (under vdev_probe_lock), then
 * pop each delegated zio off the probe's io_delegate_list, mark it with
 * ENXIO if the vdev is not accessible for it, and resume its execution.
 */
static void
zio_vdev_io_probe_done(zio_t *zio)
{
	vdev_t *vd = zio->io_private;
	zio_t *waiter;

	mutex_enter(&vd->vdev_probe_lock);
	ASSERT(vd->vdev_probe_zio == zio);
	vd->vdev_probe_zio = NULL;
	mutex_exit(&vd->vdev_probe_lock);

	/*
	 * Always re-read the list head: each delegate is unlinked before
	 * it is executed.
	 */
	for (waiter = zio->io_delegate_list; waiter != NULL;
	    waiter = zio->io_delegate_list) {
		zio->io_delegate_list = waiter->io_delegate_next;
		waiter->io_delegate_next = NULL;

		if (!vdev_accessible(vd, waiter))
			waiter->io_error = ENXIO;

		zio_execute(waiter);
	}
}
/*
* Probe the device to determine whether I/O failure is specific to this
* zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged).
*/
/*
 * Park 'zio' behind a probe of its vdev.
 *
 * Returns ZIO_PIPELINE_CONTINUE without doing anything if zio is itself
 * a probe i/o (ZIO_FLAG_PROBE); otherwise returns ZIO_PIPELINE_STOP after
 * queueing zio on the probe zio's io_delegate_list.
 */
static int
zio_vdev_io_probe(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	zio_t *probe;
	boolean_t is_creator = B_FALSE;

	/* A probe i/o must never trigger another probe. */
	if (zio->io_flags & ZIO_FLAG_PROBE)
		return (ZIO_PIPELINE_CONTINUE);

	/*
	 * Only one probe zio exists per vdev at a time, so that a failing
	 * device does not cause a storm of probes.  Whoever finds no probe
	 * in flight creates it; everyone (creator included) then joins its
	 * io_delegate_list.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	probe = vd->vdev_probe_zio;
	if (probe == NULL) {
		probe = zio_root(zio->io_spa,
		    zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL);
		vd->vdev_probe_zio = probe;
		is_creator = B_TRUE;
		vd->vdev_probe_wanted = B_TRUE;
		spa_async_request(zio->io_spa, SPA_ASYNC_PROBE);
	}

	/* Push ourselves onto the head of the probe's delegate list. */
	zio->io_delegate_next = probe->io_delegate_list;
	probe->io_delegate_list = zio;

	mutex_exit(&vd->vdev_probe_lock);

	/* Only the creator issues the probe, and only after dropping the lock. */
	if (is_creator) {
		zio_nowait(vdev_probe(vd, probe));
		zio_nowait(probe);
	}

	return (ZIO_PIPELINE_STOP);
}
static int static int
zio_vdev_io_start(zio_t *zio) zio_vdev_io_start(zio_t *zio)
{ {
@ -1821,7 +1831,6 @@ zio_vdev_io_start(zio_t *zio)
zio->io_txg != 0 && /* not a delegated i/o */ zio->io_txg != 0 && /* not a delegated i/o */
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_delegate_list == NULL);
zio_vdev_io_bypass(zio); zio_vdev_io_bypass(zio);
return (ZIO_PIPELINE_CONTINUE); return (ZIO_PIPELINE_CONTINUE);
} }
@ -1830,7 +1839,7 @@ zio_vdev_io_start(zio_t *zio)
(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
return (ZIO_PIPELINE_STOP); return (ZIO_PIPELINE_CONTINUE);
if ((zio = vdev_queue_io(zio)) == NULL) if ((zio = vdev_queue_io(zio)) == NULL)
return (ZIO_PIPELINE_STOP); return (ZIO_PIPELINE_STOP);
@ -1882,7 +1891,7 @@ zio_vdev_io_done(zio_t *zio)
ops->vdev_op_io_done(zio); ops->vdev_op_io_done(zio);
if (unexpected_error) if (unexpected_error)
return (zio_vdev_io_probe(zio)); VERIFY(vdev_probe(vd, zio) == NULL);
return (ZIO_PIPELINE_CONTINUE); return (ZIO_PIPELINE_CONTINUE);
} }
@ -2078,7 +2087,7 @@ static int
zio_ready(zio_t *zio) zio_ready(zio_t *zio)
{ {
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
zio_t *pio = zio->io_parent; zio_t *pio, *pio_next;
if (zio->io_ready) { if (zio->io_ready) {
if (BP_IS_GANG(bp) && if (BP_IS_GANG(bp) &&
@ -2098,8 +2107,22 @@ zio_ready(zio_t *zio)
if (zio->io_error) if (zio->io_error)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (pio != NULL) mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_READY] = 1;
pio = zio_walk_parents(zio);
mutex_exit(&zio->io_lock);
/*
* As we notify zio's parents, new parents could be added.
* New parents go to the head of zio's io_parent_list, however,
* so we will (correctly) not notify them. The remainder of zio's
* io_parent_list, from 'pio_next' onward, cannot change because
* all parents must wait for us to be done before they can be done.
*/
for (; pio != NULL; pio = pio_next) {
pio_next = zio_walk_parents(zio);
zio_notify_parent(pio, zio, ZIO_WAIT_READY); zio_notify_parent(pio, zio, ZIO_WAIT_READY);
}
return (ZIO_PIPELINE_CONTINUE); return (ZIO_PIPELINE_CONTINUE);
} }
@ -2108,11 +2131,11 @@ static int
zio_done(zio_t *zio) zio_done(zio_t *zio)
{ {
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
zio_t *pio = zio->io_parent;
zio_t *lio = zio->io_logical; zio_t *lio = zio->io_logical;
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
vdev_t *vd = zio->io_vd; vdev_t *vd = zio->io_vd;
uint64_t psize = zio->io_size; uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
int c, w; int c, w;
/* /*
@ -2133,7 +2156,7 @@ zio_done(zio_t *zio)
ASSERT(bp->blk_pad[1] == 0); ASSERT(bp->blk_pad[1] == 0);
ASSERT(bp->blk_pad[2] == 0); ASSERT(bp->blk_pad[2] == 0);
ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
(pio != NULL && bp == pio->io_bp)); (bp == zio_unique_parent(zio)->io_bp));
if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT(!BP_SHOULD_BYTESWAP(bp));
@ -2228,7 +2251,11 @@ zio_done(zio_t *zio)
zio_gang_tree_free(&zio->io_gang_tree); zio_gang_tree_free(&zio->io_gang_tree);
if (pio != NULL) { mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
if ((pio = zio_unique_parent(zio)) != NULL) {
/* /*
* We're not a root i/o, so there's nothing to do * We're not a root i/o, so there's nothing to do
* but notify our parent. Don't propagate errors * but notify our parent. Don't propagate errors
@ -2254,20 +2281,28 @@ zio_done(zio_t *zio)
return (ZIO_PIPELINE_STOP); return (ZIO_PIPELINE_STOP);
} }
ASSERT(zio->io_child == NULL); ASSERT(zio_walk_children(zio) == NULL);
ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_reexecute == 0);
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
/*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as
* such, cannot acquire any new parents.
*/
if (zio->io_done) if (zio->io_done)
zio->io_done(zio); zio->io_done(zio);
zio_gang_tree_free(&zio->io_gang_tree); zio_gang_tree_free(&zio->io_gang_tree);
ASSERT(zio->io_delegate_list == NULL); mutex_enter(&zio->io_lock);
ASSERT(zio->io_delegate_next == NULL); zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
if (pio != NULL) { for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
zio_remove_child(pio, zio); zio_link_t *zl = zio->io_walk_link;
pio_next = zio_walk_parents(zio);
zio_remove_child(pio, zio, zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE); zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
} }

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -1324,6 +1324,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
break; break;
} }
zfs_range_unlock(rl); zfs_range_unlock(rl);
if (!zil_disable)
zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
return (error); return (error);
} }