Merge commit 'refs/top-bases/linux-arc' into linux-arc

This commit is contained in:
Brian Behlendorf 2009-01-15 14:16:25 -08:00
commit 8684f3ea64
49 changed files with 1591 additions and 848 deletions

View File

@ -1 +1 @@
http://dlc.sun.com/osol/on/downloads/b103/on-src.tar.bz2 http://dlc.sun.com/osol/on/downloads/b105/on-src.tar.bz2

View File

@ -86,8 +86,8 @@ static void
usage(void) usage(void)
{ {
(void) fprintf(stderr, (void) fprintf(stderr,
"Usage: %s [-udibcsv] [-U cachefile_path] " "Usage: %s [-udibcsvL] [-U cachefile_path] [-t txg]\n"
"[-S user:cksumalg] " "\t [-S user:cksumalg] "
"dataset [object...]\n" "dataset [object...]\n"
" %s -C [pool]\n" " %s -C [pool]\n"
" %s -l dev\n" " %s -l dev\n"
@ -107,6 +107,8 @@ usage(void)
"dump blkptr signatures\n"); "dump blkptr signatures\n");
(void) fprintf(stderr, " -v verbose (applies to all others)\n"); (void) fprintf(stderr, " -v verbose (applies to all others)\n");
(void) fprintf(stderr, " -l dump label contents\n"); (void) fprintf(stderr, " -l dump label contents\n");
(void) fprintf(stderr, " -L disable leak tracking (do not "
"load spacemaps)\n");
(void) fprintf(stderr, " -U cachefile_path -- use alternate " (void) fprintf(stderr, " -U cachefile_path -- use alternate "
"cachefile\n"); "cachefile\n");
(void) fprintf(stderr, " -R read and display block from a " (void) fprintf(stderr, " -R read and display block from a "
@ -114,6 +116,8 @@ usage(void)
(void) fprintf(stderr, " -e Pool is exported/destroyed/" (void) fprintf(stderr, " -e Pool is exported/destroyed/"
"has altroot\n"); "has altroot\n");
(void) fprintf(stderr, " -p <Path to vdev dir> (use with -e)\n"); (void) fprintf(stderr, " -p <Path to vdev dir> (use with -e)\n");
(void) fprintf(stderr, " -t <txg> highest txg to use when "
"searching for uberblocks\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
"to make only that option verbose\n"); "to make only that option verbose\n");
(void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
@ -515,45 +519,53 @@ dump_metaslabs(spa_t *spa)
} }
} }
static void
dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
{
char *prefix = (void *)sm;
(void) printf("%s [%llu,%llu) length %llu\n",
prefix,
(u_longlong_t)start,
(u_longlong_t)(start + size),
(u_longlong_t)(size));
}
static void static void
dump_dtl(vdev_t *vd, int indent) dump_dtl(vdev_t *vd, int indent)
{ {
avl_tree_t *t = &vd->vdev_dtl_map.sm_root; spa_t *spa = vd->vdev_spa;
space_seg_t *ss; boolean_t required;
vdev_t *pvd; char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
int c; char prefix[256];
spa_vdev_state_enter(spa);
required = vdev_dtl_required(vd);
(void) spa_vdev_state_exit(spa, NULL, 0);
if (indent == 0) if (indent == 0)
(void) printf("\nDirty time logs:\n\n"); (void) printf("\nDirty time logs:\n\n");
(void) printf("\t%*s%s\n", indent, "", (void) printf("\t%*s%s [%s]\n", indent, "",
vd->vdev_path ? vd->vdev_path : vd->vdev_path ? vd->vdev_path :
vd->vdev_parent ? vd->vdev_ops->vdev_op_type : vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
spa_name(vd->vdev_spa)); required ? "DTL-required" : "DTL-expendable");
for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) { for (int t = 0; t < DTL_TYPES; t++) {
/* space_map_t *sm = &vd->vdev_dtl[t];
* Everything in this DTL must appear in all parent DTL unions. if (sm->sm_space == 0)
*/ continue;
for (pvd = vd; pvd; pvd = pvd->vdev_parent) (void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
ASSERT(vdev_dtl_contains(&pvd->vdev_dtl_map, indent + 2, "", name[t]);
ss->ss_start, ss->ss_end - ss->ss_start)); mutex_enter(sm->sm_lock);
(void) printf("\t%*soutage [%llu,%llu] length %llu\n", space_map_walk(sm, dump_dtl_seg, (void *)prefix);
indent, "", mutex_exit(sm->sm_lock);
(u_longlong_t)ss->ss_start, if (dump_opt['d'] > 5 && vd->vdev_children == 0)
(u_longlong_t)ss->ss_end - 1, dump_spacemap(spa->spa_meta_objset,
(u_longlong_t)(ss->ss_end - ss->ss_start)); &vd->vdev_dtl_smo, sm);
} }
(void) printf("\n"); for (int c = 0; c < vd->vdev_children; c++)
if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
dump_spacemap(vd->vdev_spa->spa_meta_objset, &vd->vdev_dtl,
&vd->vdev_dtl_map);
(void) printf("\n");
}
for (c = 0; c < vd->vdev_children; c++)
dump_dtl(vd->vdev_child[c], indent + 4); dump_dtl(vd->vdev_child[c], indent + 4);
} }
@ -667,7 +679,8 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
break; break;
fill += cbp->blk_fill; fill += cbp->blk_fill;
} }
ASSERT3U(fill, ==, bp->blk_fill); if (!err)
ASSERT3U(fill, ==, bp->blk_fill);
(void) arc_buf_remove_ref(buf, &buf); (void) arc_buf_remove_ref(buf, &buf);
} }
@ -1484,8 +1497,9 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
} }
} }
VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp, if (!dump_opt['L'])
NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0); VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp,
NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0);
} }
static int static int
@ -1560,9 +1574,11 @@ dump_block_stats(spa_t *spa)
int c, e; int c, e;
if (!dump_opt['S']) { if (!dump_opt['S']) {
(void) printf("\nTraversing all blocks to %sverify" (void) printf("\nTraversing all blocks %s%s%s%s...\n",
" nothing leaked ...\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
dump_opt['c'] ? "verify checksums and " : ""); dump_opt['c'] ? "checksums " : "",
(dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
!dump_opt['L'] ? "nothing leaked " : "");
} }
/* /*
@ -1573,7 +1589,8 @@ dump_block_stats(spa_t *spa)
* it's not part of any space map) is a double allocation, * it's not part of any space map) is a double allocation,
* reference to a freed block, or an unclaimed log block. * reference to a freed block, or an unclaimed log block.
*/ */
zdb_leak_init(spa); if (!dump_opt['L'])
zdb_leak_init(spa);
/* /*
* If there's a deferred-free bplist, process that first. * If there's a deferred-free bplist, process that first.
@ -1615,7 +1632,8 @@ dump_block_stats(spa_t *spa)
/* /*
* Report any leaked segments. * Report any leaked segments.
*/ */
zdb_leak_fini(spa); if (!dump_opt['L'])
zdb_leak_fini(spa);
/* /*
* If we're interested in printing out the blkptr signatures, * If we're interested in printing out the blkptr signatures,
@ -1641,14 +1659,16 @@ dump_block_stats(spa_t *spa)
tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL]; tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
if (tzb->zb_asize == alloc + logalloc) { if (tzb->zb_asize == alloc + logalloc) {
(void) printf("\n\tNo leaks (block sum matches space" if (!dump_opt['L'])
" maps exactly)\n"); (void) printf("\n\tNo leaks (block sum matches space"
" maps exactly)\n");
} else { } else {
(void) printf("block traversal size %llu != alloc %llu " (void) printf("block traversal size %llu != alloc %llu "
"(leaked %lld)\n", "(%s %lld)\n",
(u_longlong_t)tzb->zb_asize, (u_longlong_t)tzb->zb_asize,
(u_longlong_t)alloc + logalloc, (u_longlong_t)alloc + logalloc,
(u_longlong_t)(alloc + logalloc - tzb->zb_asize)); (dump_opt['L']) ? "unreachable" : "leaked",
(longlong_t)(alloc + logalloc - tzb->zb_asize));
leaks = 1; leaks = 1;
} }
@ -2238,7 +2258,7 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv); dprintf_setup(&argc, argv);
while ((c = getopt(argc, argv, "udibcsvCS:U:lRep:")) != -1) { while ((c = getopt(argc, argv, "udibcsvCLS:U:lRep:t:")) != -1) {
switch (c) { switch (c) {
case 'u': case 'u':
case 'd': case 'd':
@ -2252,6 +2272,9 @@ main(int argc, char **argv)
dump_opt[c]++; dump_opt[c]++;
dump_all = 0; dump_all = 0;
break; break;
case 'L':
dump_opt[c]++;
break;
case 'v': case 'v':
verbose++; verbose++;
break; break;
@ -2282,6 +2305,14 @@ main(int argc, char **argv)
else else
usage(); usage();
break; break;
case 't':
ub_max_txg = strtoull(optarg, NULL, 0);
if (ub_max_txg < TXG_INITIAL) {
(void) fprintf(stderr, "incorrect txg "
"specified: %s\n", optarg);
usage();
}
break;
default: default:
usage(); usage();
break; break;

View File

@ -370,18 +370,12 @@ usage(boolean_t requested)
zfs_deleg_permissions(); zfs_deleg_permissions();
} else { } else {
/*
* TRANSLATION NOTE:
* "zfs set|get" must not be localised this is the
* command name and arguments.
*/
(void) fprintf(fp, (void) fprintf(fp,
gettext("\nFor the property list, run: zfs set|get\n")); gettext("\nFor the property list, run: %s\n"),
"zfs set|get");
(void) fprintf(fp, (void) fprintf(fp,
gettext("\nFor the delegated permission list, run:" gettext("\nFor the delegated permission list, run: %s\n"),
" zfs allow|unallow\n")); "zfs allow|unallow");
} }
/* /*
@ -419,7 +413,6 @@ parseprop(nvlist_t *props)
return (-1); return (-1);
} }
return (0); return (0);
} }
/* /*
@ -2584,14 +2577,15 @@ zfs_print_allows(char *ds)
for (curperms = perms; curperms; curperms = curperms->z_next) { for (curperms = perms; curperms; curperms = curperms->z_next) {
(void) snprintf(banner, sizeof (banner), (void) snprintf(banner, sizeof (banner),
"Permission sets on (%s)", curperms->z_setpoint); gettext("Permission sets on (%s)"), curperms->z_setpoint);
allowcb.a_treeoffset = allowcb.a_treeoffset =
offsetof(zfs_allow_node_t, z_localdescend); offsetof(zfs_allow_node_t, z_localdescend);
allowcb.a_permcnt = 0; allowcb.a_permcnt = 0;
zfs_iter_perms(&curperms->z_sets, banner, &allowcb); zfs_iter_perms(&curperms->z_sets, banner, &allowcb);
(void) snprintf(banner, sizeof (banner), (void) snprintf(banner, sizeof (banner),
"Create time permissions on (%s)", curperms->z_setpoint); gettext("Create time permissions on (%s)"),
curperms->z_setpoint);
allowcb.a_treeoffset = allowcb.a_treeoffset =
offsetof(zfs_allow_node_t, z_localdescend); offsetof(zfs_allow_node_t, z_localdescend);
allowcb.a_permcnt = 0; allowcb.a_permcnt = 0;
@ -2599,7 +2593,7 @@ zfs_print_allows(char *ds)
(void) snprintf(banner, sizeof (banner), (void) snprintf(banner, sizeof (banner),
"Local permissions on (%s)", curperms->z_setpoint); gettext("Local permissions on (%s)"), curperms->z_setpoint);
allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_local); allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_local);
allowcb.a_permcnt = 0; allowcb.a_permcnt = 0;
zfs_iter_perms(&curperms->z_user, banner, &allowcb); zfs_iter_perms(&curperms->z_user, banner, &allowcb);
@ -2607,7 +2601,8 @@ zfs_print_allows(char *ds)
zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); zfs_iter_perms(&curperms->z_everyone, banner, &allowcb);
(void) snprintf(banner, sizeof (banner), (void) snprintf(banner, sizeof (banner),
"Descendent permissions on (%s)", curperms->z_setpoint); gettext("Descendent permissions on (%s)"),
curperms->z_setpoint);
allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_descend); allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_descend);
allowcb.a_permcnt = 0; allowcb.a_permcnt = 0;
zfs_iter_perms(&curperms->z_user, banner, &allowcb); zfs_iter_perms(&curperms->z_user, banner, &allowcb);
@ -2615,7 +2610,7 @@ zfs_print_allows(char *ds)
zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); zfs_iter_perms(&curperms->z_everyone, banner, &allowcb);
(void) snprintf(banner, sizeof (banner), (void) snprintf(banner, sizeof (banner),
"Local+Descendent permissions on (%s)", gettext("Local+Descendent permissions on (%s)"),
curperms->z_setpoint); curperms->z_setpoint);
allowcb.a_treeoffset = allowcb.a_treeoffset =
offsetof(zfs_allow_node_t, z_localdescend); offsetof(zfs_allow_node_t, z_localdescend);
@ -3071,7 +3066,6 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
strcmp(smbshareopts, "off") == 0) { strcmp(smbshareopts, "off") == 0) {
@ -3081,7 +3075,8 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
(void) fprintf(stderr, gettext("cannot share '%s': " (void) fprintf(stderr, gettext("cannot share '%s': "
"legacy share\n"), zfs_get_name(zhp)); "legacy share\n"), zfs_get_name(zhp));
(void) fprintf(stderr, gettext("use share(1M) to " (void) fprintf(stderr, gettext("use share(1M) to "
"share this filesystem\n")); "share this filesystem, or set "
"sharenfs property on\n"));
return (1); return (1);
} }
@ -3119,6 +3114,7 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
* noauto no return 0 * noauto no return 0
* noauto yes pass through * noauto yes pass through
*/ */
canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
if (canmount == ZFS_CANMOUNT_OFF) { if (canmount == ZFS_CANMOUNT_OFF) {
if (!explicit) if (!explicit)
return (0); return (0);

View File

@ -877,17 +877,21 @@ int
zpool_do_export(int argc, char **argv) zpool_do_export(int argc, char **argv)
{ {
boolean_t force = B_FALSE; boolean_t force = B_FALSE;
boolean_t hardforce = B_FALSE;
int c; int c;
zpool_handle_t *zhp; zpool_handle_t *zhp;
int ret; int ret;
int i; int i;
/* check options */ /* check options */
while ((c = getopt(argc, argv, "f")) != -1) { while ((c = getopt(argc, argv, "fF")) != -1) {
switch (c) { switch (c) {
case 'f': case 'f':
force = B_TRUE; force = B_TRUE;
break; break;
case 'F':
hardforce = B_TRUE;
break;
case '?': case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"), (void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt); optopt);
@ -917,8 +921,12 @@ zpool_do_export(int argc, char **argv)
continue; continue;
} }
if (zpool_export(zhp, force) != 0) if (hardforce) {
if (zpool_export_force(zhp) != 0)
ret = 1;
} else if (zpool_export(zhp, force) != 0) {
ret = 1; ret = 1;
}
zpool_close(zhp); zpool_close(zhp);
} }

View File

@ -419,10 +419,10 @@ ztest_random(uint64_t range)
return (r % range); return (r % range);
} }
/* ARGSUSED */
static void static void
ztest_record_enospc(char *s) ztest_record_enospc(char *s)
{ {
dprintf("ENOSPC doing: %s\n", s ? s : "<unknown>");
ztest_shared->zs_enospc_count++; ztest_shared->zs_enospc_count++;
} }
@ -698,15 +698,9 @@ ztest_random_compress(void)
return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS)); return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS));
} }
typedef struct ztest_replay {
objset_t *zr_os;
uint64_t zr_assign;
} ztest_replay_t;
static int static int
ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap) ztest_replay_create(objset_t *os, lr_create_t *lr, boolean_t byteswap)
{ {
objset_t *os = zr->zr_os;
dmu_tx_t *tx; dmu_tx_t *tx;
int error; int error;
@ -715,7 +709,7 @@ ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap)
tx = dmu_tx_create(os); tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
error = dmu_tx_assign(tx, zr->zr_assign); error = dmu_tx_assign(tx, TXG_WAIT);
if (error) { if (error) {
dmu_tx_abort(tx); dmu_tx_abort(tx);
return (error); return (error);
@ -732,16 +726,15 @@ ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap)
(void) printf("replay create of %s object %llu" (void) printf("replay create of %s object %llu"
" in txg %llu = %d\n", " in txg %llu = %d\n",
osname, (u_longlong_t)lr->lr_doid, osname, (u_longlong_t)lr->lr_doid,
(u_longlong_t)zr->zr_assign, error); (u_longlong_t)dmu_tx_get_txg(tx), error);
} }
return (error); return (error);
} }
static int static int
ztest_replay_remove(ztest_replay_t *zr, lr_remove_t *lr, boolean_t byteswap) ztest_replay_remove(objset_t *os, lr_remove_t *lr, boolean_t byteswap)
{ {
objset_t *os = zr->zr_os;
dmu_tx_t *tx; dmu_tx_t *tx;
int error; int error;
@ -750,7 +743,7 @@ ztest_replay_remove(ztest_replay_t *zr, lr_remove_t *lr, boolean_t byteswap)
tx = dmu_tx_create(os); tx = dmu_tx_create(os);
dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END); dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END);
error = dmu_tx_assign(tx, zr->zr_assign); error = dmu_tx_assign(tx, TXG_WAIT);
if (error) { if (error) {
dmu_tx_abort(tx); dmu_tx_abort(tx);
return (error); return (error);
@ -978,7 +971,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
uint64_t leaf, top; uint64_t leaf, top;
uint64_t ashift = ztest_get_ashift(); uint64_t ashift = ztest_get_ashift();
uint64_t oldguid; uint64_t oldguid, pguid;
size_t oldsize, newsize; size_t oldsize, newsize;
char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
int replacing; int replacing;
@ -1010,10 +1003,16 @@ ztest_vdev_attach_detach(ztest_args_t *za)
* Locate this vdev. * Locate this vdev.
*/ */
oldvd = rvd->vdev_child[top]; oldvd = rvd->vdev_child[top];
if (zopt_mirrors >= 1) if (zopt_mirrors >= 1) {
ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
ASSERT(oldvd->vdev_children >= zopt_mirrors);
oldvd = oldvd->vdev_child[leaf / zopt_raidz]; oldvd = oldvd->vdev_child[leaf / zopt_raidz];
if (zopt_raidz > 1) }
if (zopt_raidz > 1) {
ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
ASSERT(oldvd->vdev_children == zopt_raidz);
oldvd = oldvd->vdev_child[leaf % zopt_raidz]; oldvd = oldvd->vdev_child[leaf % zopt_raidz];
}
/* /*
* If we're already doing an attach or replace, oldvd may be a * If we're already doing an attach or replace, oldvd may be a
@ -1021,8 +1020,8 @@ ztest_vdev_attach_detach(ztest_args_t *za)
*/ */
while (oldvd->vdev_children != 0) { while (oldvd->vdev_children != 0) {
oldvd_has_siblings = B_TRUE; oldvd_has_siblings = B_TRUE;
ASSERT(oldvd->vdev_children == 2); ASSERT(oldvd->vdev_children >= 2);
oldvd = oldvd->vdev_child[ztest_random(2)]; oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
} }
oldguid = oldvd->vdev_guid; oldguid = oldvd->vdev_guid;
@ -1030,16 +1029,17 @@ ztest_vdev_attach_detach(ztest_args_t *za)
oldvd_is_log = oldvd->vdev_top->vdev_islog; oldvd_is_log = oldvd->vdev_top->vdev_islog;
(void) strcpy(oldpath, oldvd->vdev_path); (void) strcpy(oldpath, oldvd->vdev_path);
pvd = oldvd->vdev_parent; pvd = oldvd->vdev_parent;
pguid = pvd->vdev_guid;
/* /*
* If oldvd has siblings, then half of the time, detach it. * If oldvd has siblings, then half of the time, detach it.
*/ */
if (oldvd_has_siblings && ztest_random(2) == 0) { if (oldvd_has_siblings && ztest_random(2) == 0) {
spa_config_exit(spa, SCL_VDEV, FTAG); spa_config_exit(spa, SCL_VDEV, FTAG);
error = spa_vdev_detach(spa, oldguid, B_FALSE); error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
if (error != 0 && error != ENODEV && error != EBUSY) if (error != 0 && error != ENODEV && error != EBUSY &&
fatal(0, "detach (%s) returned %d", error != ENOTSUP)
oldpath, error); fatal(0, "detach (%s) returned %d", oldpath, error);
(void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock); (void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock);
return; return;
} }
@ -1139,7 +1139,6 @@ ztest_vdev_attach_detach(ztest_args_t *za)
/* /*
* Verify that dynamic LUN growth works as expected. * Verify that dynamic LUN growth works as expected.
*/ */
/* ARGSUSED */
void void
ztest_vdev_LUN_growth(ztest_args_t *za) ztest_vdev_LUN_growth(ztest_args_t *za)
{ {
@ -1279,7 +1278,6 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za)
zilog_t *zilog; zilog_t *zilog;
uint64_t seq; uint64_t seq;
uint64_t objects; uint64_t objects;
ztest_replay_t zr;
(void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock);
(void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool, (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
@ -1296,8 +1294,7 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za)
*/ */
if (ztest_random(2) == 0 && if (ztest_random(2) == 0 &&
dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) { dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) {
zr.zr_os = os; zil_replay(os, os, ztest_replay_vector);
zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector, NULL);
dmu_objset_close(os); dmu_objset_close(os);
} }
@ -2060,8 +2057,6 @@ ztest_dmu_write_parallel(ztest_args_t *za)
error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db); error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db);
za->za_dbuf = db; za->za_dbuf = db;
if (error) { if (error) {
dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
osname, ZTEST_DIROBJ, blkoff, error);
(void) pthread_mutex_unlock(lp); (void) pthread_mutex_unlock(lp);
return; return;
} }
@ -2072,11 +2067,8 @@ ztest_dmu_write_parallel(ztest_args_t *za)
(void) pthread_mutex_unlock(lp); (void) pthread_mutex_unlock(lp);
if (error) { if (error)
dprintf("dmu_sync(%s, %d, %llx) = %d\n",
osname, ZTEST_DIROBJ, off, error);
return; return;
}
if (blk.blk_birth == 0) /* concurrent free */ if (blk.blk_birth == 0) /* concurrent free */
return; return;
@ -2585,8 +2577,6 @@ ztest_fault_inject(ztest_args_t *za)
maxfaults = INT_MAX; /* no limit on cache devices */ maxfaults = INT_MAX; /* no limit on cache devices */
} }
dprintf("damaging %s and %s\n", path0, pathrand);
spa_config_exit(spa, SCL_STATE, FTAG); spa_config_exit(spa, SCL_STATE, FTAG);
if (maxfaults == 0) if (maxfaults == 0)
@ -2596,10 +2586,13 @@ ztest_fault_inject(ztest_args_t *za)
* If we can tolerate two or more faults, randomly online/offline vd0. * If we can tolerate two or more faults, randomly online/offline vd0.
*/ */
if (maxfaults >= 2 && guid0 != 0) { if (maxfaults >= 2 && guid0 != 0) {
if (ztest_random(10) < 6) if (ztest_random(10) < 6) {
(void) vdev_offline(spa, guid0, B_TRUE); int flags = (ztest_random(2) == 0 ?
else ZFS_OFFLINE_TEMPORARY : 0);
(void) vdev_online(spa, guid0, B_FALSE, NULL); VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
} else {
(void) vdev_online(spa, guid0, 0, NULL);
}
} }
/* /*
@ -2854,7 +2847,7 @@ ztest_walk_pool_directory(char *header)
static void static void
ztest_spa_import_export(char *oldname, char *newname) ztest_spa_import_export(char *oldname, char *newname)
{ {
nvlist_t *config; nvlist_t *config, *newconfig;
uint64_t pool_guid; uint64_t pool_guid;
spa_t *spa; spa_t *spa;
int error; int error;
@ -2876,6 +2869,12 @@ ztest_spa_import_export(char *oldname, char *newname)
if (error) if (error)
fatal(0, "spa_open('%s') = %d", oldname, error); fatal(0, "spa_open('%s') = %d", oldname, error);
/*
* Kick off a scrub to tickle scrub/export races.
*/
if (ztest_random(2) == 0)
(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
pool_guid = spa_guid(spa); pool_guid = spa_guid(spa);
spa_close(spa, FTAG); spa_close(spa, FTAG);
@ -2884,12 +2883,19 @@ ztest_spa_import_export(char *oldname, char *newname)
/* /*
* Export it. * Export it.
*/ */
error = spa_export(oldname, &config, B_FALSE); error = spa_export(oldname, &config, B_FALSE, B_FALSE);
if (error) if (error)
fatal(0, "spa_export('%s') = %d", oldname, error); fatal(0, "spa_export('%s') = %d", oldname, error);
ztest_walk_pool_directory("pools after export"); ztest_walk_pool_directory("pools after export");
/*
* Try to import it.
*/
newconfig = spa_tryimport(config);
ASSERT(newconfig != NULL);
nvlist_free(newconfig);
/* /*
* Import it under the new name. * Import it under the new name.
*/ */
@ -2932,22 +2938,25 @@ ztest_spa_import_export(char *oldname, char *newname)
nvlist_free(config); nvlist_free(config);
} }
static void
ztest_resume(spa_t *spa)
{
if (spa_suspended(spa)) {
spa_vdev_state_enter(spa);
vdev_clear(spa, NULL);
(void) spa_vdev_state_exit(spa, NULL, 0);
zio_resume(spa);
}
}
static void * static void *
ztest_resume(void *arg) ztest_resume_thread(void *arg)
{ {
spa_t *spa = arg; spa_t *spa = arg;
while (!ztest_exiting) { while (!ztest_exiting) {
(void) poll(NULL, 0, 1000); (void) poll(NULL, 0, 1000);
ztest_resume(spa);
if (!spa_suspended(spa))
continue;
spa_vdev_state_enter(spa);
vdev_clear(spa, NULL);
(void) spa_vdev_state_exit(spa, NULL, 0);
zio_resume(spa);
} }
return (NULL); return (NULL);
} }
@ -3090,6 +3099,16 @@ ztest_run(char *pool)
*/ */
VERIFY(spa_open(pool, &spa, FTAG) == 0); VERIFY(spa_open(pool, &spa, FTAG) == 0);
/*
* We don't expect the pool to suspend unless maxfaults == 0,
* in which case ztest_fault_inject() temporarily takes away
* the only valid replica.
*/
if (zopt_maxfaults == 0)
spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
else
spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
/* /*
* Create a thread to periodically resume suspended I/O. * Create a thread to periodically resume suspended I/O.
*/ */
@ -3141,7 +3160,6 @@ ztest_run(char *pool)
za[t].za_kill = za[0].za_kill; za[t].za_kill = za[0].za_kill;
if (t < zopt_datasets) { if (t < zopt_datasets) {
ztest_replay_t zr;
int test_future = FALSE; int test_future = FALSE;
(void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock);
(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
@ -3165,9 +3183,8 @@ ztest_run(char *pool)
(void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock);
if (test_future) if (test_future)
ztest_dmu_check_future_leak(&za[t]); ztest_dmu_check_future_leak(&za[t]);
zr.zr_os = za[d].za_os; zil_replay(za[d].za_os, za[d].za_os,
zil_replay(zr.zr_os, &zr, &zr.zr_assign, ztest_replay_vector);
ztest_replay_vector, NULL);
za[d].za_zilog = zil_open(za[d].za_os, NULL); za[d].za_zilog = zil_open(za[d].za_os, NULL);
} }
@ -3212,6 +3229,7 @@ ztest_run(char *pool)
/* Kill the resume thread */ /* Kill the resume thread */
ztest_exiting = B_TRUE; ztest_exiting = B_TRUE;
VERIFY(pthread_join(resume_tid, NULL) == 0); VERIFY(pthread_join(resume_tid, NULL) == 0);
ztest_resume(spa);
/* /*
* Right before closing the pool, kick off a bunch of async I/O; * Right before closing the pool, kick off a bunch of async I/O;
@ -3307,11 +3325,6 @@ main(int argc, char **argv)
process_options(argc, argv); process_options(argc, argv);
argc -= optind;
argv += optind;
dprintf_setup(&argc, argv);
/* /*
* Blow away any existing copy of zpool.cache * Blow away any existing copy of zpool.cache
*/ */

View File

@ -29,6 +29,7 @@
#include <assert.h> #include <assert.h>
#include <libnvpair.h> #include <libnvpair.h>
#include <sys/mnttab.h>
#include <sys/param.h> #include <sys/param.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/varargs.h> #include <sys/varargs.h>
@ -175,6 +176,13 @@ extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t);
extern int libzfs_errno(libzfs_handle_t *); extern int libzfs_errno(libzfs_handle_t *);
extern const char *libzfs_error_action(libzfs_handle_t *); extern const char *libzfs_error_action(libzfs_handle_t *);
extern const char *libzfs_error_description(libzfs_handle_t *); extern const char *libzfs_error_description(libzfs_handle_t *);
extern void libzfs_mnttab_init(libzfs_handle_t *);
extern void libzfs_mnttab_fini(libzfs_handle_t *);
extern int libzfs_mnttab_find(libzfs_handle_t *, const char *,
struct mnttab *);
extern void libzfs_mnttab_add(libzfs_handle_t *, const char *,
const char *, const char *);
extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *);
/* /*
* Basic handle functions * Basic handle functions
@ -289,6 +297,7 @@ extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
* Import and export functions * Import and export functions
*/ */
extern int zpool_export(zpool_handle_t *, boolean_t); extern int zpool_export(zpool_handle_t *, boolean_t);
extern int zpool_export_force(zpool_handle_t *);
extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
char *altroot); char *altroot);
extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,

View File

@ -63,6 +63,7 @@ struct libzfs_handle {
int libzfs_printerr; int libzfs_printerr;
void *libzfs_sharehdl; /* libshare handle */ void *libzfs_sharehdl; /* libshare handle */
uint_t libzfs_shareflags; uint_t libzfs_shareflags;
avl_tree_t libzfs_mnttab_cache;
}; };
#define ZFSSHARE_MISS 0x01 /* Didn't find entry in cache */ #define ZFSSHARE_MISS 0x01 /* Didn't find entry in cache */

View File

@ -38,7 +38,6 @@
#include <zone.h> #include <zone.h>
#include <fcntl.h> #include <fcntl.h>
#include <sys/mntent.h> #include <sys/mntent.h>
#include <sys/mnttab.h>
#include <sys/mount.h> #include <sys/mount.h>
#include <sys/avl.h> #include <sys/avl.h>
#include <priv.h> #include <priv.h>
@ -110,7 +109,6 @@ path_to_str(const char *path, int types)
return (path_to_str(path, types & ~ZFS_TYPE_SNAPSHOT)); return (path_to_str(path, types & ~ZFS_TYPE_SNAPSHOT));
} }
/* /*
* The user has requested either filesystems or volumes. * The user has requested either filesystems or volumes.
* We have no way of knowing a priori what type this would be, so always * We have no way of knowing a priori what type this would be, so always
@ -323,38 +321,35 @@ zpool_free_handles(libzfs_handle_t *hdl)
* Utility function to gather stats (objset and zpl) for the given object. * Utility function to gather stats (objset and zpl) for the given object.
*/ */
static int static int
get_stats(zfs_handle_t *zhp) get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
{ {
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zfs_hdl; libzfs_handle_t *hdl = zhp->zfs_hdl;
nvlist_t *allprops, *userprops;
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) {
return (-1);
while (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
if (errno == ENOMEM) { if (errno == ENOMEM) {
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { if (zcmd_expand_dst_nvlist(hdl, zc) != 0) {
zcmd_free_nvlists(&zc);
return (-1); return (-1);
} }
} else { } else {
zcmd_free_nvlists(&zc);
return (-1); return (-1);
} }
} }
return (0);
}
zhp->zfs_dmustats = zc.zc_objset_stats; /* structure assignment */ static int
put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc)
{
nvlist_t *allprops, *userprops;
if (zcmd_read_dst_nvlist(hdl, &zc, &allprops) != 0) { zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */
zcmd_free_nvlists(&zc);
if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) {
return (-1); return (-1);
} }
zcmd_free_nvlists(&zc);
if ((userprops = process_user_props(zhp, allprops)) == NULL) { if ((userprops = process_user_props(zhp, allprops)) == NULL) {
nvlist_free(allprops); nvlist_free(allprops);
return (-1); return (-1);
@ -369,6 +364,22 @@ get_stats(zfs_handle_t *zhp)
return (0); return (0);
} }
static int
get_stats(zfs_handle_t *zhp)
{
int rc = 0;
zfs_cmd_t zc = { 0 };
if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
return (-1);
if (get_stats_ioctl(zhp, &zc) != 0)
rc = -1;
else if (put_stats_zhdl(zhp, &zc) != 0)
rc = -1;
zcmd_free_nvlists(&zc);
return (rc);
}
/* /*
* Refresh the properties currently stored in the handle. * Refresh the properties currently stored in the handle.
*/ */
@ -382,16 +393,11 @@ zfs_refresh_properties(zfs_handle_t *zhp)
* Makes a handle from the given dataset name. Used by zfs_open() and * Makes a handle from the given dataset name. Used by zfs_open() and
* zfs_iter_* to create child handles on the fly. * zfs_iter_* to create child handles on the fly.
*/ */
zfs_handle_t * static int
make_dataset_handle(libzfs_handle_t *hdl, const char *path) make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
{ {
zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
char *logstr; char *logstr;
libzfs_handle_t *hdl = zhp->zfs_hdl;
if (zhp == NULL)
return (NULL);
zhp->zfs_hdl = hdl;
/* /*
* Preserve history log string. * Preserve history log string.
@ -400,17 +406,16 @@ make_dataset_handle(libzfs_handle_t *hdl, const char *path)
*/ */
logstr = zhp->zfs_hdl->libzfs_log_str; logstr = zhp->zfs_hdl->libzfs_log_str;
zhp->zfs_hdl->libzfs_log_str = NULL; zhp->zfs_hdl->libzfs_log_str = NULL;
top:
(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
if (get_stats(zhp) != 0) { top:
if (put_stats_zhdl(zhp, zc) != 0) {
zhp->zfs_hdl->libzfs_log_str = logstr; zhp->zfs_hdl->libzfs_log_str = logstr;
free(zhp); return (-1);
return (NULL);
} }
if (zhp->zfs_dmustats.dds_inconsistent) { if (zhp->zfs_dmustats.dds_inconsistent) {
zfs_cmd_t zc = { 0 }; zfs_cmd_t zc2 = { 0 };
/* /*
* If it is dds_inconsistent, then we've caught it in * If it is dds_inconsistent, then we've caught it in
@ -427,28 +432,33 @@ top:
* will fail with EBUSY and we will drive on as usual. * will fail with EBUSY and we will drive on as usual.
*/ */
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc2.zc_name, zhp->zfs_name,
sizeof (zc2.zc_name));
if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) { if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) {
(void) zvol_remove_link(hdl, zhp->zfs_name); (void) zvol_remove_link(hdl, zhp->zfs_name);
zc.zc_objset_type = DMU_OST_ZVOL; zc2.zc_objset_type = DMU_OST_ZVOL;
} else { } else {
zc.zc_objset_type = DMU_OST_ZFS; zc2.zc_objset_type = DMU_OST_ZFS;
} }
/* /*
* If we can successfully destroy it, pretend that it * If we can successfully destroy it, pretend that it
* never existed. * never existed.
*/ */
if (ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc) == 0) { if (ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc2) == 0) {
zhp->zfs_hdl->libzfs_log_str = logstr; zhp->zfs_hdl->libzfs_log_str = logstr;
free(zhp);
errno = ENOENT; errno = ENOENT;
return (NULL); return (-1);
} }
/* If we can successfully roll it back, reget the stats */ /* If we can successfully roll it back, reset the stats */
if (ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc) == 0) if (ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc2) == 0) {
if (get_stats_ioctl(zhp, zc) != 0) {
zhp->zfs_hdl->libzfs_log_str = logstr;
return (-1);
}
goto top; goto top;
}
} }
/* /*
@ -473,6 +483,52 @@ top:
zhp->zfs_hdl->libzfs_log_str = logstr; zhp->zfs_hdl->libzfs_log_str = logstr;
zhp->zpool_hdl = zpool_handle(zhp); zhp->zpool_hdl = zpool_handle(zhp);
return (0);
}
/*
 * Allocate a handle for the dataset 'path', fetch its stats with
 * get_stats_ioctl() and finish setting it up through
 * make_dataset_handle_common().
 *
 * Returns the new handle, or NULL if allocation or either step fails.
 */
zfs_handle_t *
make_dataset_handle(libzfs_handle_t *hdl, const char *path)
{
	zfs_cmd_t cmd = { 0 };
	zfs_handle_t *handle;

	handle = calloc(sizeof (zfs_handle_t), 1);
	if (handle == NULL)
		return (NULL);

	handle->zfs_hdl = hdl;
	(void) strlcpy(handle->zfs_name, path, sizeof (handle->zfs_name));

	if (zcmd_alloc_dst_nvlist(hdl, &cmd, 0) != 0) {
		free(handle);
		return (NULL);
	}

	/* Either step failing discards the half-built handle. */
	if (get_stats_ioctl(handle, &cmd) == -1 ||
	    make_dataset_handle_common(handle, &cmd) == -1) {
		free(handle);
		handle = NULL;
	}

	zcmd_free_nvlists(&cmd);
	return (handle);
}
static zfs_handle_t *
make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc)
{
zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
if (zhp == NULL)
return (NULL);
zhp->zfs_hdl = hdl;
(void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
if (make_dataset_handle_common(zhp, zc) == -1) {
free(zhp);
return (NULL);
}
return (zhp); return (zhp);
} }
@ -531,6 +587,117 @@ zfs_close(zfs_handle_t *zhp)
free(zhp); free(zhp);
} }
/*
 * One cached mount-table entry, kept in hdl->libzfs_mnttab_cache and
 * ordered by mtn_mt.mnt_special (see libzfs_mnttab_cache_compare()).
 */
typedef struct mnttab_node {
struct mnttab mtn_mt;	/* cached entry; its strings are heap copies */
avl_node_t mtn_node;	/* AVL linkage used by the cache tree */
} mnttab_node_t;
/*
 * AVL comparator for the mnttab cache.  Nodes are ordered by the strcmp()
 * ordering of their mnt_special strings, normalized to -1, 0 or 1.
 */
static int
libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
{
	const mnttab_node_t *lhs = arg1;
	const mnttab_node_t *rhs = arg2;
	int order = strcmp(lhs->mtn_mt.mnt_special, rhs->mtn_mt.mnt_special);

	if (order < 0)
		return (-1);
	if (order > 0)
		return (1);
	return (0);
}
/*
 * Build the mnttab cache: create the AVL tree and insert a private copy
 * of every entry read from hdl->libzfs_mnttab whose fstype is MNTTYPE_ZFS.
 * The tree is asserted to hold no nodes on entry.
 */
void
libzfs_mnttab_init(libzfs_handle_t *hdl)
{
	struct mnttab ent;
	mnttab_node_t *node;

	assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
	avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
	    sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));

	rewind(hdl->libzfs_mnttab);
	while (getmntent(hdl->libzfs_mnttab, &ent) == 0) {
		/* Only ZFS mounts are cached. */
		if (strcmp(ent.mnt_fstype, MNTTYPE_ZFS) == 0) {
			node = zfs_alloc(hdl, sizeof (mnttab_node_t));
			node->mtn_mt.mnt_special =
			    zfs_strdup(hdl, ent.mnt_special);
			node->mtn_mt.mnt_mountp =
			    zfs_strdup(hdl, ent.mnt_mountp);
			node->mtn_mt.mnt_fstype =
			    zfs_strdup(hdl, ent.mnt_fstype);
			node->mtn_mt.mnt_mntopts =
			    zfs_strdup(hdl, ent.mnt_mntopts);
			avl_add(&hdl->libzfs_mnttab_cache, node);
		}
	}
}
/*
 * Tear down the mnttab cache: free every node together with the four
 * strings it owns, then destroy the AVL tree itself.
 */
void
libzfs_mnttab_fini(libzfs_handle_t *hdl)
{
	void *cookie = NULL;
	mnttab_node_t *mtn;

	/*
	 * avl_destroy_nodes() hands back one node per call and NULL once
	 * the tree is empty; compare against NULL explicitly rather than
	 * using a bare assignment as the loop condition.
	 */
	while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache,
	    &cookie)) != NULL) {
		free(mtn->mtn_mt.mnt_special);
		free(mtn->mtn_mt.mnt_mountp);
		free(mtn->mtn_mt.mnt_fstype);
		free(mtn->mtn_mt.mnt_mntopts);
		free(mtn);
	}
	avl_destroy(&hdl->libzfs_mnttab_cache);
}
/*
 * Look up 'fsname' (matched against mnt_special) in the mnttab cache.
 * If the cache currently holds no nodes it is first (re)built with
 * libzfs_mnttab_init().
 *
 * On a hit, *entry receives a shallow copy of the cached entry -- its
 * string pointers still belong to the cache node -- and 0 is returned.
 * On a miss, ENOENT is returned and *entry is left untouched.
 */
int
libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
    struct mnttab *entry)
{
	mnttab_node_t key;
	mnttab_node_t *hit;

	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
		libzfs_mnttab_init(hdl);

	/* Only mnt_special is consulted by the comparator. */
	key.mtn_mt.mnt_special = (char *)fsname;
	hit = avl_find(&hdl->libzfs_mnttab_cache, &key, NULL);
	if (hit == NULL)
		return (ENOENT);

	*entry = hit->mtn_mt;
	return (0);
}
/*
 * Record a newly mounted ZFS filesystem in the mnttab cache.
 *
 * Nothing is done while the cache holds no nodes: libzfs_mnttab_find()
 * rebuilds an empty cache from the mnttab file, which will pick the new
 * mount up.
 *
 * If an entry for 'special' is already cached (for example because the
 * filesystem was mounted again without the cache being told about an
 * unmount), it is replaced.  avl_add() requires that no node comparing
 * equal to the new one is present, so the stale node is removed and
 * freed first.
 */
void
libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
    const char *mountp, const char *mntopts)
{
	mnttab_node_t find;
	mnttab_node_t *mtn;

	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
		return;

	/* Only mnt_special is consulted by the comparator. */
	find.mtn_mt.mnt_special = (char *)special;
	mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
	if (mtn != NULL) {
		avl_remove(&hdl->libzfs_mnttab_cache, mtn);
		free(mtn->mtn_mt.mnt_special);
		free(mtn->mtn_mt.mnt_mountp);
		free(mtn->mtn_mt.mnt_fstype);
		free(mtn->mtn_mt.mnt_mntopts);
		free(mtn);
	}

	mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
	mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
	mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
	mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
	mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
	avl_add(&hdl->libzfs_mnttab_cache, mtn);
}
/*
 * Drop the cache entry whose mnt_special equals 'fsname', freeing the
 * node and the strings it owns.  Does nothing if no such entry exists.
 */
void
libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
{
	mnttab_node_t find;
	mnttab_node_t *ret;

	/* Only mnt_special is consulted by the comparator. */
	find.mtn_mt.mnt_special = (char *)fsname;

	/*
	 * Test the lookup result explicitly instead of using a bare
	 * assignment as the condition.
	 */
	ret = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
	if (ret == NULL)
		return;

	avl_remove(&hdl->libzfs_mnttab_cache, ret);
	free(ret->mtn_mt.mnt_special);
	free(ret->mtn_mt.mnt_mountp);
	free(ret->mtn_mt.mnt_fstype);
	free(ret->mtn_mt.mnt_mntopts);
	free(ret);
}
int int
zfs_spa_version(zfs_handle_t *zhp, int *version) zfs_spa_version(zfs_handle_t *zhp, int *version)
{ {
@ -2140,15 +2307,11 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
*/ */
if (!zhp->zfs_mntcheck && if (!zhp->zfs_mntcheck &&
(mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) {
struct mnttab entry, search = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl;
FILE *mnttab = zhp->zfs_hdl->libzfs_mnttab; struct mnttab entry;
search.mnt_special = (char *)zhp->zfs_name; if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) {
search.mnt_fstype = MNTTYPE_ZFS; zhp->zfs_mntopts = zfs_strdup(hdl,
rewind(mnttab);
if (getmntany(mnttab, &entry, &search) == 0) {
zhp->zfs_mntopts = zfs_strdup(zhp->zfs_hdl,
entry.mnt_mntopts); entry.mnt_mntopts);
if (zhp->zfs_mntopts == NULL) if (zhp->zfs_mntopts == NULL)
return (-1); return (-1);
@ -2592,6 +2755,46 @@ zfs_get_type(const zfs_handle_t *zhp)
return (zhp->zfs_type); return (zhp->zfs_type);
} }
/*
 * Issue one list-style ioctl ('arg') for the dataset named by 'zhp',
 * using the caller's zfs_cmd_t.
 *
 * Before every attempt zc_name is reset to the handle's name.  If the
 * ioctl fails with ENOMEM the destination nvlist is grown with
 * zcmd_expand_dst_nvlist(), zc_cookie is restored to the value it had on
 * entry, and the ioctl is retried.
 *
 * Returns:
 *   the ioctl's own return value when it is not -1;
 *   1 when errno is ESRCH (normal completion) or ENOENT (the underlying
 *     dataset has been removed since the handle was obtained);
 *   -1 when the nvlist could not be expanded (the command's nvlists are
 *     freed in that case);
 *   otherwise the result of zfs_standard_error().
 */
static int
zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc)
{
	const uint64_t saved_cookie = zc->zc_cookie;
	int result;

	for (;;) {
		(void) strlcpy(zc->zc_name, zhp->zfs_name,
		    sizeof (zc->zc_name));
		result = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc);
		if (result != -1)
			return (result);

		if (errno != ENOMEM)
			break;

		/* expand nvlist memory and try again */
		if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) {
			zcmd_free_nvlists(zc);
			return (-1);
		}
		zc->zc_cookie = saved_cookie;
	}

	if (errno == ESRCH || errno == ENOENT)
		return (1);

	return (zfs_standard_error(zhp->zfs_hdl, errno,
	    dgettext(TEXT_DOMAIN, "cannot iterate filesystems")));
}
/* /*
* Iterate over all child filesystems * Iterate over all child filesystems
*/ */
@ -2605,9 +2808,11 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM)
return (0); return (0);
for ((void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0; return (-1);
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name))) {
while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT,
&zc)) == 0) {
/* /*
* Ignore private dataset names. * Ignore private dataset names.
*/ */
@ -2618,24 +2823,18 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
* Silently ignore errors, as the only plausible explanation is * Silently ignore errors, as the only plausible explanation is
* that the pool has since been removed. * that the pool has since been removed.
*/ */
if ((nzhp = make_dataset_handle(zhp->zfs_hdl, if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl,
zc.zc_name)) == NULL) &zc)) == NULL) {
continue; continue;
}
if ((ret = func(nzhp, data)) != 0) if ((ret = func(nzhp, data)) != 0) {
zcmd_free_nvlists(&zc);
return (ret); return (ret);
}
} }
zcmd_free_nvlists(&zc);
/* return ((ret < 0) ? ret : 0);
* An errno value of ESRCH indicates normal completion. If ENOENT is
* returned, then the underlying dataset has been removed since we
* obtained the handle.
*/
if (errno != ESRCH && errno != ENOENT)
return (zfs_standard_error(zhp->zfs_hdl, errno,
dgettext(TEXT_DOMAIN, "cannot iterate filesystems")));
return (0);
} }
/* /*
@ -2651,29 +2850,23 @@ zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data)
if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT)
return (0); return (0);
for ((void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, return (-1);
&zc) == 0; while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT,
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name))) { &zc)) == 0) {
if ((nzhp = make_dataset_handle(zhp->zfs_hdl, if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl,
zc.zc_name)) == NULL) &zc)) == NULL) {
continue; continue;
}
if ((ret = func(nzhp, data)) != 0) if ((ret = func(nzhp, data)) != 0) {
zcmd_free_nvlists(&zc);
return (ret); return (ret);
}
} }
zcmd_free_nvlists(&zc);
/* return ((ret < 0) ? ret : 0);
* An errno value of ESRCH indicates normal completion. If ENOENT is
* returned, then the underlying dataset has been removed since we
* obtained the handle. Silently ignore this case, and return success.
*/
if (errno != ESRCH && errno != ENOENT)
return (zfs_standard_error(zhp->zfs_hdl, errno,
dgettext(TEXT_DOMAIN, "cannot iterate filesystems")));
return (0);
} }
/* /*
@ -2726,8 +2919,8 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
zfs_handle_t *zhp; zfs_handle_t *zhp;
char errbuf[1024]; char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf), "cannot create '%s'", (void) snprintf(errbuf, sizeof (errbuf),
path); dgettext(TEXT_DOMAIN, "cannot create '%s'"), path);
/* get parent, and check to see if this is just a pool */ /* get parent, and check to see if this is just a pool */
if (parent_name(path, parent, sizeof (parent)) != 0) { if (parent_name(path, parent, sizeof (parent)) != 0) {

View File

@ -74,7 +74,6 @@
#include <unistd.h> #include <unistd.h>
#include <zone.h> #include <zone.h>
#include <sys/mntent.h> #include <sys/mntent.h>
#include <sys/mnttab.h>
#include <sys/mount.h> #include <sys/mount.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -242,18 +241,9 @@ dir_is_empty(const char *dirn)
boolean_t boolean_t
is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where) is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where)
{ {
struct mnttab search = { 0 }, entry; struct mnttab entry;
/* if (libzfs_mnttab_find(zfs_hdl, special, &entry) != 0)
* Search for the entry in /etc/mnttab. We don't bother getting the
* mountpoint, as we can just search for the special device. This will
* also let us find mounts when the mountpoint is 'legacy'.
*/
search.mnt_special = (char *)special;
search.mnt_fstype = MNTTYPE_ZFS;
rewind(zfs_hdl->libzfs_mnttab);
if (getmntany(zfs_hdl->libzfs_mnttab, &entry, &search) != 0)
return (B_FALSE); return (B_FALSE);
if (where != NULL) if (where != NULL)
@ -364,12 +354,14 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
} else { } else {
zfs_error_aux(hdl, strerror(errno)); zfs_error_aux(hdl, strerror(errno));
} }
return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED,
dgettext(TEXT_DOMAIN, "cannot mount '%s'"), dgettext(TEXT_DOMAIN, "cannot mount '%s'"),
zhp->zfs_name)); zhp->zfs_name));
} }
/* add the mounted entry into our cache */
libzfs_mnttab_add(hdl, zfs_get_name(zhp), mountpoint,
mntopts);
return (0); return (0);
} }
@ -395,26 +387,23 @@ unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags)
int int
zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
{ {
struct mnttab search = { 0 }, entry; libzfs_handle_t *hdl = zhp->zfs_hdl;
struct mnttab entry;
char *mntpt = NULL; char *mntpt = NULL;
/* check to see if need to unmount the filesystem */ /* check to see if we need to unmount the filesystem */
search.mnt_special = zhp->zfs_name;
search.mnt_fstype = MNTTYPE_ZFS;
rewind(zhp->zfs_hdl->libzfs_mnttab);
if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
getmntany(zhp->zfs_hdl->libzfs_mnttab, &entry, &search) == 0)) { libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0)) {
/* /*
* mountpoint may have come from a call to * mountpoint may have come from a call to
* getmnt/getmntany if it isn't NULL. If it is NULL, * getmnt/getmntany if it isn't NULL. If it is NULL,
* we know it comes from getmntany which can then get * we know it comes from libzfs_mnttab_find which can
* overwritten later. We strdup it to play it safe. * then get freed later. We strdup it to play it safe.
*/ */
if (mountpoint == NULL) if (mountpoint == NULL)
mntpt = zfs_strdup(zhp->zfs_hdl, entry.mnt_mountp); mntpt = zfs_strdup(hdl, entry.mnt_mountp);
else else
mntpt = zfs_strdup(zhp->zfs_hdl, mountpoint); mntpt = zfs_strdup(hdl, mountpoint);
/* /*
* Unshare and unmount the filesystem * Unshare and unmount the filesystem
@ -422,11 +411,12 @@ zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
if (zfs_unshare_proto(zhp, mntpt, share_all_proto) != 0) if (zfs_unshare_proto(zhp, mntpt, share_all_proto) != 0)
return (-1); return (-1);
if (unmount_one(zhp->zfs_hdl, mntpt, flags) != 0) { if (unmount_one(hdl, mntpt, flags) != 0) {
free(mntpt); free(mntpt);
(void) zfs_shareall(zhp); (void) zfs_shareall(zhp);
return (-1); return (-1);
} }
libzfs_mnttab_remove(hdl, zhp->zfs_name);
free(mntpt); free(mntpt);
} }
@ -859,7 +849,7 @@ unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint,
char *mntpt; char *mntpt;
/* /*
* Mountpoint could get trashed if libshare calls getmntany * Mountpoint could get trashed if libshare calls getmntany
* which id does during API initialization, so strdup the * which it does during API initialization, so strdup the
* value. * value.
*/ */
mntpt = zfs_strdup(hdl, mountpoint); mntpt = zfs_strdup(hdl, mountpoint);
@ -897,18 +887,17 @@ int
zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint,
zfs_share_proto_t *proto) zfs_share_proto_t *proto)
{ {
struct mnttab search = { 0 }, entry; libzfs_handle_t *hdl = zhp->zfs_hdl;
struct mnttab entry;
char *mntpt = NULL; char *mntpt = NULL;
/* check to see if need to unmount the filesystem */ /* check to see if need to unmount the filesystem */
search.mnt_special = (char *)zfs_get_name(zhp);
search.mnt_fstype = MNTTYPE_ZFS;
rewind(zhp->zfs_hdl->libzfs_mnttab); rewind(zhp->zfs_hdl->libzfs_mnttab);
if (mountpoint != NULL) if (mountpoint != NULL)
mntpt = zfs_strdup(zhp->zfs_hdl, mountpoint); mountpoint = mntpt = zfs_strdup(hdl, mountpoint);
if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
getmntany(zhp->zfs_hdl->libzfs_mnttab, &entry, &search) == 0)) { libzfs_mnttab_find(hdl, zfs_get_name(zhp), &entry) == 0)) {
zfs_share_proto_t *curr_proto; zfs_share_proto_t *curr_proto;
if (mountpoint == NULL) if (mountpoint == NULL)
@ -917,8 +906,8 @@ zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint,
for (curr_proto = proto; *curr_proto != PROTO_END; for (curr_proto = proto; *curr_proto != PROTO_END;
curr_proto++) { curr_proto++) {
if (is_shared(zhp->zfs_hdl, mntpt, *curr_proto) && if (is_shared(hdl, mntpt, *curr_proto) &&
unshare_one(zhp->zfs_hdl, zhp->zfs_name, unshare_one(hdl, zhp->zfs_name,
mntpt, *curr_proto) != 0) { mntpt, *curr_proto) != 0) {
if (mntpt != NULL) if (mntpt != NULL)
free(mntpt); free(mntpt);

View File

@ -1127,7 +1127,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
* mounted datasets in the pool. * mounted datasets in the pool.
*/ */
int int
zpool_export(zpool_handle_t *zhp, boolean_t force) zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce)
{ {
zfs_cmd_t zc = { 0 }; zfs_cmd_t zc = { 0 };
char msg[1024]; char msg[1024];
@ -1140,6 +1140,7 @@ zpool_export(zpool_handle_t *zhp, boolean_t force)
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_cookie = force; zc.zc_cookie = force;
zc.zc_guid = hardforce;
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) {
switch (errno) { switch (errno) {
@ -1160,6 +1161,18 @@ zpool_export(zpool_handle_t *zhp, boolean_t force)
return (0); return (0);
} }
/*
 * Export the pool via zpool_export_common() with hardforce disabled
 * (B_FALSE); 'force' is passed through unchanged.  Returns whatever
 * zpool_export_common() returns.
 */
int
zpool_export(zpool_handle_t *zhp, boolean_t force)
{
return (zpool_export_common(zhp, force, B_FALSE));
}
/*
 * Export the pool via zpool_export_common() with both the force and
 * hardforce arguments set to B_TRUE.  Returns whatever
 * zpool_export_common() returns.
 */
int
zpool_export_force(zpool_handle_t *zhp)
{
return (zpool_export_common(zhp, B_TRUE, B_TRUE));
}
/* /*
* zpool_import() is a contracted interface. Should be kept the same * zpool_import() is a contracted interface. Should be kept the same
* if possible. * if possible.
@ -1182,7 +1195,9 @@ zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
} }
if (nvlist_add_string(props, if (nvlist_add_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0) { zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 ||
nvlist_add_string(props,
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) {
nvlist_free(props); nvlist_free(props);
return (zfs_error_fmt(hdl, EZFS_NOMEM, return (zfs_error_fmt(hdl, EZFS_NOMEM,
dgettext(TEXT_DOMAIN, "cannot import '%s'"), dgettext(TEXT_DOMAIN, "cannot import '%s'"),

View File

@ -794,6 +794,10 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN,
"SOURCE")); "SOURCE"));
/* first property is always NAME */
assert(cbp->cb_proplist->pl_prop ==
((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME));
/* /*
* Go through and calculate the widths for each column. For the * Go through and calculate the widths for each column. For the
* 'source' column, we kludge it up by taking the worst-case scenario of * 'source' column, we kludge it up by taking the worst-case scenario of
@ -821,9 +825,13 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
} }
/* /*
* 'VALUE' column * 'VALUE' column. The first property is always the 'name'
* property that was tacked on either by /sbin/zfs's
* zfs_do_get() or when calling zprop_expand_list(), so we
* ignore its width. If the user specified the name property
* to display, then it will be later in the list in any case.
*/ */
if ((pl->pl_prop != ZFS_PROP_NAME || !pl->pl_all) && if (pl != cbp->cb_proplist &&
pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE])
cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width;

View File

@ -4472,7 +4472,7 @@ l2arc_fini(void)
void void
l2arc_start(void) l2arc_start(void)
{ {
if (!(spa_mode & FWRITE)) if (!(spa_mode_global & FWRITE))
return; return;
(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
@ -4482,7 +4482,7 @@ l2arc_start(void)
void void
l2arc_stop(void) l2arc_stop(void)
{ {
if (!(spa_mode & FWRITE)) if (!(spa_mode_global & FWRITE))
return; return;
mutex_enter(&l2arc_feed_thr_lock); mutex_enter(&l2arc_feed_thr_lock);

View File

@ -119,7 +119,7 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh)
* We only want to visit blocks that have been claimed but not yet * We only want to visit blocks that have been claimed but not yet
* replayed (or, in read-only mode, blocks that *would* be claimed). * replayed (or, in read-only mode, blocks that *would* be claimed).
*/ */
if (claim_txg == 0 && (spa_mode & FWRITE)) if (claim_txg == 0 && spa_writeable(td->td_spa))
return; return;
zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);

View File

@ -1951,6 +1951,9 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
if (ds->ds_phys->ds_next_snap_obj) { if (ds->ds_phys->ds_next_snap_obj) {
stat->dds_is_snapshot = B_TRUE; stat->dds_is_snapshot = B_TRUE;
stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
} else {
stat->dds_is_snapshot = B_FALSE;
stat->dds_num_clones = 0;
} }
/* clone origin is really a dsl_dir thing... */ /* clone origin is really a dsl_dir thing... */
@ -1962,6 +1965,8 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
dsl_dataset_name(ods, stat->dds_origin); dsl_dataset_name(ods, stat->dds_origin);
dsl_dataset_drop_ref(ods, FTAG); dsl_dataset_drop_ref(ods, FTAG);
} else {
stat->dds_origin[0] = '\0';
} }
rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
} }

View File

@ -391,7 +391,7 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
* We only want to visit blocks that have been claimed but not yet * We only want to visit blocks that have been claimed but not yet
* replayed (or, in read-only mode, blocks that *would* be claimed). * replayed (or, in read-only mode, blocks that *would* be claimed).
*/ */
if (claim_txg == 0 && (spa_mode & FWRITE)) if (claim_txg == 0 && spa_writeable(dp->dp_spa))
return; return;
zilog = zil_alloc(dp->dp_meta_objset, zh); zilog = zil_alloc(dp->dp_meta_objset, zh);
@ -409,9 +409,6 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
int err; int err;
arc_buf_t *buf = NULL; arc_buf_t *buf = NULL;
if (bp->blk_birth == 0)
return;
if (bp->blk_birth <= dp->dp_scrub_min_txg) if (bp->blk_birth <= dp->dp_scrub_min_txg)
return; return;
@ -740,6 +737,7 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
void void
dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{ {
spa_t *spa = dp->dp_spa;
zap_cursor_t zc; zap_cursor_t zc;
zap_attribute_t za; zap_attribute_t za;
boolean_t complete = B_TRUE; boolean_t complete = B_TRUE;
@ -747,8 +745,10 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
if (dp->dp_scrub_func == SCRUB_FUNC_NONE) if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
return; return;
/* If the spa is not fully loaded, don't bother. */ /*
if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE) * If the pool is not loaded, or is trying to unload, leave it alone.
*/
if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
return; return;
if (dp->dp_scrub_restart) { if (dp->dp_scrub_restart) {
@ -757,13 +757,13 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
} }
if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
/* /*
* We must have resumed after rebooting; reset the vdev * We must have resumed after rebooting; reset the vdev
* stats to know that we're doing a scrub (although it * stats to know that we're doing a scrub (although it
* will think we're just starting now). * will think we're just starting now).
*/ */
vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, vdev_scrub_stat_update(spa->spa_root_vdev,
dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
POOL_SCRUB_EVERYTHING, B_FALSE); POOL_SCRUB_EVERYTHING, B_FALSE);
} }
@ -771,7 +771,7 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dp->dp_scrub_pausing = B_FALSE; dp->dp_scrub_pausing = B_FALSE;
dp->dp_scrub_start_time = lbolt64; dp->dp_scrub_start_time = lbolt64;
dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
dp->dp_spa->spa_scrub_active = B_TRUE; spa->spa_scrub_active = B_TRUE;
if (dp->dp_scrub_bookmark.zb_objset == 0) { if (dp->dp_scrub_bookmark.zb_objset == 0) {
/* First do the MOS & ORIGIN */ /* First do the MOS & ORIGIN */
@ -779,8 +779,8 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
if (dp->dp_scrub_pausing) if (dp->dp_scrub_pausing)
goto out; goto out;
if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, VERIFY(0 == dmu_objset_find_spa(spa,
NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
} else { } else {
scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
@ -830,15 +830,13 @@ out:
VERIFY(0 == zap_update(dp->dp_meta_objset, VERIFY(0 == zap_update(dp->dp_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
&dp->dp_spa->spa_scrub_errors, tx)); &spa->spa_scrub_errors, tx));
/* XXX this is scrub-clean specific */ /* XXX this is scrub-clean specific */
mutex_enter(&dp->dp_spa->spa_scrub_lock); mutex_enter(&spa->spa_scrub_lock);
while (dp->dp_spa->spa_scrub_inflight > 0) { while (spa->spa_scrub_inflight > 0)
cv_wait(&dp->dp_spa->spa_scrub_io_cv, cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
&dp->dp_spa->spa_scrub_lock); mutex_exit(&spa->spa_scrub_lock);
}
mutex_exit(&dp->dp_spa->spa_scrub_lock);
} }
void void
@ -920,13 +918,17 @@ static int
dsl_pool_scrub_clean_cb(dsl_pool_t *dp, dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
const blkptr_t *bp, const zbookmark_t *zb) const blkptr_t *bp, const zbookmark_t *zb)
{ {
size_t size = BP_GET_LSIZE(bp); size_t size = BP_GET_PSIZE(bp);
int d;
spa_t *spa = dp->dp_spa; spa_t *spa = dp->dp_spa;
boolean_t needs_io; boolean_t needs_io;
int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
int zio_priority; int zio_priority;
ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
if (bp->blk_birth >= dp->dp_scrub_max_txg)
return (0);
count_block(dp->dp_blkstats, bp); count_block(dp->dp_blkstats, bp);
if (dp->dp_scrub_isresilver == 0) { if (dp->dp_scrub_isresilver == 0) {
@ -945,7 +947,7 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
zio_flags |= ZIO_FLAG_SPECULATIVE; zio_flags |= ZIO_FLAG_SPECULATIVE;
for (d = 0; d < BP_GET_NDVAS(bp); d++) { for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
vdev_t *vd = vdev_lookup_top(spa, vdev_t *vd = vdev_lookup_top(spa,
DVA_GET_VDEV(&bp->blk_dva[d])); DVA_GET_VDEV(&bp->blk_dva[d]));
@ -963,16 +965,17 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
if (DVA_GET_GANG(&bp->blk_dva[d])) { if (DVA_GET_GANG(&bp->blk_dva[d])) {
/* /*
* Gang members may be spread across multiple * Gang members may be spread across multiple
* vdevs, so the best we can do is look at the * vdevs, so the best estimate we have is the
* pool-wide DTL. * scrub range, which has already been checked.
* XXX -- it would be better to change our * XXX -- it would be better to change our
* allocation policy to ensure that this can't * allocation policy to ensure that all
* happen. * gang members reside on the same vdev.
*/ */
vd = spa->spa_root_vdev; needs_io = B_TRUE;
} else {
needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
bp->blk_birth, 1);
} }
needs_io = vdev_dtl_contains(&vd->vdev_dtl_map,
bp->blk_birth, 1);
} }
} }

View File

@ -332,7 +332,8 @@ extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool); extern int spa_destroy(char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force); extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
boolean_t hardforce);
extern int spa_reset(char *pool); extern int spa_reset(char *pool);
extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_request(spa_t *spa, int flag);
extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag);
@ -351,7 +352,8 @@ extern void spa_inject_delref(spa_t *spa);
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
int replacing); int replacing);
extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
@ -475,6 +477,8 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
extern int spa_mode(spa_t *spa);
/* history logging */ /* history logging */
typedef enum history_log_type { typedef enum history_log_type {
@ -545,7 +549,7 @@ _NOTE(CONSTCOND) } while (0)
#define dprintf_bp(bp, fmt, ...) #define dprintf_bp(bp, fmt, ...)
#endif #endif
extern int spa_mode; /* mode, e.g. FREAD | FWRITE */ extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -170,6 +170,7 @@ struct spa {
boolean_t spa_import_faulted; /* allow faulted vdevs */ boolean_t spa_import_faulted; /* allow faulted vdevs */
boolean_t spa_is_root; /* pool is root */ boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */ int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */
spa_log_state_t spa_log_state; /* log state */ spa_log_state_t spa_log_state; /* log state */
/* /*
* spa_refcnt & spa_config_lock must be the last elements * spa_refcnt & spa_config_lock must be the last elements

View File

@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_SPACE_MAP_H #ifndef _SYS_SPACE_MAP_H
#define _SYS_SPACE_MAP_H #define _SYS_SPACE_MAP_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/avl.h> #include <sys/avl.h>
#include <sys/dmu.h> #include <sys/dmu.h>
@ -58,6 +56,12 @@ typedef struct space_seg {
uint64_t ss_end; /* ending offset (non-inclusive) */ uint64_t ss_end; /* ending offset (non-inclusive) */
} space_seg_t; } space_seg_t;
/*
 * Node of a reference tree: pairs an offset (the start or the end of a
 * range) with a signed reference count, and is linked into an AVL tree
 * through sr_node.
 * NOTE(review): presumably the element type of the avl_tree_t handed to
 * the space_map_ref_*() routines -- confirm in space_map.c.
 */
typedef struct space_ref {
avl_node_t sr_node; /* AVL node */
uint64_t sr_offset; /* offset (start or end) */
int64_t sr_refcnt; /* associated reference count */
} space_ref_t;
typedef struct space_map_obj { typedef struct space_map_obj {
uint64_t smo_object; /* on-disk space map object */ uint64_t smo_object; /* on-disk space map object */
uint64_t smo_objsize; /* size of the object */ uint64_t smo_objsize; /* size of the object */
@ -133,13 +137,12 @@ extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
extern void space_map_destroy(space_map_t *sm); extern void space_map_destroy(space_map_t *sm);
extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); extern boolean_t space_map_contains(space_map_t *sm,
uint64_t start, uint64_t size);
extern void space_map_vacate(space_map_t *sm, extern void space_map_vacate(space_map_t *sm,
space_map_func_t *func, space_map_t *mdest); space_map_func_t *func, space_map_t *mdest);
extern void space_map_walk(space_map_t *sm, extern void space_map_walk(space_map_t *sm,
space_map_func_t *func, space_map_t *mdest); space_map_func_t *func, space_map_t *mdest);
extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
extern void space_map_union(space_map_t *smd, space_map_t *sms);
extern void space_map_load_wait(space_map_t *sm); extern void space_map_load_wait(space_map_t *sm);
extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
@ -155,6 +158,15 @@ extern void space_map_sync(space_map_t *sm, uint8_t maptype,
extern void space_map_truncate(space_map_obj_t *smo, extern void space_map_truncate(space_map_obj_t *smo,
objset_t *os, dmu_tx_t *tx); objset_t *os, dmu_tx_t *tx);
extern void space_map_ref_create(avl_tree_t *t);
extern void space_map_ref_destroy(avl_tree_t *t);
extern void space_map_ref_add_seg(avl_tree_t *t,
uint64_t start, uint64_t end, int64_t refcnt);
extern void space_map_ref_add_map(avl_tree_t *t,
space_map_t *sm, int64_t refcnt);
extern void space_map_ref_generate_map(avl_tree_t *t,
space_map_t *sm, int64_t minref);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -19,21 +19,24 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_UBERBLOCK_IMPL_H #ifndef _SYS_UBERBLOCK_IMPL_H
#define _SYS_UBERBLOCK_IMPL_H #define _SYS_UBERBLOCK_IMPL_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/uberblock.h> #include <sys/uberblock.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/*
* For zdb use and debugging purposes only
*/
extern uint64_t ub_max_txg;
/* /*
* The uberblock version is incremented whenever an incompatible on-disk * The uberblock version is incremented whenever an incompatible on-disk
* format change is made to the SPA, DMU, or ZAP. * format change is made to the SPA, DMU, or ZAP.

View File

@ -36,6 +36,14 @@
extern "C" { extern "C" {
#endif #endif
/*
 * Kinds of dirty time log (DTL) kept for a vdev.  A value of this type
 * selects the DTL in vdev_dtl_dirty(), vdev_dtl_contains() and
 * vdev_dtl_empty().  The enumerators are numbered from 0, so DTL_TYPES
 * equals the number of real DTL kinds; it sizes the in-core vdev_dtl[]
 * array in struct vdev.
 */
typedef enum vdev_dtl_type {
DTL_MISSING, /* 0% replication: no copies of the data */
DTL_PARTIAL, /* less than 100% replication: some copies missing */
DTL_SCRUB, /* unable to fully repair during scrub/resilver */
DTL_OUTAGE, /* temporarily missing (used to attempt detach) */
DTL_TYPES /* count of the kinds above; not a DTL kind itself */
} vdev_dtl_type_t;
extern boolean_t zfs_nocacheflush; extern boolean_t zfs_nocacheflush;
extern int vdev_open(vdev_t *); extern int vdev_open(vdev_t *);
@ -50,10 +58,14 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
extern boolean_t vdev_is_bootable(vdev_t *vd); extern boolean_t vdev_is_bootable(vdev_t *vd);
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
int scrub_done); int scrub_done);
extern boolean_t vdev_dtl_required(vdev_t *vd);
extern boolean_t vdev_resilver_needed(vdev_t *vd, extern boolean_t vdev_resilver_needed(vdev_t *vd,
uint64_t *minp, uint64_t *maxp); uint64_t *minp, uint64_t *maxp);

View File

@ -123,8 +123,7 @@ struct vdev {
vdev_t *vdev_parent; /* parent vdev */ vdev_t *vdev_parent; /* parent vdev */
vdev_t **vdev_child; /* array of children */ vdev_t **vdev_child; /* array of children */
uint64_t vdev_children; /* number of children */ uint64_t vdev_children; /* number of children */
space_map_t vdev_dtl_map; /* dirty time log in-core state */ space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */
vdev_stat_t vdev_stat; /* virtual device statistics */ vdev_stat_t vdev_stat; /* virtual device statistics */
/* /*
@ -149,7 +148,7 @@ struct vdev {
* Leaf vdev state. * Leaf vdev state.
*/ */
uint64_t vdev_psize; /* physical device capacity */ uint64_t vdev_psize; /* physical device capacity */
space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ space_map_obj_t vdev_dtl_smo; /* dirty time log space map obj */
txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_wholedisk; /* true if this is a whole disk */
uint64_t vdev_offline; /* persistent offline state */ uint64_t vdev_offline; /* persistent offline state */

View File

@ -26,8 +26,6 @@
#ifndef _SYS_FS_ZFS_VFSOPS_H #ifndef _SYS_FS_ZFS_VFSOPS_H
#define _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/isa_defs.h> #include <sys/isa_defs.h>
#include <sys/types32.h> #include <sys/types32.h>
#include <sys/list.h> #include <sys/list.h>
@ -49,7 +47,6 @@ struct zfsvfs {
uint64_t z_root; /* id of root znode */ uint64_t z_root; /* id of root znode */
uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */
uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_max_blksz; /* maximum block size for files */
uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */
uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_obj; /* fuid table object number */
uint64_t z_fuid_size; /* fuid table size */ uint64_t z_fuid_size; /* fuid table size */
avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */
@ -74,6 +71,7 @@ struct zfsvfs {
boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_issnap; /* true if this is a snapshot */
boolean_t z_vscan; /* virus scan on/off */ boolean_t z_vscan; /* virus scan on/off */
boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */
kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */
uint64_t z_version; /* ZPL version */ uint64_t z_version; /* ZPL version */
#define ZFS_OBJ_MTX_SZ 64 #define ZFS_OBJ_MTX_SZ 64

View File

@ -335,7 +335,6 @@ typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
uint64_t txg); uint64_t txg);
typedef int zil_replay_func_t(void *, char *, boolean_t); typedef int zil_replay_func_t(void *, char *, boolean_t);
typedef void zil_replay_cleaner_t(void *);
typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
@ -350,9 +349,8 @@ extern void zil_free(zilog_t *zilog);
extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
extern void zil_close(zilog_t *zilog); extern void zil_close(zilog_t *zilog);
extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, extern void zil_replay(objset_t *os, void *arg,
zil_replay_func_t *replay_func[TX_MAX_TYPE], zil_replay_func_t *replay_func[TX_MAX_TYPE]);
zil_replay_cleaner_t *replay_cleaner);
extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);

View File

@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_ZIL_IMPL_H #ifndef _SYS_ZIL_IMPL_H
#define _SYS_ZIL_IMPL_H #define _SYS_ZIL_IMPL_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zil.h> #include <sys/zil.h>
#include <sys/dmu_objset.h> #include <sys/dmu_objset.h>
@ -74,13 +72,14 @@ struct zilog {
uint64_t zl_commit_seq; /* committed upto this number */ uint64_t zl_commit_seq; /* committed upto this number */
uint64_t zl_lr_seq; /* log record sequence number */ uint64_t zl_lr_seq; /* log record sequence number */
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */ uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
uint64_t zl_replaying_seq; /* current replay seq number */
uint32_t zl_suspend; /* log suspend count */ uint32_t zl_suspend; /* log suspend count */
kcondvar_t zl_cv_writer; /* log writer thread completion */ kcondvar_t zl_cv_writer; /* log writer thread completion */
kcondvar_t zl_cv_suspend; /* log suspend completion */ kcondvar_t zl_cv_suspend; /* log suspend completion */
uint8_t zl_suspending; /* log is currently suspending */ uint8_t zl_suspending; /* log is currently suspending */
uint8_t zl_keep_first; /* keep first log block in destroy */ uint8_t zl_keep_first; /* keep first log block in destroy */
uint8_t zl_stop_replay; /* don't replay any further */ uint8_t zl_replay; /* replaying records while set */
uint8_t zl_stop_sync; /* for debugging */ uint8_t zl_stop_sync; /* for debugging */
uint8_t zl_writer; /* boolean: write setup in progress */ uint8_t zl_writer; /* boolean: write setup in progress */
uint8_t zl_log_error; /* boolean: log write error */ uint8_t zl_log_error; /* boolean: log write error */

View File

@ -132,12 +132,14 @@ enum zio_compress {
#define ZIO_FLAG_IO_RETRY 0x00400 #define ZIO_FLAG_IO_RETRY 0x00400
#define ZIO_FLAG_IO_REWRITE 0x00800 #define ZIO_FLAG_IO_REWRITE 0x00800
#define ZIO_FLAG_PROBE 0x01000 #define ZIO_FLAG_SELF_HEAL 0x01000
#define ZIO_FLAG_RESILVER 0x02000 #define ZIO_FLAG_RESILVER 0x02000
#define ZIO_FLAG_SCRUB 0x04000 #define ZIO_FLAG_SCRUB 0x04000
#define ZIO_FLAG_SCRUB_THREAD 0x08000 #define ZIO_FLAG_SCRUB_THREAD 0x08000
#define ZIO_FLAG_GANG_CHILD 0x10000 #define ZIO_FLAG_PROBE 0x10000
#define ZIO_FLAG_GANG_CHILD 0x20000
#define ZIO_FLAG_RAW 0x40000
#define ZIO_FLAG_GANG_INHERIT \ #define ZIO_FLAG_GANG_INHERIT \
(ZIO_FLAG_CANFAIL | \ (ZIO_FLAG_CANFAIL | \
@ -146,6 +148,7 @@ enum zio_compress {
ZIO_FLAG_DONT_RETRY | \ ZIO_FLAG_DONT_RETRY | \
ZIO_FLAG_DONT_CACHE | \ ZIO_FLAG_DONT_CACHE | \
ZIO_FLAG_DONT_AGGREGATE | \ ZIO_FLAG_DONT_AGGREGATE | \
ZIO_FLAG_SELF_HEAL | \
ZIO_FLAG_RESILVER | \ ZIO_FLAG_RESILVER | \
ZIO_FLAG_SCRUB | \ ZIO_FLAG_SCRUB | \
ZIO_FLAG_SCRUB_THREAD) ZIO_FLAG_SCRUB_THREAD)
@ -156,6 +159,14 @@ enum zio_compress {
ZIO_FLAG_IO_RETRY | \ ZIO_FLAG_IO_RETRY | \
ZIO_FLAG_PROBE) ZIO_FLAG_PROBE)
/*
 * Mask combining six zio flags: DONT_AGGREGATE, IO_REPAIR, SELF_HEAL,
 * RESILVER, SCRUB and SCRUB_THREAD.
 * NOTE(review): going by the name, presumably the flags an aggregate
 * I/O inherits from the I/Os it combines -- confirm at the use site,
 * which is not in this header.
 */
#define ZIO_FLAG_AGG_INHERIT \
(ZIO_FLAG_DONT_AGGREGATE | \
ZIO_FLAG_IO_REPAIR | \
ZIO_FLAG_SELF_HEAL | \
ZIO_FLAG_RESILVER | \
ZIO_FLAG_SCRUB | \
ZIO_FLAG_SCRUB_THREAD)
#define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101 #define ZIO_PIPELINE_STOP 0x101

View File

@ -720,6 +720,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
vdev_t *vd; vdev_t *vd;
int dshift = 3; int dshift = 3;
int all_zero; int all_zero;
int zio_lock = B_FALSE;
boolean_t allocatable;
uint64_t offset = -1ULL; uint64_t offset = -1ULL;
uint64_t asize; uint64_t asize;
uint64_t distance; uint64_t distance;
@ -778,11 +780,20 @@ top:
all_zero = B_TRUE; all_zero = B_TRUE;
do { do {
vd = mg->mg_vd; vd = mg->mg_vd;
/* /*
* Don't allocate from faulted devices. * Don't allocate from faulted devices.
*/ */
if (!vdev_allocatable(vd)) if (zio_lock) {
spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
allocatable = vdev_allocatable(vd);
spa_config_exit(spa, SCL_ZIO, FTAG);
} else {
allocatable = vdev_allocatable(vd);
}
if (!allocatable)
goto next; goto next;
/* /*
* Avoid writing single-copy data to a failing vdev * Avoid writing single-copy data to a failing vdev
*/ */
@ -858,6 +869,12 @@ next:
goto top; goto top;
} }
if (!zio_lock) {
dshift = 3;
zio_lock = B_TRUE;
goto top;
}
bzero(&dva[d], sizeof (dva_t)); bzero(&dva[d], sizeof (dva_t));
return (ENOSPC); return (ENOSPC);
@ -946,7 +963,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
space_map_claim(&msp->ms_map, offset, size); space_map_claim(&msp->ms_map, offset, size);
if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */ if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
vdev_dirty(vd, VDD_METASLAB, msp, txg); vdev_dirty(vd, VDD_METASLAB, msp, txg);
space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

View File

@ -488,13 +488,14 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
* Activate an uninitialized pool. * Activate an uninitialized pool.
*/ */
static void static void
spa_activate(spa_t *spa) spa_activate(spa_t *spa, int mode)
{ {
int t, q; int t, q;
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
spa->spa_state = POOL_STATE_ACTIVE; spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_mode = mode;
spa->spa_normal_class = metaslab_class_create(); spa->spa_normal_class = metaslab_class_create();
spa->spa_log_class = metaslab_class_create(); spa->spa_log_class = metaslab_class_create();
@ -645,11 +646,6 @@ spa_unload(spa_t *spa)
cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock); cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock);
mutex_exit(&spa->spa_async_root_lock); mutex_exit(&spa->spa_async_root_lock);
/*
* Drop and purge level 2 cache
*/
spa_l2cache_drop(spa);
/* /*
* Close the dsl pool. * Close the dsl pool.
*/ */
@ -658,6 +654,13 @@ spa_unload(spa_t *spa)
spa->spa_dsl_pool = NULL; spa->spa_dsl_pool = NULL;
} }
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
* Drop and purge level 2 cache
*/
spa_l2cache_drop(spa);
/* /*
* Close all vdevs. * Close all vdevs.
*/ */
@ -692,6 +695,8 @@ spa_unload(spa_t *spa)
spa->spa_l2cache.sav_count = 0; spa->spa_l2cache.sav_count = 0;
spa->spa_async_suspended = 0; spa->spa_async_suspended = 0;
spa_config_exit(spa, SCL_ALL, FTAG);
} }
/* /*
@ -903,12 +908,9 @@ spa_load_l2cache(spa_t *spa)
vd = oldvdevs[i]; vd = oldvdevs[i];
if (vd != NULL) { if (vd != NULL) {
if ((spa_mode & FWRITE) && if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd))
pool != 0ULL &&
l2arc_vdev_present(vd)) {
l2arc_remove_vdev(vd); l2arc_remove_vdev(vd);
}
(void) vdev_close(vd); (void) vdev_close(vd);
spa_l2cache_remove(vd); spa_l2cache_remove(vd);
} }
@ -1024,8 +1026,16 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
uint64_t pool_guid; uint64_t pool_guid;
uint64_t version; uint64_t version;
uint64_t autoreplace = 0; uint64_t autoreplace = 0;
int orig_mode = spa->spa_mode;
char *ereport = FM_EREPORT_ZFS_POOL; char *ereport = FM_EREPORT_ZFS_POOL;
/*
* If this is an untrusted config, access the pool in read-only mode.
* This prevents things like resilvering recently removed devices.
*/
if (!mosconfig)
spa->spa_mode = FREAD;
ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa->spa_load_state = state; spa->spa_load_state = state;
@ -1083,12 +1093,13 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
* Validate the labels for all leaf vdevs. We need to grab the config * Validate the labels for all leaf vdevs. We need to grab the config
* lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER. * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER.
*/ */
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (mosconfig) {
error = vdev_validate(rvd); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_config_exit(spa, SCL_ALL, FTAG); error = vdev_validate(rvd);
spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0) if (error != 0)
goto out; goto out;
}
if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
error = ENXIO; error = ENXIO;
@ -1190,7 +1201,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa_config_set(spa, newconfig); spa_config_set(spa, newconfig);
spa_unload(spa); spa_unload(spa);
spa_deactivate(spa); spa_deactivate(spa);
spa_activate(spa); spa_activate(spa, orig_mode);
return (spa_load(spa, newconfig, state, B_TRUE)); return (spa_load(spa, newconfig, state, B_TRUE));
} }
@ -1382,10 +1393,11 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
goto out; goto out;
} }
if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { if (spa_writeable(spa)) {
dmu_tx_t *tx; dmu_tx_t *tx;
int need_update = B_FALSE; int need_update = B_FALSE;
int c;
ASSERT(state != SPA_LOAD_TRYIMPORT);
/* /*
* Claim log blocks that haven't been committed yet. * Claim log blocks that haven't been committed yet.
@ -1413,7 +1425,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
state == SPA_LOAD_IMPORT) state == SPA_LOAD_IMPORT)
need_update = B_TRUE; need_update = B_TRUE;
for (c = 0; c < rvd->vdev_children; c++) for (int c = 0; c < rvd->vdev_children; c++)
if (rvd->vdev_child[c]->vdev_ms_array == 0) if (rvd->vdev_child[c]->vdev_ms_array == 0)
need_update = B_TRUE; need_update = B_TRUE;
@ -1423,6 +1435,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
*/ */
if (need_update) if (need_update)
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
/*
* Check all DTLs to see if anything needs resilvering.
*/
if (vdev_resilver_needed(rvd, NULL, NULL))
spa_async_request(spa, SPA_ASYNC_RESILVER);
} }
error = 0; error = 0;
@ -1475,7 +1493,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
} }
if (spa->spa_state == POOL_STATE_UNINITIALIZED) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
spa_activate(spa); spa_activate(spa, spa_mode_global);
error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
@ -1879,11 +1897,9 @@ spa_l2cache_drop(spa_t *spa)
vd = sav->sav_vdevs[i]; vd = sav->sav_vdevs[i];
ASSERT(vd != NULL); ASSERT(vd != NULL);
if ((spa_mode & FWRITE) && if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && pool != 0ULL && l2arc_vdev_present(vd))
l2arc_vdev_present(vd)) {
l2arc_remove_vdev(vd); l2arc_remove_vdev(vd);
}
if (vd->vdev_isl2cache) if (vd->vdev_isl2cache)
spa_l2cache_remove(vd); spa_l2cache_remove(vd);
vdev_clear_stats(vd); vdev_clear_stats(vd);
@ -1924,7 +1940,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
(void) nvlist_lookup_string(props, (void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
spa = spa_add(pool, altroot); spa = spa_add(pool, altroot);
spa_activate(spa); spa_activate(spa, spa_mode_global);
spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_txg = txg - 1;
@ -2127,7 +2143,7 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
(void) nvlist_lookup_string(props, (void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
spa = spa_add(pool, altroot); spa = spa_add(pool, altroot);
spa_activate(spa); spa_activate(spa, spa_mode_global);
if (allowfaulted) if (allowfaulted)
spa->spa_import_faulted = B_TRUE; spa->spa_import_faulted = B_TRUE;
@ -2166,7 +2182,8 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
VDEV_ALLOC_L2CACHE); VDEV_ALLOC_L2CACHE);
spa_config_exit(spa, SCL_ALL, FTAG); spa_config_exit(spa, SCL_ALL, FTAG);
if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { if (error != 0 || (props && spa_writeable(spa) &&
(error = spa_prop_set(spa, props)))) {
if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { if (loaderr != 0 && loaderr != EINVAL && allowfaulted) {
/* /*
* If we failed to load the pool, but 'allowfaulted' is * If we failed to load the pool, but 'allowfaulted' is
@ -2225,7 +2242,7 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
spa->spa_l2cache.sav_sync = B_TRUE; spa->spa_l2cache.sav_sync = B_TRUE;
} }
if (spa_mode & FWRITE) { if (spa_writeable(spa)) {
/* /*
* Update the config cache to include the newly-imported pool. * Update the config cache to include the newly-imported pool.
*/ */
@ -2373,11 +2390,11 @@ spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf)
char *cdevid, *cpath; char *cdevid, *cpath;
uint64_t tmptxg; uint64_t tmptxg;
cpath = NULL;
cdevid = NULL;
if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
&cpath) != 0) &cpath) != 0 && nvlist_lookup_string(child[c],
return (EINVAL); ZPOOL_CONFIG_DEVID, &cdevid) != 0)
if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID,
&cdevid) != 0)
return (EINVAL); return (EINVAL);
if ((spa_check_rootconf(cpath, cdevid, NULL, if ((spa_check_rootconf(cpath, cdevid, NULL,
&tmptxg) == 0) && (tmptxg > txg)) { &tmptxg) == 0) && (tmptxg > txg)) {
@ -2495,7 +2512,7 @@ spa_tryimport(nvlist_t *tryconfig)
*/ */
mutex_enter(&spa_namespace_lock); mutex_enter(&spa_namespace_lock);
spa = spa_add(TRYIMPORT_NAME, NULL); spa = spa_add(TRYIMPORT_NAME, NULL);
spa_activate(spa); spa_activate(spa, FREAD);
/* /*
* Pass off the heavy lifting to spa_load(). * Pass off the heavy lifting to spa_load().
@ -2569,18 +2586,19 @@ spa_tryimport(nvlist_t *tryconfig)
* The act of destroying or exporting a pool is very simple. We make sure there * The act of destroying or exporting a pool is very simple. We make sure there
* is no more pending I/O and any references to the pool are gone. Then, we * is no more pending I/O and any references to the pool are gone. Then, we
* update the pool state and sync all the labels to disk, removing the * update the pool state and sync all the labels to disk, removing the
* configuration from the cache afterwards. * configuration from the cache afterwards. If the 'hardforce' flag is set, then
* we don't sync the labels or remove the configuration cache.
*/ */
static int static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
boolean_t force) boolean_t force, boolean_t hardforce)
{ {
spa_t *spa; spa_t *spa;
if (oldconfig) if (oldconfig)
*oldconfig = NULL; *oldconfig = NULL;
if (!(spa_mode & FWRITE)) if (!(spa_mode_global & FWRITE))
return (EROFS); return (EROFS);
mutex_enter(&spa_namespace_lock); mutex_enter(&spa_namespace_lock);
@ -2641,7 +2659,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
* so mark them all dirty. spa_unload() will do the * so mark them all dirty. spa_unload() will do the
* final sync that pushes these changes out. * final sync that pushes these changes out.
*/ */
if (new_state != POOL_STATE_UNINITIALIZED) { if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa->spa_state = new_state; spa->spa_state = new_state;
spa->spa_final_txg = spa_last_synced_txg(spa) + 1; spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
@ -2661,7 +2679,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
if (new_state != POOL_STATE_UNINITIALIZED) { if (new_state != POOL_STATE_UNINITIALIZED) {
spa_config_sync(spa, B_TRUE, B_TRUE); if (!hardforce)
spa_config_sync(spa, B_TRUE, B_TRUE);
spa_remove(spa); spa_remove(spa);
} }
mutex_exit(&spa_namespace_lock); mutex_exit(&spa_namespace_lock);
@ -2675,16 +2694,19 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
int int
spa_destroy(char *pool) spa_destroy(char *pool)
{ {
return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE)); return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
B_FALSE, B_FALSE));
} }
/* /*
* Export a storage pool. * Export a storage pool.
*/ */
int int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force) spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
boolean_t hardforce)
{ {
return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force)); return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
force, hardforce));
} }
/* /*
@ -2695,7 +2717,7 @@ int
spa_reset(char *pool) spa_reset(char *pool)
{ {
return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
B_FALSE)); B_FALSE, B_FALSE));
} }
/* /*
@ -2711,7 +2733,7 @@ int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot) spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{ {
uint64_t txg; uint64_t txg;
int c, error; int error;
vdev_t *rvd = spa->spa_root_vdev; vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd; vdev_t *vd, *tvd;
nvlist_t **spares, **l2cache; nvlist_t **spares, **l2cache;
@ -2750,7 +2772,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
/* /*
* Transfer each new top-level vdev from vd to rvd. * Transfer each new top-level vdev from vd to rvd.
*/ */
for (c = 0; c < vd->vdev_children; c++) { for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c]; tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd); vdev_remove_child(vd, tvd);
tvd->vdev_id = rvd->vdev_children; tvd->vdev_id = rvd->vdev_children;
@ -2958,10 +2980,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/ */
open_txg = txg + TXG_CONCURRENT_STATES - 1; open_txg = txg + TXG_CONCURRENT_STATES - 1;
mutex_enter(&newvd->vdev_dtl_lock); vdev_dtl_dirty(newvd, DTL_MISSING,
space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, TXG_INITIAL, open_txg - TXG_INITIAL + 1);
open_txg - TXG_INITIAL + 1);
mutex_exit(&newvd->vdev_dtl_lock);
if (newvd->vdev_isspare) if (newvd->vdev_isspare)
spa_spare_activate(newvd); spa_spare_activate(newvd);
@ -3005,10 +3025,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
* is a replacing vdev. * is a replacing vdev.
*/ */
int int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{ {
uint64_t txg; uint64_t txg;
int c, t, error; int error;
vdev_t *rvd = spa->spa_root_vdev; vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *pvd, *cvd, *tvd; vdev_t *vd, *pvd, *cvd, *tvd;
boolean_t unspare = B_FALSE; boolean_t unspare = B_FALSE;
@ -3027,6 +3047,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
pvd = vd->vdev_parent; pvd = vd->vdev_parent;
/*
* If the parent/child relationship is not as expected, don't do it.
* Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
* vdev that's replacing B with C. The user's intent in replacing
* is to go from M(A,B) to M(A,C). If the user decides to cancel
* the replace by detaching C, the expected behavior is to end up
* M(A,B). But suppose that right after deciding to detach C,
* the replacement of B completes. We would have M(A,C), and then
* ask to detach C, which would leave us with just A -- not what
* the user wanted. To prevent this, we make sure that the
* parent/child relationship hasn't changed -- in this example,
* that C's parent is still the replacing vdev R.
*/
if (pvd->vdev_guid != pguid && pguid != 0)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
/* /*
* If replace_done is specified, only remove this device if it's * If replace_done is specified, only remove this device if it's
* the first child of a replacing vdev. For the 'spare' vdev, either * the first child of a replacing vdev. For the 'spare' vdev, either
@ -3053,36 +3089,13 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
/* /*
* If there's only one replica, you can't detach it. * If this device has the only valid copy of some data,
* we cannot safely detach it.
*/ */
if (pvd->vdev_children <= 1) if (vdev_dtl_required(vd))
return (spa_vdev_exit(spa, NULL, txg, EBUSY)); return (spa_vdev_exit(spa, NULL, txg, EBUSY));
/* ASSERT(pvd->vdev_children >= 2);
* If all siblings have non-empty DTLs, this device may have the only
* valid copy of the data, which means we cannot safely detach it.
*
* XXX -- as in the vdev_offline() case, we really want a more
* precise DTL check.
*/
for (c = 0; c < pvd->vdev_children; c++) {
uint64_t dirty;
cvd = pvd->vdev_child[c];
if (cvd == vd)
continue;
if (vdev_is_dead(cvd))
continue;
mutex_enter(&cvd->vdev_dtl_lock);
dirty = cvd->vdev_dtl_map.sm_space |
cvd->vdev_dtl_scrub.sm_space;
mutex_exit(&cvd->vdev_dtl_lock);
if (!dirty)
break;
}
if (c == pvd->vdev_children)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
/* /*
* If we are detaching the second disk from a replacing vdev, then * If we are detaching the second disk from a replacing vdev, then
@ -3108,7 +3121,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
* active spare list for the pool. * active spare list for the pool.
*/ */
if (pvd->vdev_ops == &vdev_spare_ops && if (pvd->vdev_ops == &vdev_spare_ops &&
vd->vdev_id == 0) vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
unspare = B_TRUE; unspare = B_TRUE;
/* /*
@ -3134,14 +3147,18 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
/* /*
* If we need to remove the remaining child from the list of hot spares, * If we need to remove the remaining child from the list of hot spares,
* do it now, marking the vdev as no longer a spare in the process. We * do it now, marking the vdev as no longer a spare in the process.
* must do this before vdev_remove_parent(), because that can change the * We must do this before vdev_remove_parent(), because that can
* GUID if it creates a new toplevel GUID. * change the GUID if it creates a new toplevel GUID. For a similar
* reason, we must remove the spare now, in the same txg as the detach;
* otherwise someone could attach a new sibling, change the GUID, and
* the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
*/ */
if (unspare) { if (unspare) {
ASSERT(cvd->vdev_isspare); ASSERT(cvd->vdev_isspare);
spa_spare_remove(cvd); spa_spare_remove(cvd);
unspare_guid = cvd->vdev_guid; unspare_guid = cvd->vdev_guid;
(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
} }
/* /*
@ -3179,7 +3196,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
* But first make sure we're not on any *other* txg's DTL list, to * But first make sure we're not on any *other* txg's DTL list, to
* prevent vd from being accessed after it's freed. * prevent vd from being accessed after it's freed.
*/ */
for (t = 0; t < TXG_SIZE; t++) for (int t = 0; t < TXG_SIZE; t++)
(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
vd->vdev_detached = B_TRUE; vd->vdev_detached = B_TRUE;
vdev_dirty(tvd, VDD_DTL, vd, txg); vdev_dirty(tvd, VDD_DTL, vd, txg);
@ -3194,11 +3211,14 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
* list of every other pool. * list of every other pool.
*/ */
if (unspare) { if (unspare) {
spa_t *myspa = spa;
spa = NULL; spa = NULL;
mutex_enter(&spa_namespace_lock); mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL) { while ((spa = spa_next(spa)) != NULL) {
if (spa->spa_state != POOL_STATE_ACTIVE) if (spa->spa_state != POOL_STATE_ACTIVE)
continue; continue;
if (spa == myspa)
continue;
spa_open_ref(spa, FTAG); spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock); mutex_exit(&spa_namespace_lock);
(void) spa_vdev_remove(spa, unspare_guid, B_TRUE); (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
@ -3265,10 +3285,12 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
vdev_t *vd; vdev_t *vd;
nvlist_t **spares, **l2cache, *nv; nvlist_t **spares, **l2cache, *nv;
uint_t nspares, nl2cache; uint_t nspares, nl2cache;
uint64_t txg; uint64_t txg = 0;
int error = 0; int error = 0;
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
txg = spa_vdev_enter(spa); if (!locked)
txg = spa_vdev_enter(spa);
vd = spa_lookup_by_guid(spa, guid, B_FALSE); vd = spa_lookup_by_guid(spa, guid, B_FALSE);
@ -3311,7 +3333,10 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
error = ENOENT; error = ENOENT;
} }
return (spa_vdev_exit(spa, NULL, txg, error)); if (!locked)
return (spa_vdev_exit(spa, NULL, txg, error));
return (error);
} }
/* /*
@ -3337,13 +3362,9 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
oldvd = vd->vdev_child[0]; oldvd = vd->vdev_child[0];
newvd = vd->vdev_child[1]; newvd = vd->vdev_child[1];
mutex_enter(&newvd->vdev_dtl_lock); if (vdev_dtl_empty(newvd, DTL_MISSING) &&
if (newvd->vdev_dtl_map.sm_space == 0 && !vdev_dtl_required(oldvd))
newvd->vdev_dtl_scrub.sm_space == 0) {
mutex_exit(&newvd->vdev_dtl_lock);
return (oldvd); return (oldvd);
}
mutex_exit(&newvd->vdev_dtl_lock);
} }
/* /*
@ -3353,15 +3374,12 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
newvd = vd->vdev_child[0]; newvd = vd->vdev_child[0];
oldvd = vd->vdev_child[1]; oldvd = vd->vdev_child[1];
mutex_enter(&newvd->vdev_dtl_lock);
if (newvd->vdev_unspare && if (newvd->vdev_unspare &&
newvd->vdev_dtl_map.sm_space == 0 && vdev_dtl_empty(newvd, DTL_MISSING) &&
newvd->vdev_dtl_scrub.sm_space == 0) { !vdev_dtl_required(oldvd)) {
newvd->vdev_unspare = 0; newvd->vdev_unspare = 0;
mutex_exit(&newvd->vdev_dtl_lock);
return (oldvd); return (oldvd);
} }
mutex_exit(&newvd->vdev_dtl_lock);
} }
return (NULL); return (NULL);
@ -3370,36 +3388,37 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
static void static void
spa_vdev_resilver_done(spa_t *spa) spa_vdev_resilver_done(spa_t *spa)
{ {
vdev_t *vd; vdev_t *vd, *pvd, *ppvd;
vdev_t *pvd; uint64_t guid, sguid, pguid, ppguid;
uint64_t guid;
uint64_t pguid = 0;
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
pvd = vd->vdev_parent;
ppvd = pvd->vdev_parent;
guid = vd->vdev_guid; guid = vd->vdev_guid;
pguid = pvd->vdev_guid;
ppguid = ppvd->vdev_guid;
sguid = 0;
/* /*
* If we have just finished replacing a hot spared device, then * If we have just finished replacing a hot spared device, then
* we need to detach the parent's first child (the original hot * we need to detach the parent's first child (the original hot
* spare) as well. * spare) as well.
*/ */
pvd = vd->vdev_parent; if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
pvd->vdev_id == 0) {
ASSERT(pvd->vdev_ops == &vdev_replacing_ops); ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
ASSERT(pvd->vdev_parent->vdev_children == 2); ASSERT(ppvd->vdev_children == 2);
pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; sguid = ppvd->vdev_child[1]->vdev_guid;
} }
spa_config_exit(spa, SCL_CONFIG, FTAG); spa_config_exit(spa, SCL_ALL, FTAG);
if (spa_vdev_detach(spa, guid, B_TRUE) != 0) if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
return; return;
if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
return; return;
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
} }
spa_config_exit(spa, SCL_CONFIG, FTAG); spa_config_exit(spa, SCL_ALL, FTAG);
} }
/* /*
@ -3938,9 +3957,22 @@ spa_sync(spa_t *spa, uint64_t txg)
* into config changes that go out with this transaction group. * into config changes that go out with this transaction group.
*/ */
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { while (list_head(&spa->spa_state_dirty_list) != NULL) {
vdev_state_clean(vd); /*
vdev_config_dirty(vd); * We need the write lock here because, for aux vdevs,
* calling vdev_config_dirty() modifies sav_config.
* This is ugly and will become unnecessary when we
* eliminate the aux vdev wart by integrating all vdevs
* into the root vdev tree.
*/
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
vdev_state_clean(vd);
vdev_config_dirty(vd);
}
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
} }
spa_config_exit(spa, SCL_STATE, FTAG); spa_config_exit(spa, SCL_STATE, FTAG);

View File

@ -208,6 +208,9 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(MUTEX_HELD(&spa_namespace_lock));
if (rootdir == NULL)
return;
/* /*
* Iterate over all cachefiles for the pool, past or present. When the * Iterate over all cachefiles for the pool, past or present. When the
* cachefile is changed, the new one is pushed onto this list, allowing * cachefile is changed, the new one is pushed onto this list, allowing

View File

@ -230,7 +230,7 @@ static kmutex_t spa_l2cache_lock;
static avl_tree_t spa_l2cache_avl; static avl_tree_t spa_l2cache_avl;
kmem_cache_t *spa_buffer_pool; kmem_cache_t *spa_buffer_pool;
int spa_mode; int spa_mode_global;
#ifdef ZFS_DEBUG #ifdef ZFS_DEBUG
/* Everything except dprintf is on by default in debug builds */ /* Everything except dprintf is on by default in debug builds */
@ -890,8 +890,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
txg_wait_synced(spa->spa_dsl_pool, txg); txg_wait_synced(spa->spa_dsl_pool, txg);
if (vd != NULL) { if (vd != NULL) {
ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
vdev_free(vd); vdev_free(vd);
spa_config_exit(spa, SCL_ALL, spa);
} }
/* /*
@ -922,6 +924,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
spa_config_exit(spa, SCL_STATE_ALL, spa); spa_config_exit(spa, SCL_STATE_ALL, spa);
/*
* If anything changed, wait for it to sync. This ensures that,
* from the system administrator's perspective, zpool(1M) commands
* are synchronous. This is important for things like zpool offline:
* when the command completes, you expect no further I/O from ZFS.
*/
if (vd != NULL)
txg_wait_synced(spa->spa_dsl_pool, 0);
return (error); return (error);
} }
@ -1361,7 +1372,7 @@ spa_init(int mode)
avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
offsetof(spa_aux_t, aux_avl)); offsetof(spa_aux_t, aux_avl));
spa_mode = mode; spa_mode_global = mode;
refcount_init(); refcount_init();
unique_init(); unique_init();
@ -1418,3 +1429,15 @@ spa_is_root(spa_t *spa)
{ {
return (spa->spa_is_root); return (spa->spa_is_root);
} }
/*
 * Report whether the FWRITE bit is set in this pool's spa_mode field.
 * Yields a non-zero (true) value when it is set and zero otherwise.
 */
boolean_t
spa_writeable(spa_t *spa)
{
	return ((spa->spa_mode & FWRITE) != 0);
}
/*
 * Return the pool's spa_mode field unmodified.
 * NOTE(review): presumably the FREAD/FWRITE flags the pool was opened
 * with (spa_writeable() tests this field against FWRITE) -- confirm.
 */
int
spa_mode(spa_t *spa)
{
return (spa->spa_mode);
}

View File

@ -23,8 +23,6 @@
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/dmu.h> #include <sys/dmu.h>
@ -60,6 +58,8 @@ space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
{ {
bzero(sm, sizeof (*sm)); bzero(sm, sizeof (*sm));
cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
avl_create(&sm->sm_root, space_map_seg_compare, avl_create(&sm->sm_root, space_map_seg_compare,
sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
@ -78,6 +78,7 @@ space_map_destroy(space_map_t *sm)
VERIFY3U(sm->sm_space, ==, 0); VERIFY3U(sm->sm_space, ==, 0);
cv_destroy(&sm->sm_load_cv); cv_destroy(&sm->sm_load_cv);
avl_destroy(&sm->sm_root); avl_destroy(&sm->sm_root);
cv_destroy(&sm->sm_load_cv);
} }
void void
@ -183,7 +184,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
sm->sm_space -= size; sm->sm_space -= size;
} }
int boolean_t
space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
{ {
avl_index_t where; avl_index_t where;
@ -223,59 +224,10 @@ space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
{ {
space_seg_t *ss; space_seg_t *ss;
for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
}
void
space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
{
avl_tree_t *t = &sm->sm_root;
avl_index_t where;
space_seg_t *ss, search;
uint64_t end = start + size;
uint64_t rm_start, rm_end;
ASSERT(MUTEX_HELD(sm->sm_lock)); ASSERT(MUTEX_HELD(sm->sm_lock));
search.ss_start = start; for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
search.ss_end = start; func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
for (;;) {
ss = avl_find(t, &search, &where);
if (ss == NULL)
ss = avl_nearest(t, where, AVL_AFTER);
if (ss == NULL || ss->ss_start >= end)
break;
rm_start = MAX(ss->ss_start, start);
rm_end = MIN(ss->ss_end, end);
space_map_remove(sm, rm_start, rm_end - rm_start);
}
}
/*
* Replace smd with the union of smd and sms.
*/
void
space_map_union(space_map_t *smd, space_map_t *sms)
{
avl_tree_t *t = &sms->sm_root;
space_seg_t *ss;
ASSERT(MUTEX_HELD(smd->sm_lock));
/*
* For each source segment, remove any intersections with the
* destination, then add the source segment to the destination.
*/
for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
}
} }
/* /*
@ -507,3 +459,131 @@ space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
smo->smo_objsize = 0; smo->smo_objsize = 0;
smo->smo_alloc = 0; smo->smo_alloc = 0;
} }
/*
* Space map reference trees.
*
* A space map is a collection of integers. Every integer is either
* in the map, or it's not. A space map reference tree generalizes
* the idea: it allows its members to have arbitrary reference counts,
* as opposed to the implicit reference count of 0 or 1 in a space map.
* This representation comes in handy when computing the union or
* intersection of multiple space maps. For example, the union of
* N space maps is the subset of the reference tree with refcnt >= 1.
* The intersection of N space maps is the subset with refcnt >= N.
*
* [It's very much like a Fourier transform. Unions and intersections
* are hard to perform in the 'space map domain', so we convert the maps
* into the 'reference count domain', where it's trivial, then invert.]
*
* vdev_dtl_reassess() uses computations of this form to determine
* DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
* has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
* has an outage wherever refcnt >= vdev_children.
*/
/*
 * AVL comparison function for space_ref_t nodes.
 *
 * Nodes are ordered primarily by sr_offset.  Nodes with equal offsets
 * are ordered by their addresses, so two distinct nodes never compare
 * equal; 0 is returned only when both arguments are the same node.
 *
 * The addresses are compared as integers (uintptr_t) rather than with
 * relational operators on the pointers themselves: the nodes are
 * separate allocations, and applying < or > to pointers that do not
 * point into the same object is undefined behavior in C.
 *
 * Returns -1, 0 or 1.
 */
static int
space_map_ref_compare(const void *x1, const void *x2)
{
	const space_ref_t *sr1 = x1;
	const space_ref_t *sr2 = x2;
	uintptr_t addr1 = (uintptr_t)x1;
	uintptr_t addr2 = (uintptr_t)x2;

	if (sr1->sr_offset < sr2->sr_offset)
		return (-1);
	if (sr1->sr_offset > sr2->sr_offset)
		return (1);

	/* Same offset: break the tie by node address. */
	if (addr1 < addr2)
		return (-1);
	if (addr1 > addr2)
		return (1);

	return (0);
}
/*
 * Initialize 't' as an empty reference tree: an AVL tree of space_ref_t
 * nodes, linked through sr_node and ordered by space_map_ref_compare().
 */
void
space_map_ref_create(avl_tree_t *t)
{
	size_t node_size = sizeof (space_ref_t);
	size_t link_offset = offsetof(space_ref_t, sr_node);

	avl_create(t, space_map_ref_compare, node_size, link_offset);
}
/*
 * Tear down a reference tree: free every space_ref_t node still in 't',
 * then destroy the tree itself.
 */
void
space_map_ref_destroy(avl_tree_t *t)
{
	void *cookie = NULL;

	for (;;) {
		space_ref_t *node = avl_destroy_nodes(t, &cookie);

		if (node == NULL)
			break;
		kmem_free(node, sizeof (space_ref_t));
	}

	avl_destroy(t);
}
/*
 * Allocate a single reference node carrying 'refcnt' at 'offset' and
 * insert it into reference tree 't'.
 */
static void
space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
{
	space_ref_t *node = kmem_alloc(sizeof (space_ref_t), KM_SLEEP);

	node->sr_refcnt = refcnt;
	node->sr_offset = offset;

	avl_add(t, node);
}
/*
 * Add the segment [start, end) to reference tree 't' with weight
 * 'refcnt'.  The segment is recorded as two nodes: one carrying +refcnt
 * at 'start' and one carrying -refcnt at 'end', so that a running sum
 * over the nodes in offset order gives the reference count at any point.
 */
void
space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
int64_t refcnt)
{
space_map_ref_add_node(t, start, refcnt);
space_map_ref_add_node(t, end, -refcnt);
}
/*
 * Fold space map 'sm' into reference tree 't': every segment of the map
 * is added to the tree with a weight of 'refcnt'.  The tree may already
 * hold entries from earlier calls.  The caller must hold sm->sm_lock.
 */
void
space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
{
	avl_tree_t *segs = &sm->sm_root;
	space_seg_t *seg;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	seg = avl_first(segs);
	while (seg != NULL) {
		space_map_ref_add_seg(t, seg->ss_start, seg->ss_end, refcnt);
		seg = AVL_NEXT(segs, seg);
	}
}
/*
 * Rebuild space map 'sm' from reference tree 't'.  The map is emptied
 * first and then populated with every range whose accumulated reference
 * count is at least 'minref'.  The caller must hold sm->sm_lock.
 *
 * The tree is walked in offset order while a running reference count is
 * kept.  'start' is -1ULL while no qualifying run is open; otherwise it
 * is the offset at which the current run began.
 */
void
space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
{
	space_ref_t *sr;
	int64_t refcnt = 0;
	uint64_t start = -1ULL;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	space_map_vacate(sm, NULL, NULL);

	for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
		uint64_t end;

		refcnt += sr->sr_refcnt;

		if (refcnt >= minref) {
			/* At or above the threshold: open a run if needed. */
			if (start == -1ULL)
				start = sr->sr_offset;
			continue;
		}

		/* Below the threshold and no run open: nothing to emit. */
		if (start == -1ULL)
			continue;

		/* The count just dropped below minref: close the run. */
		end = sr->sr_offset;
		ASSERT(start <= end);
		if (end > start)
			space_map_add(sm, start, end - start);
		start = -1ULL;
	}

	ASSERT(refcnt == 0);
	ASSERT(start == -1ULL);
}

View File

@ -94,6 +94,12 @@ txg_fini(dsl_pool_t *dp)
rw_destroy(&tx->tx_suspend); rw_destroy(&tx->tx_suspend);
mutex_destroy(&tx->tx_sync_lock); mutex_destroy(&tx->tx_sync_lock);
cv_destroy(&tx->tx_sync_more_cv);
cv_destroy(&tx->tx_sync_done_cv);
cv_destroy(&tx->tx_quiesce_more_cv);
cv_destroy(&tx->tx_quiesce_done_cv);
cv_destroy(&tx->tx_exit_cv);
for (c = 0; c < max_ncpus; c++) { for (c = 0; c < max_ncpus; c++) {
int i; int i;

View File

@ -318,8 +318,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) {
space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
&vd->vdev_dtl_lock);
}
txg_list_create(&vd->vdev_ms_list, txg_list_create(&vd->vdev_ms_list,
offsetof(struct metaslab, ms_txg_node)); offsetof(struct metaslab, ms_txg_node));
txg_list_create(&vd->vdev_dtl_list, txg_list_create(&vd->vdev_dtl_list,
@ -476,7 +478,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
if (alloctype == VDEV_ALLOC_LOAD) { if (alloctype == VDEV_ALLOC_LOAD) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
&vd->vdev_dtl.smo_object); &vd->vdev_dtl_smo.smo_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
&vd->vdev_unspare); &vd->vdev_unspare);
} }
@ -568,12 +570,14 @@ vdev_free(vdev_t *vd)
txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_ms_list);
txg_list_destroy(&vd->vdev_dtl_list); txg_list_destroy(&vd->vdev_dtl_list);
mutex_enter(&vd->vdev_dtl_lock); mutex_enter(&vd->vdev_dtl_lock);
space_map_unload(&vd->vdev_dtl_map); for (int t = 0; t < DTL_TYPES; t++) {
space_map_destroy(&vd->vdev_dtl_map); space_map_unload(&vd->vdev_dtl[t]);
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); space_map_destroy(&vd->vdev_dtl[t]);
space_map_destroy(&vd->vdev_dtl_scrub); }
mutex_exit(&vd->vdev_dtl_lock); mutex_exit(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_probe_lock);
@ -711,14 +715,18 @@ vdev_remove_parent(vdev_t *cvd)
vdev_remove_child(mvd, cvd); vdev_remove_child(mvd, cvd);
vdev_remove_child(pvd, mvd); vdev_remove_child(pvd, mvd);
/* /*
* If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
* Otherwise, we could have detached an offline device, and when we * Otherwise, we could have detached an offline device, and when we
* go to import the pool we'll think we have two top-level vdevs, * go to import the pool we'll think we have two top-level vdevs,
* instead of a different version of the same top-level vdev. * instead of a different version of the same top-level vdev.
*/ */
if (mvd->vdev_top == mvd) if (mvd->vdev_top == mvd) {
cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid; uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
cvd->vdev_guid += guid_delta;
cvd->vdev_guid_sum += guid_delta;
}
cvd->vdev_id = mvd->vdev_id; cvd->vdev_id = mvd->vdev_id;
vdev_add_child(pvd, cvd); vdev_add_child(pvd, cvd);
vdev_top_update(cvd->vdev_top, cvd->vdev_top); vdev_top_update(cvd->vdev_top, cvd->vdev_top);
@ -817,6 +825,7 @@ typedef struct vdev_probe_stats {
static void static void
vdev_probe_done(zio_t *zio) vdev_probe_done(zio_t *zio)
{ {
spa_t *spa = zio->io_spa;
vdev_probe_stats_t *vps = zio->io_private; vdev_probe_stats_t *vps = zio->io_private;
vdev_t *vd = vps->vps_vd; vdev_t *vd = vps->vps_vd;
@ -824,7 +833,7 @@ vdev_probe_done(zio_t *zio)
ASSERT(zio->io_vd == vd); ASSERT(zio->io_vd == vd);
if (zio->io_error == 0) if (zio->io_error == 0)
vps->vps_readable = 1; vps->vps_readable = 1;
if (zio->io_error == 0 && (spa_mode & FWRITE)) { if (zio->io_error == 0 && spa_writeable(spa)) {
zio_nowait(zio_write_phys(vps->vps_root, vd, zio_nowait(zio_write_phys(vps->vps_root, vd,
zio->io_offset, zio->io_size, zio->io_data, zio->io_offset, zio->io_size, zio->io_data,
ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
@ -845,12 +854,12 @@ vdev_probe_done(zio_t *zio)
vd->vdev_cant_write |= !vps->vps_writeable; vd->vdev_cant_write |= !vps->vps_writeable;
if (vdev_readable(vd) && if (vdev_readable(vd) &&
(vdev_writeable(vd) || !(spa_mode & FWRITE))) { (vdev_writeable(vd) || !spa_writeable(spa))) {
zio->io_error = 0; zio->io_error = 0;
} else { } else {
ASSERT(zio->io_error != 0); ASSERT(zio->io_error != 0);
zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
zio->io_spa, vd, NULL, 0, 0); spa, vd, NULL, 0, 0);
zio->io_error = ENXIO; zio->io_error = ENXIO;
} }
kmem_free(vps, sizeof (*vps)); kmem_free(vps, sizeof (*vps));
@ -919,12 +928,15 @@ vdev_probe(vdev_t *vd, zio_t *pio)
int int
vdev_open(vdev_t *vd) vdev_open(vdev_t *vd)
{ {
spa_t *spa = vd->vdev_spa;
int error; int error;
int c; int c;
uint64_t osize = 0; uint64_t osize = 0;
uint64_t asize, psize; uint64_t asize, psize;
uint64_t ashift = 0; uint64_t ashift = 0;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_CANT_OPEN ||
vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_state == VDEV_STATE_OFFLINE);
@ -1058,16 +1070,12 @@ vdev_open(vdev_t *vd)
/* /*
* If a leaf vdev has a DTL, and seems healthy, then kick off a * If a leaf vdev has a DTL, and seems healthy, then kick off a
* resilver. But don't do this if we are doing a reopen for a * resilver. But don't do this if we are doing a reopen for a scrub,
* scrub, since this would just restart the scrub we are already * since this would just restart the scrub we are already doing.
* doing.
*/ */
if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) { if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
mutex_enter(&vd->vdev_dtl_lock); vdev_resilver_needed(vd, NULL, NULL))
if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) spa_async_request(spa, SPA_ASYNC_RESILVER);
spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER);
mutex_exit(&vd->vdev_dtl_lock);
}
return (0); return (0);
} }
@ -1168,6 +1176,10 @@ vdev_validate(vdev_t *vd)
void void
vdev_close(vdev_t *vd) vdev_close(vdev_t *vd)
{ {
spa_t *spa = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
vd->vdev_ops->vdev_op_close(vd); vd->vdev_ops->vdev_op_close(vd);
vdev_cache_purge(vd); vdev_cache_purge(vd);
@ -1286,34 +1298,88 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
} }
/*
* DTLs.
*
* A vdev's DTL (dirty time log) is the set of transaction groups for which
* the vdev has less than perfect replication. There are four kinds of DTL:
*
* DTL_MISSING: txgs for which the vdev has no valid copies of the data
*
* DTL_PARTIAL: txgs for which data is available, but not fully replicated
*
* DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
* scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
* txgs that was scrubbed.
*
* DTL_OUTAGE: txgs which cannot currently be read, whether due to
* persistent errors or just some device being offline.
* Unlike the other three, the DTL_OUTAGE map is not generally
* maintained; it's only computed when needed, typically to
* determine whether a device can be detached.
*
* For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
* either has the data or it doesn't.
*
* For interior vdevs such as mirror and RAID-Z the picture is more complex.
* A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
* if any child is less than fully replicated, then so is its parent.
* A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
* comprising only those txgs which appear in more than 'maxfaults' children;
* those are the txgs we don't have enough replication to read. For example,
* double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
* thus, its DTL_MISSING consists of the set of txgs that appear in more than
* two child DTL_MISSING maps.
*
* It should be clear from the above that to compute the DTLs and outage maps
* for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
* Therefore, that is all we keep on disk. When loading the pool, or after
* a configuration change, we generate all other DTLs from first principles.
*/
void void
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{ {
space_map_t *sm = &vd->vdev_dtl[t];
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
mutex_enter(sm->sm_lock); mutex_enter(sm->sm_lock);
if (!space_map_contains(sm, txg, size)) if (!space_map_contains(sm, txg, size))
space_map_add(sm, txg, size); space_map_add(sm, txg, size);
mutex_exit(sm->sm_lock); mutex_exit(sm->sm_lock);
} }
int boolean_t
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{ {
int dirty; space_map_t *sm = &vd->vdev_dtl[t];
boolean_t dirty = B_FALSE;
/* ASSERT(t < DTL_TYPES);
* Quick test without the lock -- covers the common case that ASSERT(vd != vd->vdev_spa->spa_root_vdev);
* there are no dirty time segments.
*/
if (sm->sm_space == 0)
return (0);
mutex_enter(sm->sm_lock); mutex_enter(sm->sm_lock);
dirty = space_map_contains(sm, txg, size); if (sm->sm_space != 0)
dirty = space_map_contains(sm, txg, size);
mutex_exit(sm->sm_lock); mutex_exit(sm->sm_lock);
return (dirty); return (dirty);
} }
/*
 * Report whether the DTL of type 't' on vdev 'vd' currently contains no
 * space.  The map's sm_space is sampled while holding the map's lock.
 */
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	space_map_t *dtl = &vd->vdev_dtl[t];
	boolean_t is_empty;

	mutex_enter(dtl->sm_lock);
	is_empty = (dtl->sm_space == 0);
	mutex_exit(dtl->sm_lock);

	return (is_empty);
}
/* /*
* Reassess DTLs after a config change or scrub completion. * Reassess DTLs after a config change or scrub completion.
*/ */
@ -1321,11 +1387,19 @@ void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{ {
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
int c; avl_tree_t reftree;
int minref;
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
if (vd->vdev_children == 0) { for (int c = 0; c < vd->vdev_children; c++)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
if (vd == spa->spa_root_vdev)
return;
if (vd->vdev_ops->vdev_op_leaf) {
mutex_enter(&vd->vdev_dtl_lock); mutex_enter(&vd->vdev_dtl_lock);
if (scrub_txg != 0 && if (scrub_txg != 0 &&
(spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
@ -1336,12 +1410,38 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
* will be valid, so excise the old region and * will be valid, so excise the old region and
* fold in the scrub dtl. Otherwise, leave the * fold in the scrub dtl. Otherwise, leave the
* dtl as-is if there was an error. * dtl as-is if there was an error.
*
* There's a little trick here: to excise the beginning
* of the DTL_MISSING map, we put it into a reference
* tree and then add a segment with refcnt -1 that
* covers the range [0, scrub_txg). This means
* that each txg in that range has refcnt -1 or 0.
* We then add DTL_SCRUB with a refcnt of 2, so that
* entries in the range [0, scrub_txg) will have a
* positive refcnt -- either 1 or 2. We then convert
* the reference tree into the new DTL_MISSING map.
*/ */
space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); space_map_ref_create(&reftree);
space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); space_map_ref_add_map(&reftree,
&vd->vdev_dtl[DTL_MISSING], 1);
space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
space_map_ref_add_map(&reftree,
&vd->vdev_dtl[DTL_SCRUB], 2);
space_map_ref_generate_map(&reftree,
&vd->vdev_dtl[DTL_MISSING], 1);
space_map_ref_destroy(&reftree);
} }
space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
space_map_walk(&vd->vdev_dtl[DTL_MISSING],
space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
if (scrub_done) if (scrub_done)
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
if (!vdev_readable(vd))
space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
else
space_map_walk(&vd->vdev_dtl[DTL_MISSING],
space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
mutex_exit(&vd->vdev_dtl_lock); mutex_exit(&vd->vdev_dtl_lock);
if (txg != 0) if (txg != 0)
@ -1349,35 +1449,34 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
return; return;
} }
/*
* Make sure the DTLs are always correct under the scrub lock.
*/
if (vd == spa->spa_root_vdev)
mutex_enter(&spa->spa_scrub_lock);
mutex_enter(&vd->vdev_dtl_lock); mutex_enter(&vd->vdev_dtl_lock);
space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); for (int t = 0; t < DTL_TYPES; t++) {
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); if (t == DTL_SCRUB)
mutex_exit(&vd->vdev_dtl_lock); continue; /* leaf vdevs only */
if (t == DTL_PARTIAL)
for (c = 0; c < vd->vdev_children; c++) { minref = 1; /* i.e. non-zero */
vdev_t *cvd = vd->vdev_child[c]; else if (vd->vdev_nparity != 0)
vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); minref = vd->vdev_nparity + 1; /* RAID-Z */
mutex_enter(&vd->vdev_dtl_lock); else
space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); minref = vd->vdev_children; /* any kind of mirror */
space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); space_map_ref_create(&reftree);
mutex_exit(&vd->vdev_dtl_lock); for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
mutex_enter(&cvd->vdev_dtl_lock);
space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
mutex_exit(&cvd->vdev_dtl_lock);
}
space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
space_map_ref_destroy(&reftree);
} }
mutex_exit(&vd->vdev_dtl_lock);
if (vd == spa->spa_root_vdev)
mutex_exit(&spa->spa_scrub_lock);
} }
static int static int
vdev_dtl_load(vdev_t *vd) vdev_dtl_load(vdev_t *vd)
{ {
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
space_map_obj_t *smo = &vd->vdev_dtl; space_map_obj_t *smo = &vd->vdev_dtl_smo;
objset_t *mos = spa->spa_meta_objset; objset_t *mos = spa->spa_meta_objset;
dmu_buf_t *db; dmu_buf_t *db;
int error; int error;
@ -1395,7 +1494,8 @@ vdev_dtl_load(vdev_t *vd)
dmu_buf_rele(db, FTAG); dmu_buf_rele(db, FTAG);
mutex_enter(&vd->vdev_dtl_lock); mutex_enter(&vd->vdev_dtl_lock);
error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
NULL, SM_ALLOC, smo, mos);
mutex_exit(&vd->vdev_dtl_lock); mutex_exit(&vd->vdev_dtl_lock);
return (error); return (error);
@ -1405,8 +1505,8 @@ void
vdev_dtl_sync(vdev_t *vd, uint64_t txg) vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{ {
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
space_map_obj_t *smo = &vd->vdev_dtl; space_map_obj_t *smo = &vd->vdev_dtl_smo;
space_map_t *sm = &vd->vdev_dtl_map; space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
objset_t *mos = spa->spa_meta_objset; objset_t *mos = spa->spa_meta_objset;
space_map_t smsync; space_map_t smsync;
kmutex_t smlock; kmutex_t smlock;
@ -1462,6 +1562,37 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx); dmu_tx_commit(tx);
} }
/*
 * Report whether the given vdev is still needed, i.e. whether taking it
 * out of service (offline/detach/remove) would lose data.
 *
 * Returns B_TRUE if the vdev is required, B_FALSE if it can safely go.
 * Asserts that the caller holds every SCL_STATE_ALL config lock as writer.
 */
boolean_t
vdev_dtl_required(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *top = vd->vdev_top;
	uint8_t saved_cant_read = vd->vdev_cant_read;
	boolean_t outage;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* The root vdev and any top-level vdev are always reported required. */
	if (vd == spa->spa_root_vdev)
		return (B_TRUE);
	if (vd == top)
		return (B_TRUE);

	/*
	 * Dry run: pretend the device cannot be read, recompute the DTLs of
	 * its top-level vdev, and see whether that produces any outage.
	 */
	vd->vdev_cant_read = B_TRUE;
	vdev_dtl_reassess(top, 0, 0, B_FALSE);
	outage = !vdev_dtl_empty(top, DTL_OUTAGE);

	/* Restore the real readability flag and recompute the DTLs again. */
	vd->vdev_cant_read = saved_cant_read;
	vdev_dtl_reassess(top, 0, 0, B_FALSE);

	return (outage);
}
/* /*
* Determine if resilver is needed, and if so the txg range. * Determine if resilver is needed, and if so the txg range.
*/ */
@ -1474,19 +1605,19 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
if (vd->vdev_children == 0) { if (vd->vdev_children == 0) {
mutex_enter(&vd->vdev_dtl_lock); mutex_enter(&vd->vdev_dtl_lock);
if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) { if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
vdev_writeable(vd)) {
space_seg_t *ss; space_seg_t *ss;
ss = avl_first(&vd->vdev_dtl_map.sm_root); ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
thismin = ss->ss_start - 1; thismin = ss->ss_start - 1;
ss = avl_last(&vd->vdev_dtl_map.sm_root); ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
thismax = ss->ss_end; thismax = ss->ss_end;
needed = B_TRUE; needed = B_TRUE;
} }
mutex_exit(&vd->vdev_dtl_lock); mutex_exit(&vd->vdev_dtl_lock);
} else { } else {
int c; for (int c = 0; c < vd->vdev_children; c++) {
for (c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c]; vdev_t *cvd = vd->vdev_child[c];
uint64_t cmin, cmax; uint64_t cmin, cmax;
@ -1508,12 +1639,10 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
void void
vdev_load(vdev_t *vd) vdev_load(vdev_t *vd)
{ {
int c;
/* /*
* Recursively load all children. * Recursively load all children.
*/ */
for (c = 0; c < vd->vdev_children; c++) for (int c = 0; c < vd->vdev_children; c++)
vdev_load(vd->vdev_child[c]); vdev_load(vd->vdev_child[c]);
/* /*
@ -1733,11 +1862,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
vd->vdev_parent->vdev_child[0] == vd) vd->vdev_parent->vdev_child[0] == vd)
vd->vdev_unspare = B_TRUE; vd->vdev_unspare = B_TRUE;
(void) spa_vdev_state_exit(spa, vd, 0); return (spa_vdev_state_exit(spa, vd, 0));
VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
return (0);
} }
int int
@ -1758,13 +1883,10 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
*/ */
if (!vd->vdev_offline) { if (!vd->vdev_offline) {
/* /*
* If this device's top-level vdev has a non-empty DTL, * If this device has the only valid copy of some data,
* don't allow the device to be offlined. * don't allow it to be offlined.
*
* XXX -- make this more precise by allowing the offline
* as long as the remaining devices don't have any DTL holes.
*/ */
if (vd->vdev_top->vdev_dtl_map.sm_space != 0) if (vd->vdev_aux == NULL && vdev_dtl_required(vd))
return (spa_vdev_state_exit(spa, NULL, EBUSY)); return (spa_vdev_state_exit(spa, NULL, EBUSY));
/* /*
@ -1774,7 +1896,7 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
*/ */
vd->vdev_offline = B_TRUE; vd->vdev_offline = B_TRUE;
vdev_reopen(vd->vdev_top); vdev_reopen(vd->vdev_top);
if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) {
vd->vdev_offline = B_FALSE; vd->vdev_offline = B_FALSE;
vdev_reopen(vd->vdev_top); vdev_reopen(vd->vdev_top);
return (spa_vdev_state_exit(spa, NULL, EBUSY)); return (spa_vdev_state_exit(spa, NULL, EBUSY));
@ -1855,13 +1977,17 @@ vdev_writeable(vdev_t *vd)
boolean_t boolean_t
vdev_allocatable(vdev_t *vd) vdev_allocatable(vdev_t *vd)
{ {
uint64_t state = vd->vdev_state;
/* /*
* We currently allow allocations from vdevs which maybe in the * We currently allow allocations from vdevs which may be in the
* process of reopening (i.e. VDEV_STATE_CLOSED). If the device * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
* fails to reopen then we'll catch it later when we're holding * fails to reopen then we'll catch it later when we're holding
* the proper locks. * the proper locks. Note that we have to get the vdev state
* in a local variable because although it changes atomically,
* we're asking two separate questions about it.
*/ */
return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) && return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
!vd->vdev_cant_write); !vd->vdev_cant_write);
} }
@ -1932,7 +2058,8 @@ vdev_clear_stats(vdev_t *vd)
void void
vdev_stat_update(zio_t *zio, uint64_t psize) vdev_stat_update(zio_t *zio, uint64_t psize)
{ {
vdev_t *rvd = zio->io_spa->spa_root_vdev; spa_t *spa = zio->io_spa;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
vdev_t *pvd; vdev_t *pvd;
uint64_t txg = zio->io_txg; uint64_t txg = zio->io_txg;
@ -1965,21 +2092,23 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
return; return;
ASSERT(vd == zio->io_vd); ASSERT(vd == zio->io_vd);
if (!(flags & ZIO_FLAG_IO_BYPASS)) {
mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_BYPASS)
vs->vs_ops[type]++; return;
vs->vs_bytes[type] += psize;
mutex_exit(&vd->vdev_stat_lock); mutex_enter(&vd->vdev_stat_lock);
}
if (flags & ZIO_FLAG_IO_REPAIR) { if (flags & ZIO_FLAG_IO_REPAIR) {
ASSERT(zio->io_delegate_list == NULL);
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_SCRUB_THREAD) if (flags & ZIO_FLAG_SCRUB_THREAD)
vs->vs_scrub_repaired += psize; vs->vs_scrub_repaired += psize;
else if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize; vs->vs_self_healed += psize;
mutex_exit(&vd->vdev_stat_lock);
} }
vs->vs_ops[type]++;
vs->vs_bytes[type] += psize;
mutex_exit(&vd->vdev_stat_lock);
return; return;
} }
@ -1997,19 +2126,39 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
vs->vs_write_errors++; vs->vs_write_errors++;
mutex_exit(&vd->vdev_stat_lock); mutex_exit(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) { if (type == ZIO_TYPE_WRITE && txg != 0 &&
if (flags & ZIO_FLAG_SCRUB_THREAD) { (!(flags & ZIO_FLAG_IO_REPAIR) ||
ASSERT(flags & ZIO_FLAG_IO_REPAIR); (flags & ZIO_FLAG_SCRUB_THREAD))) {
for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) /*
vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); * This is either a normal write (not a repair), or it's a
} * repair induced by the scrub thread. In the normal case,
if (!(flags & ZIO_FLAG_IO_REPAIR)) { * we commit the DTL change in the same txg as the block
if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) * was born. In the scrub-induced repair case, we know that
* scrubs run in first-pass syncing context, so we commit
* the DTL change in spa->spa_syncing_txg.
*
* We currently do not make DTL entries for failed spontaneous
* self-healing writes triggered by normal (non-scrubbing)
* reads, because we have no transactional context in which to
* do so -- and it's not clear that it'd be desirable anyway.
*/
if (vd->vdev_ops->vdev_op_leaf) {
uint64_t commit_txg = txg;
if (flags & ZIO_FLAG_SCRUB_THREAD) {
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
ASSERT(spa_sync_pass(spa) == 1);
vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
commit_txg = spa->spa_syncing_txg;
}
ASSERT(commit_txg >= spa->spa_syncing_txg);
if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
return; return;
vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
} }
if (vd != rvd)
vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
} }
} }
@ -2222,7 +2371,8 @@ vdev_state_clean(vdev_t *vd)
void void
vdev_propagate_state(vdev_t *vd) vdev_propagate_state(vdev_t *vd)
{ {
vdev_t *rvd = vd->vdev_spa->spa_root_vdev; spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
int degraded = 0, faulted = 0; int degraded = 0, faulted = 0;
int corrupted = 0; int corrupted = 0;
int c; int c;
@ -2233,7 +2383,7 @@ vdev_propagate_state(vdev_t *vd)
child = vd->vdev_child[c]; child = vd->vdev_child[c];
if (!vdev_readable(child) || if (!vdev_readable(child) ||
(!vdev_writeable(child) && (spa_mode & FWRITE))) { (!vdev_writeable(child) && spa_writeable(spa))) {
/* /*
* Root special: if there is a top-level log * Root special: if there is a top-level log
* device, treat the root vdev as if it were * device, treat the root vdev as if it were

View File

@ -61,7 +61,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
*/ */
ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
if (error) { if (error) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
@ -105,7 +105,8 @@ vdev_file_close(vdev_t *vd)
if (vf->vf_vnode != NULL) { if (vf->vf_vnode != NULL) {
(void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
(void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL); (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
kcred, NULL);
VN_RELE(vf->vf_vnode); VN_RELE(vf->vf_vnode);
} }

View File

@ -277,9 +277,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vd->vdev_islog) == 0); vd->vdev_islog) == 0);
} }
if (vd->vdev_dtl.smo_object != 0) if (vd->vdev_dtl_smo.smo_object != 0)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
vd->vdev_dtl.smo_object) == 0); vd->vdev_dtl_smo.smo_object) == 0);
if (getstats) { if (getstats) {
vdev_stat_t vs; vdev_stat_t vs;
@ -524,9 +524,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
return (EBUSY); return (EBUSY);
ASSERT(reason != VDEV_LABEL_REMOVE ||
vdev_inuse(vd, crtxg, reason, NULL, NULL));
/* /*
* If this is a request to add or replace a spare or l2cache device * If this is a request to add or replace a spare or l2cache device
* that is in use elsewhere on the system, then we must update the * that is in use elsewhere on the system, then we must update the
@ -708,6 +705,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
* ========================================================================== * ==========================================================================
*/ */
/*
 * For use by zdb and debugging purposes only.
 *
 * Upper bound on the txg of the uberblocks that
 * vdev_uberblock_load_done() will consider when picking the best one:
 * an uberblock whose ub_txg exceeds this value is never selected.
 * The default, UINT64_MAX, effectively places no limit on the search.
 */
uint64_t ub_max_txg = UINT64_MAX;
/* /*
* Consider the following situation: txg is safely synced to disk. We've * Consider the following situation: txg is safely synced to disk. We've
* written the first uberblock for txg + 1, and then we lose power. When we * written the first uberblock for txg + 1, and then we lose power. When we
@ -745,7 +747,8 @@ vdev_uberblock_load_done(zio_t *zio)
if (zio->io_error == 0 && uberblock_verify(ub) == 0) { if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
mutex_enter(&rio->io_lock); mutex_enter(&rio->io_lock);
if (vdev_uberblock_compare(ub, ubbest) > 0) if (ub->ub_txg <= ub_max_txg &&
vdev_uberblock_compare(ub, ubbest) > 0)
*ubbest = *ub; *ubbest = *ub;
mutex_exit(&rio->io_lock); mutex_exit(&rio->io_lock);
} }

View File

@ -225,7 +225,7 @@ vdev_mirror_child_select(zio_t *zio)
mc->mc_skipped = 1; mc->mc_skipped = 1;
continue; continue;
} }
if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
return (c); return (c);
mc->mc_error = ESTALE; mc->mc_error = ESTALE;
mc->mc_skipped = 1; mc->mc_skipped = 1;
@ -282,20 +282,10 @@ vdev_mirror_io_start(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_type == ZIO_TYPE_WRITE);
/* /*
* If this is a resilvering I/O to a replacing vdev, * Writes go to all children.
* only the last child should be written -- unless the
* first child happens to have a DTL entry here as well.
* All other writes go to all children.
*/ */
if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && c = 0;
!vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, children = mm->mm_children;
zio->io_txg, 1)) {
c = mm->mm_children - 1;
children = 1;
} else {
c = 0;
children = mm->mm_children;
}
} }
while (children--) { while (children--) {
@ -398,7 +388,7 @@ vdev_mirror_io_done(zio_t *zio)
ASSERT(zio->io_error != 0); ASSERT(zio->io_error != 0);
} }
if (good_copies && (spa_mode & FWRITE) && if (good_copies && spa_writeable(zio->io_spa) &&
(unexpected_errors || (unexpected_errors ||
(zio->io_flags & ZIO_FLAG_RESILVER) || (zio->io_flags & ZIO_FLAG_RESILVER) ||
((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
@ -419,7 +409,7 @@ vdev_mirror_io_done(zio_t *zio)
if (mc->mc_tried) if (mc->mc_tried)
continue; continue;
if (!(zio->io_flags & ZIO_FLAG_SCRUB) && if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
zio->io_txg, 1)) zio->io_txg, 1))
continue; continue;
mc->mc_error = ESTALE; mc->mc_error = ESTALE;
@ -429,7 +419,8 @@ vdev_mirror_io_done(zio_t *zio)
mc->mc_vd, mc->mc_offset, mc->mc_vd, mc->mc_offset,
zio->io_data, zio->io_size, zio->io_data, zio->io_size,
ZIO_TYPE_WRITE, zio->io_priority, ZIO_TYPE_WRITE, zio->io_priority,
ZIO_FLAG_IO_REPAIR, NULL, NULL)); ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
} }
} }
} }

View File

@ -176,6 +176,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
zio_t *fio, *lio, *aio, *dio; zio_t *fio, *lio, *aio, *dio;
avl_tree_t *tree; avl_tree_t *tree;
uint64_t size; uint64_t size;
int flags;
ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT(MUTEX_HELD(&vq->vq_lock));
@ -187,21 +188,32 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
tree = fio->io_vdev_tree; tree = fio->io_vdev_tree;
size = fio->io_size; size = fio->io_size;
flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
!((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && /*
size + dio->io_size <= zfs_vdev_aggregation_limit) { * We can aggregate I/Os that are adjacent and of the
dio->io_delegate_next = fio; * same flavor, as expressed by the AGG_INHERIT flags.
fio = dio; * The latter is necessary so that certain attributes
size += dio->io_size; * of the I/O, such as whether it's a normal I/O or a
} * scrub/resilver, can be preserved in the aggregate.
*/
while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && while ((dio = AVL_PREV(tree, fio)) != NULL &&
!((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && IS_ADJACENT(dio, fio) &&
size + dio->io_size <= zfs_vdev_aggregation_limit) { (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
lio->io_delegate_next = dio; size + dio->io_size <= zfs_vdev_aggregation_limit) {
lio = dio; dio->io_delegate_next = fio;
size += dio->io_size; fio = dio;
size += dio->io_size;
}
while ((dio = AVL_NEXT(tree, lio)) != NULL &&
IS_ADJACENT(lio, dio) &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
size + dio->io_size <= zfs_vdev_aggregation_limit) {
lio->io_delegate_next = dio;
lio = dio;
size += dio->io_size;
}
} }
if (fio != lio) { if (fio != lio) {
@ -212,7 +224,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
buf, size, fio->io_type, ZIO_PRIORITY_NOW, buf, size, fio->io_type, ZIO_PRIORITY_NOW,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL); vdev_queue_agg_io_done, NULL);
aio->io_delegate_list = fio; aio->io_delegate_list = fio;

View File

@ -687,7 +687,7 @@ vdev_raidz_io_start(zio_t *zio)
rc->rc_skipped = 1; rc->rc_skipped = 1;
continue; continue;
} }
if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
if (c >= rm->rm_firstdatacol) if (c >= rm->rm_firstdatacol)
rm->rm_missingdata++; rm->rm_missingdata++;
else else
@ -1165,7 +1165,7 @@ vdev_raidz_io_done(zio_t *zio)
done: done:
zio_checksum_verified(zio); zio_checksum_verified(zio);
if (zio->io_error == 0 && (spa_mode & FWRITE) && if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
(unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
/* /*
* Use the good data we have in hand to repair damaged children. * Use the good data we have in hand to repair damaged children.
@ -1180,7 +1180,8 @@ done:
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_data, rc->rc_size, rc->rc_offset, rc->rc_data, rc->rc_size,
ZIO_TYPE_WRITE, zio->io_priority, ZIO_TYPE_WRITE, zio->io_priority,
ZIO_FLAG_IO_REPAIR, NULL, NULL)); ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
} }
} }
} }

View File

@ -2148,12 +2148,12 @@ top:
} }
} }
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_acl_lock);
mutex_exit(&zp->z_lock); mutex_exit(&zp->z_lock);
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -2208,7 +2208,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
*check_privs = B_TRUE; *check_privs = B_TRUE;
if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ if (zfsvfs->z_replay) {
*working_mode = 0; *working_mode = 0;
return (0); return (0);
} }

View File

@ -19,12 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/vfs.h> #include <sys/vfs.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
@ -63,6 +61,20 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
while (ptr < end) { while (ptr < end) {
if (zfs_layout) { if (zfs_layout) {
/*
* Avoid overrun. Embedded aces can have one
* of several sizes. We don't know exactly
* how many are present, only the size of the
* buffer containing them. That size may be
* larger than needed to hold the aces
* present. As long as we do not do any
* swapping beyond the end of our block we are
* okay. It is safe to swap any non-ace data
* within the block since it is just zeros.
*/
if (ptr + sizeof (zfs_ace_hdr_t) > end) {
break;
}
zacep = (zfs_ace_t *)ptr; zacep = (zfs_ace_t *)ptr;
zacep->z_hdr.z_access_mask = zacep->z_hdr.z_access_mask =
BSWAP_32(zacep->z_hdr.z_access_mask); BSWAP_32(zacep->z_hdr.z_access_mask);
@ -71,6 +83,10 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
BSWAP_16(zacep->z_hdr.z_type); BSWAP_16(zacep->z_hdr.z_type);
entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
} else { } else {
/* Overrun avoidance */
if (ptr + sizeof (ace_t) > end) {
break;
}
acep = (ace_t *)ptr; acep = (ace_t *)ptr;
acep->a_access_mask = BSWAP_32(acep->a_access_mask); acep->a_access_mask = BSWAP_32(acep->a_access_mask);
acep->a_flags = BSWAP_16(acep->a_flags); acep->a_flags = BSWAP_16(acep->a_flags);
@ -87,8 +103,14 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
break; break;
case ACE_IDENTIFIER_GROUP: case ACE_IDENTIFIER_GROUP:
default: default:
/* Overrun avoidance */
if (zfs_layout) { if (zfs_layout) {
zacep->z_fuid = BSWAP_64(zacep->z_fuid); if (ptr + sizeof (zfs_ace_t) <= end) {
zacep->z_fuid = BSWAP_64(zacep->z_fuid);
} else {
entry_size = sizeof (zfs_ace_t);
break;
}
} }
switch (ace_type) { switch (ace_type) {
case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
@ -169,7 +191,8 @@ zfs_znode_byteswap(void *buf, size_t size)
if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) {
zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0],
ZFS_ACE_SPACE); ZFS_ACE_SPACE);
} else } else {
zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0],
ACE_SLOT_CNT); ACE_SLOT_CNT);
}
} }

View File

@ -561,24 +561,6 @@ zfs_rmnode(znode_t *zp)
ASSERT(ZTOV(zp)->v_count == 0); ASSERT(ZTOV(zp)->v_count == 0);
ASSERT(zp->z_phys->zp_links == 0); ASSERT(zp->z_phys->zp_links == 0);
/*
* If this is a ZIL replay then leave the object in the unlinked set.
* Otherwise we can get a deadlock, because the delete can be
* quite large and span multiple tx's and txgs, but each replay
* creates a tx to atomically run the replay function and mark the
* replay record as complete. We deadlock trying to start a tx in
* a new txg to further the deletion but can't because the replay
* tx hasn't finished.
*
* We actually delete the object if we get a failure to create an
* object in zil_replay_log_record(), or after calling zil_replay().
*/
if (zfsvfs->z_assign >= TXG_INITIAL) {
zfs_znode_dmu_fini(zp);
zfs_znode_free(zp);
return;
}
/* /*
* If this is an attribute directory, purge its contents. * If this is an attribute directory, purge its contents.
*/ */
@ -845,9 +827,9 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
FUID_SIZE_ESTIMATE(zfsvfs)); FUID_SIZE_ESTIMATE(zfsvfs));
} }
} }
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) if (error == ERESTART)
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
return (error); return (error);
@ -930,7 +912,7 @@ top:
error = zfs_make_xattrdir(zp, &va, xvpp, cr); error = zfs_make_xattrdir(zp, &va, xvpp, cr);
zfs_dirent_unlock(dl); zfs_dirent_unlock(dl);
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
/* NB: we already did dmu_tx_wait() if necessary */ /* NB: we already did dmu_tx_wait() if necessary */
goto top; goto top;
} }
@ -959,7 +941,7 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
uid_t fowner; uid_t fowner;
zfsvfs_t *zfsvfs = zdp->z_zfsvfs; zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ if (zdp->z_zfsvfs->z_replay)
return (0); return (0);
if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)

View File

@ -519,7 +519,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
uint32_t rid; uint32_t rid;
idmap_stat status; idmap_stat status;
uint64_t idx; uint64_t idx;
boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL);
zfs_fuid_t *zfuid = NULL; zfs_fuid_t *zfuid = NULL;
zfs_fuid_info_t *fuidp; zfs_fuid_info_t *fuidp;
@ -534,7 +533,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
return (id); return (id);
if (is_replay) { if (zfsvfs->z_replay) {
fuidp = zfsvfs->z_fuid_replay; fuidp = zfsvfs->z_fuid_replay;
/* /*
@ -584,7 +583,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
if (!is_replay) if (!zfsvfs->z_replay)
zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
else if (zfuid != NULL) { else if (zfuid != NULL) {
list_remove(&fuidp->z_fuids, zfuid); list_remove(&fuidp->z_fuids, zfuid);

View File

@ -856,9 +856,10 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
{ {
int error; int error;
boolean_t force = (boolean_t)zc->zc_cookie; boolean_t force = (boolean_t)zc->zc_cookie;
boolean_t hardforce = (boolean_t)zc->zc_guid;
zfs_log_history(zc); zfs_log_history(zc);
error = spa_export(zc->zc_name, NULL, force); error = spa_export(zc->zc_name, NULL, force, hardforce);
return (error); return (error);
} }
@ -1162,7 +1163,7 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc)
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error); return (error);
error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
spa_close(spa, FTAG); spa_close(spa, FTAG);
return (error); return (error);

View File

@ -45,13 +45,33 @@
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/zfs_fuid.h> #include <sys/zfs_fuid.h>
#include <sys/ddi.h> #include <sys/ddi.h>
#include <sys/dsl_dataset.h>
/*
 * Replay short-circuit for the zfs_log_* functions.
 *
 * If the zilog is replaying (zl_replay is set), mark the log's dataset
 * dirty in this tx, record the sequence number currently being replayed
 * in the zl_replayed_seq slot for the tx's txg, and return from the
 * CALLING function (which must therefore return void).  Otherwise this
 * expands to nothing observable and the caller carries on.
 *
 * Wrapped in do { } while (0) so that "ZFS_HANDLE_REPLAY(z, tx);" is a
 * single statement and cannot capture a following else; arguments are
 * parenthesized so arbitrary expressions may be passed.
 */
#define	ZFS_HANDLE_REPLAY(zilog, tx) \
	do { \
		if ((zilog)->zl_replay) { \
			dsl_dataset_dirty(dmu_objset_ds((zilog)->zl_os), \
			    (tx)); \
			(zilog)->zl_replayed_seq[dmu_tx_get_txg(tx) & \
			    TXG_MASK] = (zilog)->zl_replaying_seq; \
			return; \
		} \
	} while (0)
/* /*
* All the functions in this file are used to construct the log entries * These zfs_log_* functions must be called within a dmu tx, in one
* to record transactions. They allocate * an intent log transaction * of 2 contexts depending on zilog->zl_replay:
* structure (itx_t) and save within it all the information necessary to *
* possibly replay the transaction. The itx is then assigned a sequence * Non replay mode
* number and inserted in the in-memory list anchored in the zilog. * ---------------
* We need to record the transaction so that if it is committed to
* the Intent Log then it can be replayed. An intent log transaction
* structure (itx_t) is allocated and all the information necessary to
* possibly replay the transaction is saved in it. The itx is then assigned
* a sequence number and inserted in the in-memory list anchored in the zilog.
*
* Replay mode
* -----------
* We need to mark the intent log record as replayed in the log header.
* This is done in the same transaction as the replay so that they
* commit atomically.
*/ */
int int
@ -231,6 +251,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
if (zilog == NULL) if (zilog == NULL)
return; return;
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
/* /*
* If we have FUIDs present then add in space for * If we have FUIDs present then add in space for
* domains and ACE fuid's if any. * domains and ACE fuid's if any.
@ -334,6 +356,8 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
if (zilog == NULL) if (zilog == NULL)
return; return;
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
itx = zil_itx_create(txtype, sizeof (*lr) + namesize); itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
lr = (lr_remove_t *)&itx->itx_lr; lr = (lr_remove_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id; lr->lr_doid = dzp->z_id;
@ -358,6 +382,8 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
if (zilog == NULL) if (zilog == NULL)
return; return;
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
itx = zil_itx_create(txtype, sizeof (*lr) + namesize); itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
lr = (lr_link_t *)&itx->itx_lr; lr = (lr_link_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id; lr->lr_doid = dzp->z_id;
@ -385,6 +411,8 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
if (zilog == NULL) if (zilog == NULL)
return; return;
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
lr = (lr_create_t *)&itx->itx_lr; lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id; lr->lr_doid = dzp->z_id;
@ -419,6 +447,8 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
if (zilog == NULL) if (zilog == NULL)
return; return;
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
lr = (lr_rename_t *)&itx->itx_lr; lr = (lr_rename_t *)&itx->itx_lr;
lr->lr_sdoid = sdzp->z_id; lr->lr_sdoid = sdzp->z_id;
@ -451,6 +481,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
if (zilog == NULL || zp->z_unlinked) if (zilog == NULL || zp->z_unlinked)
return; return;
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
/* /*
* Writes are handled in three different ways: * Writes are handled in three different ways:
* *
@ -549,6 +581,8 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
if (zilog == NULL || zp->z_unlinked) if (zilog == NULL || zp->z_unlinked)
return; return;
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
itx = zil_itx_create(txtype, sizeof (*lr)); itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_truncate_t *)&itx->itx_lr; lr = (lr_truncate_t *)&itx->itx_lr;
lr->lr_foid = zp->z_id; lr->lr_foid = zp->z_id;
@ -578,6 +612,8 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
if (zilog == NULL || zp->z_unlinked) if (zilog == NULL || zp->z_unlinked)
return; return;
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
/* /*
* If XVATTR set, then log record size needs to allow * If XVATTR set, then log record size needs to allow
* for lr_attr_t + xvattr mask, mapsize and create time * for lr_attr_t + xvattr mask, mapsize and create time
@ -644,6 +680,8 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
if (zilog == NULL || zp->z_unlinked) if (zilog == NULL || zp->z_unlinked)
return; return;
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
TX_ACL_V0 : TX_ACL; TX_ACL_V0 : TX_ACL;

View File

@ -583,21 +583,50 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
* allow replays to succeed. * allow replays to succeed.
*/ */
readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; if (readonly != 0)
zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
else
zfs_unlinked_drain(zfsvfs);
/* zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
* Parse and replay the intent log. if (zil_disable) {
*/ zil_destroy(zfsvfs->z_log, 0);
zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, zfsvfs->z_log = NULL;
zfs_replay_vector, zfs_unlinked_drain); } else {
/*
zfs_unlinked_drain(zfsvfs); * Parse and replay the intent log.
*
* Because of ziltest, this must be done after
* zfs_unlinked_drain(). (Further note: ziltest
* doesn't use readonly mounts, where
* zfs_unlinked_drain() isn't called.) This is because
* ziltest causes spa_sync() to think it's committed,
* but actually it is not, so the intent log contains
* many txg's worth of changes.
*
* In particular, if object N is in the unlinked set in
* the last txg to actually sync, then it could be
* actually freed in a later txg and then reallocated
* in a yet later txg. This would write a "create
* object N" record to the intent log. Normally, this
* would be fine because the spa_sync() would have
* written out the fact that object N is free, before
* we could write the "create object N" intent log
* record.
*
* But when we are in ziltest mode, we advance the "open
* txg" without actually spa_sync()-ing the changes to
* disk. So we would see that object N is still
* allocated and in the unlinked set, and there is an
* intent log record saying to allocate it.
*/
zfsvfs->z_replay = B_TRUE;
zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
zfsvfs->z_replay = B_FALSE;
}
zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
} }
if (!zil_disable)
zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
return (0); return (0);
} }
@ -634,7 +663,6 @@ zfs_domount(vfs_t *vfsp, char *osname)
zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
zfsvfs->z_vfs = vfsp; zfsvfs->z_vfs = vfsp;
zfsvfs->z_parent = zfsvfs; zfsvfs->z_parent = zfsvfs;
zfsvfs->z_assign = TXG_NOWAIT;
zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;

View File

@ -105,9 +105,7 @@
* (3) All range locks must be grabbed before calling dmu_tx_assign(), * (3) All range locks must be grabbed before calling dmu_tx_assign(),
* as they can span dmu_tx_assign() calls. * as they can span dmu_tx_assign() calls.
* *
* (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
* In normal operation, this will be TXG_NOWAIT. During ZIL replay,
* it will be a specific txg. Either way, dmu_tx_assign() never blocks.
* This is critical because we don't want to block while holding locks. * This is critical because we don't want to block while holding locks.
* Note, in particular, that if a lock is sometimes acquired before * Note, in particular, that if a lock is sometimes acquired before
* the tx assigns, and sometimes after (e.g. z_lock), then failing to * the tx assigns, and sometimes after (e.g. z_lock), then failing to
@ -124,6 +122,8 @@
* (5) If the operation succeeded, generate the intent log entry for it * (5) If the operation succeeded, generate the intent log entry for it
* before dropping locks. This ensures that the ordering of events * before dropping locks. This ensures that the ordering of events
* in the intent log matches the order in which they actually occurred. * in the intent log matches the order in which they actually occurred.
* During ZIL replay the zfs_log_* functions will update the sequence
* number to indicate the zil transaction has replayed.
* *
* (6) At the end of each vnode op, the DMU tx must always commit, * (6) At the end of each vnode op, the DMU tx must always commit,
* regardless of whether there were any errors. * regardless of whether there were any errors.
@ -139,12 +139,12 @@
* rw_enter(...); // grab any other locks you need * rw_enter(...); // grab any other locks you need
* tx = dmu_tx_create(...); // get DMU tx * tx = dmu_tx_create(...); // get DMU tx
* dmu_tx_hold_*(); // hold each object you might modify * dmu_tx_hold_*(); // hold each object you might modify
* error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign
* if (error) { * if (error) {
* rw_exit(...); // drop locks * rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry * zfs_dirent_unlock(dl); // unlock directory entry
* VN_RELE(...); // release held vnodes * VN_RELE(...); // release held vnodes
* if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { * if (error == ERESTART) {
* dmu_tx_wait(tx); * dmu_tx_wait(tx);
* dmu_tx_abort(tx); * dmu_tx_abort(tx);
* goto top; * goto top;
@ -698,10 +698,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
tx = dmu_tx_create(zfsvfs->z_os); tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_bonus(tx, zp->z_id);
dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
if (error == ERESTART && if (error == ERESTART) {
zfsvfs->z_assign == TXG_NOWAIT) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
continue; continue;
@ -807,7 +806,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
* If we're in replay mode, or we made no progress, return error. * If we're in replay mode, or we made no progress, return error.
* Otherwise, it's at least a partial write, so it's successful. * Otherwise, it's at least a partial write, so it's successful.
*/ */
if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
ZFS_EXIT(zfsvfs); ZFS_EXIT(zfsvfs);
return (error); return (error);
} }
@ -1233,11 +1232,10 @@ top:
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, SPA_MAXBLOCKSIZE); 0, SPA_MAXBLOCKSIZE);
} }
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
zfs_dirent_unlock(dl); zfs_dirent_unlock(dl);
if (error == ERESTART && if (error == ERESTART) {
zfsvfs->z_assign == TXG_NOWAIT) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -1449,11 +1447,11 @@ top:
/* charge as an update -- would be nice not to charge at all */ /* charge as an update -- would be nice not to charge at all */
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
zfs_dirent_unlock(dl); zfs_dirent_unlock(dl);
VN_RELE(vp); VN_RELE(vp);
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -1659,10 +1657,10 @@ top:
if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, SPA_MAXBLOCKSIZE); 0, SPA_MAXBLOCKSIZE);
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
zfs_dirent_unlock(dl); zfs_dirent_unlock(dl);
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -1789,13 +1787,13 @@ top:
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_bonus(tx, zp->z_id);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_parent_lock);
rw_exit(&zp->z_name_lock); rw_exit(&zp->z_name_lock);
zfs_dirent_unlock(dl); zfs_dirent_unlock(dl);
VN_RELE(vp); VN_RELE(vp);
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -2342,6 +2340,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
zilog_t *zilog; zilog_t *zilog;
dmu_tx_t *tx; dmu_tx_t *tx;
vattr_t oldva; vattr_t oldva;
xvattr_t tmpxvattr;
uint_t mask = vap->va_mask; uint_t mask = vap->va_mask;
uint_t saved_mask; uint_t saved_mask;
int trim_mask = 0; int trim_mask = 0;
@ -2396,6 +2395,8 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
*/ */
xoap = xva_getxoptattr(xvap); xoap = xva_getxoptattr(xvap);
xva_init(&tmpxvattr);
/* /*
* Immutable files can only alter immutable bit and atime * Immutable files can only alter immutable bit and atime
*/ */
@ -2518,28 +2519,78 @@ top:
oldva.va_mode = pzp->zp_mode; oldva.va_mode = pzp->zp_mode;
zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
if (mask & AT_XVATTR) { if (mask & AT_XVATTR) {
if ((need_policy == FALSE) && /*
(XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && * Update xvattr mask to include only those attributes
xoap->xoa_appendonly != * that are actually changing.
((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || *
(XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && * the bits will be restored prior to actually setting
xoap->xoa_nounlink != * the attributes so the caller thinks they were set.
((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || */
(XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
xoap->xoa_immutable != if (xoap->xoa_appendonly !=
((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
(XVA_ISSET_REQ(xvap, XAT_NODUMP) && need_policy = TRUE;
xoap->xoa_nodump != } else {
((pzp->zp_flags & ZFS_NODUMP) != 0)) || XVA_CLR_REQ(xvap, XAT_APPENDONLY);
(XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
xoap->xoa_av_modified != }
((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || }
((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
((vp->v_type != VREG && xoap->xoa_av_quarantined) || if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
xoap->xoa_av_quarantined != if (xoap->xoa_nounlink !=
((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
(XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || need_policy = TRUE;
(XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { } else {
XVA_CLR_REQ(xvap, XAT_NOUNLINK);
XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
}
}
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
if (xoap->xoa_immutable !=
((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
}
}
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
if (xoap->xoa_nodump !=
((pzp->zp_flags & ZFS_NODUMP) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_NODUMP);
XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
}
}
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
if (xoap->xoa_av_modified !=
((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
}
}
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
if ((vp->v_type != VREG &&
xoap->xoa_av_quarantined) ||
xoap->xoa_av_quarantined !=
((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
}
}
if (need_policy == FALSE &&
(XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
need_policy = TRUE; need_policy = TRUE;
} }
} }
@ -2649,7 +2700,7 @@ top:
dmu_tx_hold_bonus(tx, attrzp->z_id); dmu_tx_hold_bonus(tx, attrzp->z_id);
} }
err = dmu_tx_assign(tx, zfsvfs->z_assign); err = dmu_tx_assign(tx, TXG_NOWAIT);
if (err) { if (err) {
if (attrzp) if (attrzp)
VN_RELE(ZTOV(attrzp)); VN_RELE(ZTOV(attrzp));
@ -2659,7 +2710,7 @@ top:
aclp = NULL; aclp = NULL;
} }
if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (err == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -2732,6 +2783,31 @@ top:
*/ */
if (xoap && (mask & AT_XVATTR)) { if (xoap && (mask & AT_XVATTR)) {
/*
* restore trimmed off masks
* so that return masks can be set for caller.
*/
if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
XVA_SET_REQ(xvap, XAT_APPENDONLY);
}
if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
XVA_SET_REQ(xvap, XAT_NOUNLINK);
}
if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
XVA_SET_REQ(xvap, XAT_IMMUTABLE);
}
if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
XVA_SET_REQ(xvap, XAT_NODUMP);
}
if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
}
if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
size_t len; size_t len;
dmu_object_info_t doi; dmu_object_info_t doi;
@ -3104,7 +3180,7 @@ top:
if (tzp) if (tzp)
dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
if (zl != NULL) if (zl != NULL)
zfs_rename_unlock(&zl); zfs_rename_unlock(&zl);
@ -3113,7 +3189,7 @@ top:
VN_RELE(ZTOV(szp)); VN_RELE(ZTOV(szp));
if (tzp) if (tzp)
VN_RELE(ZTOV(tzp)); VN_RELE(ZTOV(tzp));
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -3242,10 +3318,10 @@ top:
FUID_SIZE_ESTIMATE(zfsvfs)); FUID_SIZE_ESTIMATE(zfsvfs));
} }
} }
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
zfs_dirent_unlock(dl); zfs_dirent_unlock(dl);
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -3462,10 +3538,10 @@ top:
tx = dmu_tx_create(zfsvfs->z_os); tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, szp->z_id); dmu_tx_hold_bonus(tx, szp->z_id);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
zfs_dirent_unlock(dl); zfs_dirent_unlock(dl);
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -3547,7 +3623,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
len = PAGESIZE; len = PAGESIZE;
/* /*
* If our blocksize is bigger than the page size, try to kluster * If our blocksize is bigger than the page size, try to kluster
* muiltiple pages so that we write a full block (thus avoiding * multiple pages so that we write a full block (thus avoiding
* a read-modify-write). * a read-modify-write).
*/ */
if (off < filesz && zp->z_blksz > PAGESIZE) { if (off < filesz && zp->z_blksz > PAGESIZE) {
@ -3589,9 +3665,9 @@ top:
tx = dmu_tx_create(zfsvfs->z_os); tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_write(tx, zp->z_id, off, len);
dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_bonus(tx, zp->z_id);
err = dmu_tx_assign(tx, zfsvfs->z_assign); err = dmu_tx_assign(tx, TXG_NOWAIT);
if (err != 0) { if (err != 0) {
if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (err == ERESTART) {
zfs_range_unlock(rl); zfs_range_unlock(rl);
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);

View File

@ -734,7 +734,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ if (zfsvfs->z_replay) {
obj = vap->va_nodeid; obj = vap->va_nodeid;
flag |= IS_REPLAY; flag |= IS_REPLAY;
now = vap->va_ctime; /* see zfs_replay_create() */ now = vap->va_ctime; /* see zfs_replay_create() */
@ -1254,9 +1254,9 @@ top:
newblksz = 0; newblksz = 0;
} }
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -1358,9 +1358,9 @@ zfs_trunc(znode_t *zp, uint64_t end)
top: top:
tx = dmu_tx_create(zfsvfs->z_os); tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_bonus(tx, zp->z_id);
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto top; goto top;
@ -1456,9 +1456,9 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
log: log:
tx = dmu_tx_create(zfsvfs->z_os); tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_bonus(tx, zp->z_id);
error = dmu_tx_assign(tx, zfsvfs->z_assign); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { if (error == ERESTART) {
dmu_tx_wait(tx); dmu_tx_wait(tx);
dmu_tx_abort(tx); dmu_tx_abort(tx);
goto log; goto log;
@ -1562,7 +1562,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
bzero(&zfsvfs, sizeof (zfsvfs_t)); bzero(&zfsvfs, sizeof (zfsvfs_t));
zfsvfs.z_os = os; zfsvfs.z_os = os;
zfsvfs.z_assign = TXG_NOWAIT;
zfsvfs.z_parent = &zfsvfs; zfsvfs.z_parent = &zfsvfs;
zfsvfs.z_version = version; zfsvfs.z_version = version;
zfsvfs.z_use_fuids = USE_FUIDS(version, os); zfsvfs.z_use_fuids = USE_FUIDS(version, os);

View File

@ -351,14 +351,20 @@ zil_create(zilog_t *zilog)
blk = zh->zh_log; blk = zh->zh_log;
/* /*
* If we don't already have an initial log block, allocate one now. * If we don't already have an initial log block or we have one
* but it's the wrong endianness then allocate one.
*/ */
if (BP_IS_HOLE(&blk)) { if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
tx = dmu_tx_create(zilog->zl_os); tx = dmu_tx_create(zilog->zl_os);
(void) dmu_tx_assign(tx, TXG_WAIT); (void) dmu_tx_assign(tx, TXG_WAIT);
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx); txg = dmu_tx_get_txg(tx);
if (!BP_IS_HOLE(&blk)) {
zio_free_blk(zilog->zl_spa, &blk, txg);
BP_ZERO(&blk);
}
error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
NULL, txg); NULL, txg);
@ -1219,7 +1225,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
ASSERT(zilog->zl_stop_sync == 0); ASSERT(zilog->zl_stop_sync == 0);
zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
if (zilog->zl_destroy_txg == txg) { if (zilog->zl_destroy_txg == txg) {
blkptr_t blk = zh->zh_log; blkptr_t blk = zh->zh_log;
@ -1228,7 +1234,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
ASSERT(spa_sync_pass(spa) == 1); ASSERT(spa_sync_pass(spa) == 1);
bzero(zh, sizeof (zil_header_t)); bzero(zh, sizeof (zil_header_t));
bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
if (zilog->zl_keep_first) { if (zilog->zl_keep_first) {
/* /*
@ -1465,9 +1471,7 @@ zil_resume(zilog_t *zilog)
typedef struct zil_replay_arg { typedef struct zil_replay_arg {
objset_t *zr_os; objset_t *zr_os;
zil_replay_func_t **zr_replay; zil_replay_func_t **zr_replay;
zil_replay_cleaner_t *zr_replay_cleaner;
void *zr_arg; void *zr_arg;
uint64_t *zr_txgp;
boolean_t zr_byteswap; boolean_t zr_byteswap;
char *zr_lrbuf; char *zr_lrbuf;
} zil_replay_arg_t; } zil_replay_arg_t;
@ -1480,9 +1484,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
uint64_t reclen = lr->lrc_reclen; uint64_t reclen = lr->lrc_reclen;
uint64_t txtype = lr->lrc_txtype; uint64_t txtype = lr->lrc_txtype;
char *name; char *name;
int pass, error, sunk; int pass, error;
if (zilog->zl_stop_replay) if (!zilog->zl_replay) /* giving up */
return; return;
if (lr->lrc_txg < claim_txg) /* already committed */ if (lr->lrc_txg < claim_txg) /* already committed */
@ -1494,6 +1498,11 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
/* Strip case-insensitive bit, still present in log record */ /* Strip case-insensitive bit, still present in log record */
txtype &= ~TX_CI; txtype &= ~TX_CI;
if (txtype == 0 || txtype >= TX_MAX_TYPE) {
error = EINVAL;
goto bad;
}
/* /*
* Make a copy of the data so we can revise and extend it. * Make a copy of the data so we can revise and extend it.
*/ */
@ -1543,70 +1552,17 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
} }
} }
/*
* Replay of large truncates can end up needing additional txs
* and a different txg. If they are nested within the replay tx
* as below then a hang is possible. So we do the truncate here
* and redo the truncate later (a no-op) and update the sequence
* number whilst in the replay tx. Fortunately, it's safe to repeat
* a truncate if we crash and the truncate commits. A create over
* an existing file will also come in as a TX_TRUNCATE record.
*
* Note, remove of large files and renames over large files is
* handled by putting the deleted object on a stable list
* and if necessary force deleting the object outside of the replay
* transaction using the zr_replay_cleaner.
*/
if (txtype == TX_TRUNCATE) {
*zr->zr_txgp = TXG_NOWAIT;
error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf,
zr->zr_byteswap);
if (error)
goto bad;
zr->zr_byteswap = 0; /* only byteswap once */
}
/* /*
* We must now do two things atomically: replay this log record, * We must now do two things atomically: replay this log record,
* and update the log header to reflect the fact that we did so. * and update the log header sequence number to reflect the fact that
* We use the DMU's ability to assign into a specific txg to do this. * we did so. At the end of each replay function the sequence number
* is updated if we are in replay mode.
*/ */
for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) { for (pass = 1; pass <= 2; pass++) {
uint64_t replay_txg; zilog->zl_replaying_seq = lr->lrc_seq;
dmu_tx_t *replay_tx; /* Only byteswap (if needed) on the 1st pass. */
error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
replay_tx = dmu_tx_create(zr->zr_os); zr->zr_byteswap && pass == 1);
error = dmu_tx_assign(replay_tx, TXG_WAIT);
if (error) {
dmu_tx_abort(replay_tx);
break;
}
replay_txg = dmu_tx_get_txg(replay_tx);
if (txtype == 0 || txtype >= TX_MAX_TYPE) {
error = EINVAL;
} else {
/*
* On the first pass, arrange for the replay vector
* to fail its dmu_tx_assign(). That's the only way
* to ensure that those code paths remain well tested.
*
* Only byteswap (if needed) on the 1st pass.
*/
*zr->zr_txgp = replay_txg - (pass == 1);
error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
zr->zr_byteswap && pass == 1);
*zr->zr_txgp = TXG_NOWAIT;
}
if (error == 0) {
dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
zilog->zl_replay_seq[replay_txg & TXG_MASK] =
lr->lrc_seq;
}
dmu_tx_commit(replay_tx);
if (!error) if (!error)
return; return;
@ -1614,37 +1570,22 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
/* /*
* The DMU's dnode layer doesn't see removes until the txg * The DMU's dnode layer doesn't see removes until the txg
* commits, so a subsequent claim can spuriously fail with * commits, so a subsequent claim can spuriously fail with
* EEXIST. So if we receive any error other than ERESTART * EEXIST. So if we receive any error we try syncing out
* we try syncing out any removes then retrying the * any removes then retry the transaction.
* transaction.
*/ */
if (error != ERESTART && !sunk) { if (pass == 1)
if (zr->zr_replay_cleaner)
zr->zr_replay_cleaner(zr->zr_arg);
txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
sunk = B_TRUE;
continue; /* retry */
}
if (error != ERESTART)
break;
if (pass != 1)
txg_wait_open(spa_get_dsl(zilog->zl_spa),
replay_txg + 1);
dprintf("pass %d, retrying\n", pass);
} }
bad: bad:
ASSERT(error && error != ERESTART); ASSERT(error);
name = kmem_alloc(MAXNAMELEN, KM_SLEEP); name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
dmu_objset_name(zr->zr_os, name); dmu_objset_name(zr->zr_os, name);
cmn_err(CE_WARN, "ZFS replay transaction error %d, " cmn_err(CE_WARN, "ZFS replay transaction error %d, "
"dataset %s, seq 0x%llx, txtype %llu %s\n", "dataset %s, seq 0x%llx, txtype %llu %s\n",
error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
(lr->lrc_txtype & TX_CI) ? "CI" : ""); (lr->lrc_txtype & TX_CI) ? "CI" : "");
zilog->zl_stop_replay = 1; zilog->zl_replay = B_FALSE;
kmem_free(name, MAXNAMELEN); kmem_free(name, MAXNAMELEN);
} }
@ -1659,9 +1600,7 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
* If this dataset has a non-empty intent log, replay it and destroy it. * If this dataset has a non-empty intent log, replay it and destroy it.
*/ */
void void
zil_replay(objset_t *os, void *arg, uint64_t *txgp, zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
zil_replay_func_t *replay_func[TX_MAX_TYPE],
zil_replay_cleaner_t *replay_cleaner)
{ {
zilog_t *zilog = dmu_objset_zil(os); zilog_t *zilog = dmu_objset_zil(os);
const zil_header_t *zh = zilog->zl_header; const zil_header_t *zh = zilog->zl_header;
@ -1674,9 +1613,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
zr.zr_os = os; zr.zr_os = os;
zr.zr_replay = replay_func; zr.zr_replay = replay_func;
zr.zr_replay_cleaner = replay_cleaner;
zr.zr_arg = arg; zr.zr_arg = arg;
zr.zr_txgp = txgp;
zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
@ -1685,7 +1622,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
*/ */
txg_wait_synced(zilog->zl_dmu_pool, 0); txg_wait_synced(zilog->zl_dmu_pool, 0);
zilog->zl_stop_replay = 0; zilog->zl_replay = B_TRUE;
zilog->zl_replay_time = lbolt; zilog->zl_replay_time = lbolt;
ASSERT(zilog->zl_replay_blks == 0); ASSERT(zilog->zl_replay_blks == 0);
(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
@ -1694,6 +1631,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
zil_destroy(zilog, B_FALSE); zil_destroy(zilog, B_FALSE);
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
zilog->zl_replay = B_FALSE;
} }
/* /*

View File

@ -767,7 +767,8 @@ zio_read_bp_init(zio_t *zio)
{ {
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) { if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_logical == zio && !(zio->io_flags & ZIO_FLAG_RAW)) {
uint64_t csize = BP_GET_PSIZE(bp); uint64_t csize = BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(csize); void *cbuf = zio_buf_alloc(csize);
@ -1798,7 +1799,30 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_offset, align) == 0);
ASSERT(P2PHASE(zio->io_size, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0);
ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
/*
* If this is a repair I/O, and there's no self-healing involved --
* that is, we're just resilvering what we expect to resilver --
* then don't do the I/O unless zio's txg is actually in vd's DTL.
* This prevents spurious resilvering with nested replication.
* For example, given a mirror of mirrors, (A+B)+(C+D), if only
* A is out of date, we'll read from C+D, then use the data to
* resilver A+B -- but we don't actually want to resilver B, just A.
* The top-level mirror has no way to know this, so instead we just
* discard unnecessary repairs as we work our way down the vdev tree.
* The same logic applies to any form of nested replication:
* ditto + mirror, RAID-Z + replacing, etc. This covers them all.
*/
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
zio->io_txg != 0 && /* not a delegated i/o */
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_delegate_list == NULL);
zio_vdev_io_bypass(zio);
return (ZIO_PIPELINE_CONTINUE);
}
if (vd->vdev_ops->vdev_op_leaf && if (vd->vdev_ops->vdev_op_leaf &&
(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
@ -1814,7 +1838,6 @@ zio_vdev_io_start(zio_t *zio)
zio_interrupt(zio); zio_interrupt(zio);
return (ZIO_PIPELINE_STOP); return (ZIO_PIPELINE_STOP);
} }
} }
return (vd->vdev_ops->vdev_op_io_start(zio)); return (vd->vdev_ops->vdev_op_io_start(zio));
@ -2166,6 +2189,7 @@ zio_done(zio_t *zio)
if ((zio->io_type == ZIO_TYPE_READ || if ((zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_FREE) && zio->io_type == ZIO_TYPE_FREE) &&
zio->io_error == ENXIO && zio->io_error == ENXIO &&
spa->spa_load_state == SPA_LOAD_NONE &&
spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

View File

@ -75,6 +75,7 @@
#include <sys/vdev_impl.h> #include <sys/vdev_impl.h>
#include <sys/zvol.h> #include <sys/zvol.h>
#include <sys/dumphdr.h> #include <sys/dumphdr.h>
#include <sys/zil_impl.h>
#include "zfs_namecheck.h" #include "zfs_namecheck.h"
@ -113,7 +114,6 @@ typedef struct zvol_state {
uint32_t zv_total_opens; /* total open count */ uint32_t zv_total_opens; /* total open count */
zilog_t *zv_zilog; /* ZIL handle */ zilog_t *zv_zilog; /* ZIL handle */
list_t zv_extents; /* List of extents for dump */ list_t zv_extents; /* List of extents for dump */
uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
znode_t zv_znode; /* for range locking */ znode_t zv_znode; /* for range locking */
} zvol_state_t; } zvol_state_t;
@ -381,7 +381,7 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
tx = dmu_tx_create(os); tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
error = dmu_tx_assign(tx, zv->zv_txg_assign); error = dmu_tx_assign(tx, TXG_WAIT);
if (error) { if (error) {
dmu_tx_abort(tx); dmu_tx_abort(tx);
} else { } else {
@ -558,7 +558,7 @@ zvol_create_minor(const char *name, major_t maj)
ASSERT(error == 0); ASSERT(error == 0);
zv->zv_volblocksize = doi.doi_data_block_size; zv->zv_volblocksize = doi.doi_data_block_size;
zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL); zil_replay(os, zv, zvol_replay_vector);
zvol_size_changed(zv, maj); zvol_size_changed(zv, maj);
/* XXX this should handle the possible i/o error */ /* XXX this should handle the possible i/o error */
@ -971,8 +971,16 @@ static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
{ {
uint32_t blocksize = zv->zv_volblocksize; uint32_t blocksize = zv->zv_volblocksize;
zilog_t *zilog = zv->zv_zilog;
lr_write_t *lr; lr_write_t *lr;
if (zilog->zl_replay) {
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
zilog->zl_replaying_seq;
return;
}
while (len) { while (len) {
ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
@ -987,7 +995,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
BP_ZERO(&lr->lr_blkptr); BP_ZERO(&lr->lr_blkptr);
(void) zil_itx_assign(zv->zv_zilog, itx, tx); (void) zil_itx_assign(zilog, itx, tx);
len -= nbytes; len -= nbytes;
off += nbytes; off += nbytes;
} }